Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig13
-rw-r--r--fs/9p/Makefile5
-rw-r--r--fs/9p/acl.c392
-rw-r--r--fs/9p/acl.h49
-rw-r--r--fs/9p/fid.c115
-rw-r--r--fs/9p/v9fs.c25
-rw-r--r--fs/9p/v9fs.h11
-rw-r--r--fs/9p/v9fs_vfs.h7
-rw-r--r--fs/9p/vfs_addr.c30
-rw-r--r--fs/9p/vfs_dir.c142
-rw-r--r--fs/9p/vfs_file.c289
-rw-r--r--fs/9p/vfs_inode.c924
-rw-r--r--fs/9p/vfs_super.c100
-rw-r--r--fs/9p/xattr.c172
-rw-r--r--fs/9p/xattr.h33
-rw-r--r--fs/9p/xattr_user.c80
-rw-r--r--fs/Kconfig10
-rw-r--r--fs/Kconfig.binfmt4
-rw-r--r--fs/Makefile7
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/adfs/inode.c16
-rw-r--r--fs/adfs/super.c17
-rw-r--r--fs/affs/affs.h3
-rw-r--r--fs/affs/file.c15
-rw-r--r--fs/affs/inode.c40
-rw-r--r--fs/affs/super.c59
-rw-r--r--fs/afs/Kconfig1
-rw-r--r--fs/afs/cell.c96
-rw-r--r--fs/afs/dir.c49
-rw-r--r--fs/afs/flock.c5
-rw-r--r--fs/afs/inode.c91
-rw-r--r--fs/afs/internal.h25
-rw-r--r--fs/afs/main.c9
-rw-r--r--fs/afs/mntpt.c79
-rw-r--r--fs/afs/proc.c2
-rw-r--r--fs/afs/rxrpc.c1
-rw-r--r--fs/afs/super.c46
-rw-r--r--fs/afs/write.c19
-rw-r--r--fs/aio.c48
-rw-r--r--fs/anon_inodes.c16
-rw-r--r--fs/attr.c88
-rw-r--r--fs/autofs/Kconfig21
-rw-r--r--fs/autofs/Makefile7
-rw-r--r--fs/autofs/autofs_i.h165
-rw-r--r--fs/autofs/dirhash.c250
-rw-r--r--fs/autofs/init.c52
-rw-r--r--fs/autofs/inode.c288
-rw-r--r--fs/autofs/root.c582
-rw-r--r--fs/autofs/symlink.c26
-rw-r--r--fs/autofs/waitq.c205
-rw-r--r--fs/autofs4/dev-ioctl.c1
-rw-r--r--fs/autofs4/init.c8
-rw-r--r--fs/autofs4/inode.c1
-rw-r--r--fs/autofs4/root.c62
-rw-r--r--fs/bad_inode.c7
-rw-r--r--fs/befs/linuxvfs.c13
-rw-r--r--fs/bfs/bfs.h1
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/file.c17
-rw-r--r--fs/bfs/inode.c129
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/binfmt_misc.c21
-rw-r--r--fs/binfmt_script.c3
-rw-r--r--fs/bio-integrity.c4
-rw-r--r--fs/bio.c5
-rw-r--r--fs/block_dev.c85
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c57
-rw-r--r--fs/btrfs/ctree.h104
-rw-r--r--fs/btrfs/dir-item.c2
-rw-r--r--fs/btrfs/disk-io.c59
-rw-r--r--fs/btrfs/extent-tree.c697
-rw-r--r--fs/btrfs/extent_io.c168
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c4
-rw-r--r--fs/btrfs/free-space-cache.c751
-rw-r--r--fs/btrfs/free-space-cache.h18
-rw-r--r--fs/btrfs/inode.c242
-rw-r--r--fs/btrfs/ioctl.c398
-rw-r--r--fs/btrfs/ioctl.h13
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/relocation.c109
-rw-r--r--fs/btrfs/root-tree.c2
-rw-r--r--fs/btrfs/super.c60
-rw-r--r--fs/btrfs/transaction.c234
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c17
-rw-r--r--fs/btrfs/volumes.c29
-rw-r--r--fs/btrfs/volumes.h1
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/zlib.c5
-rw-r--r--fs/buffer.c265
-rw-r--r--fs/cachefiles/bind.c2
-rw-r--r--fs/cachefiles/daemon.c39
-rw-r--r--fs/cachefiles/internal.h13
-rw-r--r--fs/cachefiles/namei.c13
-rw-r--r--fs/cachefiles/rdwr.c4
-rw-r--r--fs/ceph/Kconfig15
-rw-r--r--fs/ceph/Makefile13
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c103
-rw-r--r--fs/ceph/armor.c99
-rw-r--r--fs/ceph/auth.c259
-rw-r--r--fs/ceph/auth.h92
-rw-r--r--fs/ceph/auth_none.c131
-rw-r--r--fs/ceph/auth_none.h30
-rw-r--r--fs/ceph/auth_x.c684
-rw-r--r--fs/ceph/auth_x.h49
-rw-r--r--fs/ceph/auth_x_protocol.h90
-rw-r--r--fs/ceph/buffer.c81
-rw-r--r--fs/ceph/buffer.h39
-rw-r--r--fs/ceph/caps.c439
-rw-r--r--fs/ceph/ceph_debug.h37
-rw-r--r--fs/ceph/ceph_frag.c3
-rw-r--r--fs/ceph/ceph_frag.h109
-rw-r--r--fs/ceph/ceph_fs.c74
-rw-r--r--fs/ceph/ceph_fs.h705
-rw-r--r--fs/ceph/ceph_hash.c118
-rw-r--r--fs/ceph/ceph_hash.h13
-rw-r--r--fs/ceph/crush/crush.c151
-rw-r--r--fs/ceph/crush/crush.h180
-rw-r--r--fs/ceph/crush/hash.c149
-rw-r--r--fs/ceph/crush/hash.h17
-rw-r--r--fs/ceph/crush/mapper.c609
-rw-r--r--fs/ceph/crush/mapper.h20
-rw-r--r--fs/ceph/crypto.c409
-rw-r--r--fs/ceph/crypto.h48
-rw-r--r--fs/ceph/debugfs.c415
-rw-r--r--fs/ceph/decode.h194
-rw-r--r--fs/ceph/dir.c115
-rw-r--r--fs/ceph/export.c26
-rw-r--r--fs/ceph/file.c239
-rw-r--r--fs/ceph/inode.c40
-rw-r--r--fs/ceph/ioctl.c101
-rw-r--r--fs/ceph/ioctl.h6
-rw-r--r--fs/ceph/locks.c273
-rw-r--r--fs/ceph/mds_client.c431
-rw-r--r--fs/ceph/mds_client.h51
-rw-r--r--fs/ceph/mdsmap.c17
-rw-r--r--fs/ceph/mdsmap.h54
-rw-r--r--fs/ceph/messenger.c2276
-rw-r--r--fs/ceph/messenger.h253
-rw-r--r--fs/ceph/mon_client.c882
-rw-r--r--fs/ceph/mon_client.h116
-rw-r--r--fs/ceph/msgpool.c64
-rw-r--r--fs/ceph/msgpool.h25
-rw-r--r--fs/ceph/msgr.h175
-rw-r--r--fs/ceph/osd_client.c1542
-rw-r--r--fs/ceph/osd_client.h167
-rw-r--r--fs/ceph/osdmap.c1087
-rw-r--r--fs/ceph/osdmap.h128
-rw-r--r--fs/ceph/pagelist.c55
-rw-r--r--fs/ceph/pagelist.h54
-rw-r--r--fs/ceph/rados.h396
-rw-r--r--fs/ceph/snap.c187
-rw-r--r--fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c)83
-rw-r--r--fs/ceph/super.c1188
-rw-r--r--fs/ceph/super.h442
-rw-r--r--fs/ceph/types.h29
-rw-r--r--fs/ceph/xattr.c21
-rw-r--r--fs/char_dev.c6
-rw-r--r--fs/cifs/Kconfig30
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/README20
-rw-r--r--fs/cifs/cache.c331
-rw-r--r--fs/cifs/cifs_debug.c37
-rw-r--r--fs/cifs/cifs_debug.h2
-rw-r--r--fs/cifs/cifs_dfs_ref.c57
-rw-r--r--fs/cifs/cifs_fs_sb.h14
-rw-r--r--fs/cifs/cifs_spnego.c7
-rw-r--r--fs/cifs/cifs_unicode.h18
-rw-r--r--fs/cifs/cifs_uniupr.h16
-rw-r--r--fs/cifs/cifsacl.c46
-rw-r--r--fs/cifs/cifsencrypt.c604
-rw-r--r--fs/cifs/cifsfs.c159
-rw-r--r--fs/cifs/cifsfs.h10
-rw-r--r--fs/cifs/cifsglob.h186
-rw-r--r--fs/cifs/cifspdu.h14
-rw-r--r--fs/cifs/cifsproto.h37
-rw-r--r--fs/cifs/cifssmb.c79
-rw-r--r--fs/cifs/cn_cifs.h37
-rw-r--r--fs/cifs/connect.c812
-rw-r--r--fs/cifs/dir.c371
-rw-r--r--fs/cifs/dns_resolve.c231
-rw-r--r--fs/cifs/dns_resolve.h2
-rw-r--r--fs/cifs/file.c913
-rw-r--r--fs/cifs/fscache.c237
-rw-r--r--fs/cifs/fscache.h136
-rw-r--r--fs/cifs/inode.c431
-rw-r--r--fs/cifs/ioctl.c18
-rw-r--r--fs/cifs/link.c372
-rw-r--r--fs/cifs/misc.c52
-rw-r--r--fs/cifs/netmisc.c67
-rw-r--r--fs/cifs/ntlmssp.h15
-rw-r--r--fs/cifs/readdir.c84
-rw-r--r--fs/cifs/sess.c247
-rw-r--r--fs/cifs/smberr.h1
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/cifs/xattr.c60
-rw-r--r--fs/coda/cache.c17
-rw-r--r--fs/coda/cnode.c19
-rw-r--r--fs/coda/dir.c157
-rw-r--r--fs/coda/file.c31
-rw-r--r--fs/coda/inode.c73
-rw-r--r--fs/coda/pioctl.c23
-rw-r--r--fs/coda/psdev.c58
-rw-r--r--fs/coda/symlink.c3
-rw-r--r--fs/coda/upcall.c101
-rw-r--r--fs/compat.c98
-rw-r--r--fs/compat_ioctl.c144
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/configfs/mount.c8
-rw-r--r--fs/cramfs/inode.c97
-rw-r--r--fs/dcache.c571
-rw-r--r--fs/debugfs/file.c3
-rw-r--r--fs/debugfs/inode.c9
-rw-r--r--fs/devpts/inode.c32
-rw-r--r--fs/direct-io.c104
-rw-r--r--fs/dlm/debug_fs.c3
-rw-r--r--fs/dlm/lock.c3
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/dlm/netlink.c15
-rw-r--r--fs/dlm/plock.c3
-rw-r--r--fs/dlm/user.c3
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ecryptfs/crypto.c5
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h3
-rw-r--r--fs/ecryptfs/file.c64
-rw-r--r--fs/ecryptfs/inode.c140
-rw-r--r--fs/ecryptfs/keystore.c47
-rw-r--r--fs/ecryptfs/kthread.c2
-rw-r--r--fs/ecryptfs/main.c20
-rw-r--r--fs/ecryptfs/messaging.c4
-rw-r--r--fs/ecryptfs/miscdev.c3
-rw-r--r--fs/ecryptfs/super.c16
-rw-r--r--fs/efs/super.c8
-rw-r--r--fs/eventfd.c1
-rw-r--r--fs/eventpoll.c38
-rw-r--r--fs/exec.c264
-rw-r--r--fs/exofs/dir.c4
-rw-r--r--fs/exofs/exofs.h3
-rw-r--r--fs/exofs/file.c26
-rw-r--r--fs/exofs/inode.c208
-rw-r--r--fs/exofs/ios.c56
-rw-r--r--fs/exofs/namei.c2
-rw-r--r--fs/exofs/super.c13
-rw-r--r--fs/exportfs/expfs.c17
-rw-r--r--fs/ext2/balloc.c14
-rw-r--r--fs/ext2/dir.c25
-rw-r--r--fs/ext2/ext2.h6
-rw-r--r--fs/ext2/ialloc.c13
-rw-r--r--fs/ext2/inode.c102
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c32
-rw-r--r--fs/ext2/xattr.c27
-rw-r--r--fs/ext3/Kconfig1
-rw-r--r--fs/ext3/balloc.c17
-rw-r--r--fs/ext3/fsync.c3
-rw-r--r--fs/ext3/ialloc.c23
-rw-r--r--fs/ext3/inode.c167
-rw-r--r--fs/ext3/namei.c5
-rw-r--r--fs/ext3/resize.c15
-rw-r--r--fs/ext3/super.c97
-rw-r--r--fs/ext3/xattr.c12
-rw-r--r--fs/ext4/Makefile2
-rw-r--r--fs/ext4/acl.c1
-rw-r--r--fs/ext4/balloc.c11
-rw-r--r--fs/ext4/block_validity.c15
-rw-r--r--fs/ext4/dir.c25
-rw-r--r--fs/ext4/ext4.h265
-rw-r--r--fs/ext4/ext4_extents.h65
-rw-r--r--fs/ext4/ext4_jbd2.c71
-rw-r--r--fs/ext4/ext4_jbd2.h56
-rw-r--r--fs/ext4/extents.c386
-rw-r--r--fs/ext4/file.c49
-rw-r--r--fs/ext4/fsync.c88
-rw-r--r--fs/ext4/ialloc.c141
-rw-r--r--fs/ext4/inode.c836
-rw-r--r--fs/ext4/mballoc.c698
-rw-r--r--fs/ext4/migrate.c4
-rw-r--r--fs/ext4/move_extent.c32
-rw-r--r--fs/ext4/namei.c103
-rw-r--r--fs/ext4/page-io.c430
-rw-r--r--fs/ext4/resize.c60
-rw-r--r--fs/ext4/super.c912
-rw-r--r--fs/ext4/xattr.c19
-rw-r--r--fs/ext4/xattr.h10
-rw-r--r--fs/fat/fat.h1
-rw-r--r--fs/fat/fatent.c3
-rw-r--r--fs/fat/file.c49
-rw-r--r--fs/fat/inode.c31
-rw-r--r--fs/fat/misc.c9
-rw-r--r--fs/fat/namei_msdos.c15
-rw-r--r--fs/fat/namei_vfat.c15
-rw-r--r--fs/fcntl.c81
-rw-r--r--fs/fifo.c1
-rw-r--r--fs/file.c60
-rw-r--r--fs/file_table.c160
-rw-r--r--fs/freevxfs/vxfs_extern.h2
-rw-r--r--fs/freevxfs/vxfs_inode.c9
-rw-r--r--fs/freevxfs/vxfs_lookup.c14
-rw-r--r--fs/freevxfs/vxfs_super.c20
-rw-r--r--fs/fs-writeback.c397
-rw-r--r--fs/fs_struct.c39
-rw-r--r--fs/fscache/Kconfig1
-rw-r--r--fs/fscache/internal.h22
-rw-r--r--fs/fscache/main.c106
-rw-r--r--fs/fscache/object-list.c11
-rw-r--r--fs/fscache/object.c106
-rw-r--r--fs/fscache/operation.c67
-rw-r--r--fs/fscache/page.c36
-rw-r--r--fs/fuse/control.c15
-rw-r--r--fs/fuse/cuse.c1
-rw-r--r--fs/fuse/dev.c278
-rw-r--r--fs/fuse/dir.c19
-rw-r--r--fs/fuse/file.c10
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c23
-rw-r--r--fs/generic_acl.c1
-rw-r--r--fs/gfs2/Kconfig3
-rw-r--r--fs/gfs2/aops.c42
-rw-r--r--fs/gfs2/bmap.c270
-rw-r--r--fs/gfs2/bmap.h20
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/dir.c42
-rw-r--r--fs/gfs2/dir.h34
-rw-r--r--fs/gfs2/export.c9
-rw-r--r--fs/gfs2/file.c14
-rw-r--r--fs/gfs2/glock.c128
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c6
-rw-r--r--fs/gfs2/incore.h12
-rw-r--r--fs/gfs2/inode.c36
-rw-r--r--fs/gfs2/inode.h15
-rw-r--r--fs/gfs2/lock_dlm.c4
-rw-r--r--fs/gfs2/log.c21
-rw-r--r--fs/gfs2/main.c18
-rw-r--r--fs/gfs2/meta_io.c10
-rw-r--r--fs/gfs2/ops_fstype.c164
-rw-r--r--fs/gfs2/ops_inode.c342
-rw-r--r--fs/gfs2/quota.c41
-rw-r--r--fs/gfs2/recovery.c69
-rw-r--r--fs/gfs2/recovery.h6
-rw-r--r--fs/gfs2/rgrp.c56
-rw-r--r--fs/gfs2/rgrp.h8
-rw-r--r--fs/gfs2/super.c77
-rw-r--r--fs/gfs2/sys.c70
-rw-r--r--fs/gfs2/trace_gfs2.h3
-rw-r--r--fs/gfs2/trans.h9
-rw-r--r--fs/gfs2/xattr.c26
-rw-r--r--fs/hfs/bfind.c4
-rw-r--r--fs/hfs/btree.c2
-rw-r--r--fs/hfs/btree.h2
-rw-r--r--fs/hfs/hfs_fs.h15
-rw-r--r--fs/hfs/inode.c72
-rw-r--r--fs/hfs/mdb.c4
-rw-r--r--fs/hfs/super.c18
-rw-r--r--fs/hfsplus/bfind.c17
-rw-r--r--fs/hfsplus/bitmap.c20
-rw-r--r--fs/hfsplus/brec.c29
-rw-r--r--fs/hfsplus/btree.c67
-rw-r--r--fs/hfsplus/catalog.c50
-rw-r--r--fs/hfsplus/dir.c207
-rw-r--r--fs/hfsplus/extents.c223
-rw-r--r--fs/hfsplus/hfsplus_fs.h86
-rw-r--r--fs/hfsplus/hfsplus_raw.h3
-rw-r--r--fs/hfsplus/inode.c262
-rw-r--r--fs/hfsplus/ioctl.c153
-rw-r--r--fs/hfsplus/options.c10
-rw-r--r--fs/hfsplus/part_tbl.c5
-rw-r--r--fs/hfsplus/super.c330
-rw-r--r--fs/hfsplus/unicode.c16
-rw-r--r--fs/hfsplus/wrapper.c40
-rw-r--r--fs/hostfs/hostfs.h32
-rw-r--r--fs/hostfs/hostfs_kern.c527
-rw-r--r--fs/hostfs/hostfs_user.c126
-rw-r--r--fs/hpfs/Kconfig1
-rw-r--r--fs/hpfs/buffer.c4
-rw-r--r--fs/hpfs/file.c11
-rw-r--r--fs/hpfs/hpfs_fn.h4
-rw-r--r--fs/hpfs/inode.c24
-rw-r--r--fs/hpfs/super.c21
-rw-r--r--fs/hppfs/hppfs.c17
-rw-r--r--fs/hugetlbfs/inode.c66
-rw-r--r--fs/inode.c692
-rw-r--r--fs/internal.h14
-rw-r--r--fs/ioctl.c57
-rw-r--r--fs/isofs/dir.c6
-rw-r--r--fs/isofs/inode.c82
-rw-r--r--fs/isofs/isofs.h1
-rw-r--r--fs/isofs/namei.c8
-rw-r--r--fs/isofs/rock.c10
-rw-r--r--fs/jbd/checkpoint.c8
-rw-r--r--fs/jbd/commit.c41
-rw-r--r--fs/jbd/journal.c53
-rw-r--r--fs/jbd/recovery.c13
-rw-r--r--fs/jbd/revoke.c2
-rw-r--r--fs/jbd/transaction.c6
-rw-r--r--fs/jbd2/checkpoint.c35
-rw-r--r--fs/jbd2/commit.c131
-rw-r--r--fs/jbd2/journal.c141
-rw-r--r--fs/jbd2/recovery.c10
-rw-r--r--fs/jbd2/revoke.c2
-rw-r--r--fs/jbd2/transaction.c252
-rw-r--r--fs/jffs2/background.c1
-rw-r--r--fs/jffs2/build.c3
-rw-r--r--fs/jffs2/compr.c11
-rw-r--r--fs/jffs2/compr.h5
-rw-r--r--fs/jffs2/compr_lzo.c5
-rw-r--r--fs/jffs2/compr_rtime.c7
-rw-r--r--fs/jffs2/compr_rubin.c12
-rw-r--r--fs/jffs2/compr_zlib.c7
-rw-r--r--fs/jffs2/debug.c1
-rw-r--r--fs/jffs2/debug.h1
-rw-r--r--fs/jffs2/dir.c24
-rw-r--r--fs/jffs2/erase.c3
-rw-r--r--fs/jffs2/file.c1
-rw-r--r--fs/jffs2/fs.c37
-rw-r--r--fs/jffs2/gc.c8
-rw-r--r--fs/jffs2/ioctl.c1
-rw-r--r--fs/jffs2/jffs2_fs_i.h1
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jffs2/nodelist.c8
-rw-r--r--fs/jffs2/nodelist.h4
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jffs2/scan.c12
-rw-r--r--fs/jffs2/super.c20
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jfs/file.c14
-rw-r--r--fs/jfs/inode.c63
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/jfs_inode.h2
-rw-r--r--fs/jfs/jfs_logmgr.c6
-rw-r--r--fs/jfs/jfs_mount.c4
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/super.c40
-rw-r--r--fs/jfs/xattr.c87
-rw-r--r--fs/libfs.c121
-rw-r--r--fs/lockd/clntlock.c15
-rw-r--r--fs/lockd/clntproc.c13
-rw-r--r--fs/lockd/host.c1
-rw-r--r--fs/lockd/mon.c1
-rw-r--r--fs/lockd/svc.c13
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svclock.c37
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/lockd/svcsubs.c9
-rw-r--r--fs/locks.c251
-rw-r--r--fs/logfs/dev_bdev.c15
-rw-r--r--fs/logfs/dev_mtd.c18
-rw-r--r--fs/logfs/dir.c10
-rw-r--r--fs/logfs/file.c24
-rw-r--r--fs/logfs/inode.c51
-rw-r--r--fs/logfs/journal.c2
-rw-r--r--fs/logfs/logfs.h29
-rw-r--r--fs/logfs/readwrite.c62
-rw-r--r--fs/logfs/segment.c1
-rw-r--r--fs/logfs/super.c98
-rw-r--r--fs/mbcache.c196
-rw-r--r--fs/minix/bitmap.c6
-rw-r--r--fs/minix/dir.c21
-rw-r--r--fs/minix/file.c22
-rw-r--r--fs/minix/inode.c44
-rw-r--r--fs/minix/minix.h4
-rw-r--r--fs/minix/namei.c4
-rw-r--r--fs/namei.c160
-rw-r--r--fs/namespace.c217
-rw-r--r--fs/ncpfs/dir.c221
-rw-r--r--fs/ncpfs/file.c25
-rw-r--r--fs/ncpfs/inode.c103
-rw-r--r--fs/ncpfs/ioctl.c471
-rw-r--r--fs/ncpfs/ncplib_kernel.c101
-rw-r--r--fs/ncpfs/ncplib_kernel.h15
-rw-r--r--fs/ncpfs/ncpsign_kernel.c10
-rw-r--r--fs/ncpfs/sock.c1
-rw-r--r--fs/nfs/Kconfig48
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/callback.c15
-rw-r--r--fs/nfs/callback_proc.c27
-rw-r--r--fs/nfs/client.c51
-rw-r--r--fs/nfs/delegation.c26
-rw-r--r--fs/nfs/delegation.h4
-rw-r--r--fs/nfs/dir.c1035
-rw-r--r--fs/nfs/direct.c29
-rw-r--r--fs/nfs/dns_resolve.c30
-rw-r--r--fs/nfs/dns_resolve.h12
-rw-r--r--fs/nfs/file.c142
-rw-r--r--fs/nfs/getroot.c3
-rw-r--r--fs/nfs/idmap.c211
-rw-r--r--fs/nfs/inode.c124
-rw-r--r--fs/nfs/internal.h23
-rw-r--r--fs/nfs/mount_clnt.c4
-rw-r--r--fs/nfs/nfs2xdr.c114
-rw-r--r--fs/nfs/nfs3proc.c62
-rw-r--r--fs/nfs/nfs3xdr.c204
-rw-r--r--fs/nfs/nfs4_fs.h61
-rw-r--r--fs/nfs/nfs4filelayout.c280
-rw-r--r--fs/nfs/nfs4filelayout.h94
-rw-r--r--fs/nfs/nfs4filelayoutdev.c448
-rw-r--r--fs/nfs/nfs4proc.c950
-rw-r--r--fs/nfs/nfs4renewd.c4
-rw-r--r--fs/nfs/nfs4state.c132
-rw-r--r--fs/nfs/nfs4xdr.c807
-rw-r--r--fs/nfs/nfsroot.c568
-rw-r--r--fs/nfs/pagelist.c14
-rw-r--r--fs/nfs/pnfs.c783
-rw-r--r--fs/nfs/pnfs.h189
-rw-r--r--fs/nfs/proc.c35
-rw-r--r--fs/nfs/read.c7
-rw-r--r--fs/nfs/super.c191
-rw-r--r--fs/nfs/sysctl.c2
-rw-r--r--fs/nfs/unlink.c261
-rw-r--r--fs/nfs/write.c31
-rw-r--r--fs/nfsd/Kconfig14
-rw-r--r--fs/nfsd/export.c73
-rw-r--r--fs/nfsd/nfs3proc.c8
-rw-r--r--fs/nfsd/nfs4callback.c296
-rw-r--r--fs/nfsd/nfs4idmap.c105
-rw-r--r--fs/nfsd/nfs4proc.c7
-rw-r--r--fs/nfsd/nfs4state.c924
-rw-r--r--fs/nfsd/nfs4xdr.c27
-rw-r--r--fs/nfsd/nfsctl.c59
-rw-r--r--fs/nfsd/nfsd.h3
-rw-r--r--fs/nfsd/nfsfh.h2
-rw-r--r--fs/nfsd/nfsproc.c4
-rw-r--r--fs/nfsd/nfssvc.c152
-rw-r--r--fs/nfsd/state.h92
-rw-r--r--fs/nfsd/vfs.c113
-rw-r--r--fs/nfsd/vfs.h4
-rw-r--r--fs/nilfs2/Makefile2
-rw-r--r--fs/nilfs2/bmap.c24
-rw-r--r--fs/nilfs2/bmap.h26
-rw-r--r--fs/nilfs2/bmap_union.h42
-rw-r--r--fs/nilfs2/btnode.c40
-rw-r--r--fs/nilfs2/btnode.h4
-rw-r--r--fs/nilfs2/btree.c914
-rw-r--r--fs/nilfs2/btree.h12
-rw-r--r--fs/nilfs2/cpfile.c72
-rw-r--r--fs/nilfs2/cpfile.h4
-rw-r--r--fs/nilfs2/dat.c92
-rw-r--r--fs/nilfs2/dat.h4
-rw-r--r--fs/nilfs2/dir.c58
-rw-r--r--fs/nilfs2/direct.c96
-rw-r--r--fs/nilfs2/direct.h11
-rw-r--r--fs/nilfs2/export.h17
-rw-r--r--fs/nilfs2/gcdat.c87
-rw-r--r--fs/nilfs2/gcinode.c151
-rw-r--r--fs/nilfs2/ifile.c51
-rw-r--r--fs/nilfs2/ifile.h4
-rw-r--r--fs/nilfs2/inode.c243
-rw-r--r--fs/nilfs2/ioctl.c24
-rw-r--r--fs/nilfs2/mdt.c314
-rw-r--r--fs/nilfs2/mdt.h32
-rw-r--r--fs/nilfs2/namei.c141
-rw-r--r--fs/nilfs2/nilfs.h62
-rw-r--r--fs/nilfs2/page.c60
-rw-r--r--fs/nilfs2/page.h8
-rw-r--r--fs/nilfs2/recovery.c374
-rw-r--r--fs/nilfs2/sb.h10
-rw-r--r--fs/nilfs2/segbuf.c5
-rw-r--r--fs/nilfs2/segbuf.h24
-rw-r--r--fs/nilfs2/segment.c123
-rw-r--r--fs/nilfs2/segment.h20
-rw-r--r--fs/nilfs2/sufile.c77
-rw-r--r--fs/nilfs2/sufile.h6
-rw-r--r--fs/nilfs2/super.c954
-rw-r--r--fs/nilfs2/the_nilfs.c490
-rw-r--r--fs/nilfs2/the_nilfs.h124
-rw-r--r--fs/no-block.c1
-rw-r--r--fs/notify/Kconfig1
-rw-r--r--fs/notify/Makefile4
-rw-r--r--fs/notify/dnotify/dnotify.c213
-rw-r--r--fs/notify/fanotify/Kconfig26
-rw-r--r--fs/notify/fanotify/Makefile1
-rw-r--r--fs/notify/fanotify/fanotify.c224
-rw-r--r--fs/notify/fanotify/fanotify_user.c870
-rw-r--r--fs/notify/fsnotify.c247
-rw-r--r--fs/notify/fsnotify.h27
-rw-r--r--fs/notify/group.c182
-rw-r--r--fs/notify/inode_mark.c344
-rw-r--r--fs/notify/inotify/Kconfig15
-rw-r--r--fs/notify/inotify/Makefile1
-rw-r--r--fs/notify/inotify/inotify.c873
-rw-r--r--fs/notify/inotify/inotify.h7
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c151
-rw-r--r--fs/notify/inotify/inotify_user.c370
-rw-r--r--fs/notify/mark.c371
-rw-r--r--fs/notify/notification.c209
-rw-r--r--fs/notify/vfsmount_mark.c191
-rw-r--r--fs/ntfs/inode.c10
-rw-r--r--fs/ntfs/inode.h2
-rw-r--r--fs/ntfs/super.c54
-rw-r--r--fs/ocfs2/acl.c36
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c44
-rw-r--r--fs/ocfs2/aops.h6
-rw-r--r--fs/ocfs2/blockcheck.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c532
-rw-r--r--fs/ocfs2/cluster/heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/masklog.h3
-rw-r--r--fs/ocfs2/cluster/nodemanager.c5
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h6
-rw-r--r--fs/ocfs2/cluster/tcp.c24
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h2
-rw-r--r--fs/ocfs2/dcache.c33
-rw-r--r--fs/ocfs2/dcache.h1
-rw-r--r--fs/ocfs2/dir.c24
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h30
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c27
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c401
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c49
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c114
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c29
-rw-r--r--fs/ocfs2/dlmglue.c12
-rw-r--r--fs/ocfs2/dlmglue.h1
-rw-r--r--fs/ocfs2/file.c114
-rw-r--r--fs/ocfs2/inode.c36
-rw-r--r--fs/ocfs2/inode.h17
-rw-r--r--fs/ocfs2/ioctl.c356
-rw-r--r--fs/ocfs2/journal.c13
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/mmap.c15
-rw-r--r--fs/ocfs2/namei.c307
-rw-r--r--fs/ocfs2/ocfs2.h63
-rw-r--r--fs/ocfs2/ocfs2_fs.h83
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h103
-rw-r--r--fs/ocfs2/refcounttree.c73
-rw-r--r--fs/ocfs2/refcounttree.h7
-rw-r--r--fs/ocfs2/reservations.c22
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/stack_user.c4
-rw-r--r--fs/ocfs2/suballoc.c239
-rw-r--r--fs/ocfs2/suballoc.h21
-rw-r--r--fs/ocfs2/super.c186
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/sysfile.c60
-rw-r--r--fs/ocfs2/xattr.c6
-rw-r--r--fs/omfs/dir.c22
-rw-r--r--fs/omfs/file.c46
-rw-r--r--fs/omfs/inode.c62
-rw-r--r--fs/omfs/omfs.h1
-rw-r--r--fs/omfs/omfs_fs.h1
-rw-r--r--fs/open.c28
-rw-r--r--fs/openpromfs/inode.c8
-rw-r--r--fs/partitions/acorn.c35
-rw-r--r--fs/partitions/amiga.c20
-rw-r--r--fs/partitions/atari.c12
-rw-r--r--fs/partitions/check.c65
-rw-r--r--fs/partitions/check.h9
-rw-r--r--fs/partitions/efi.c27
-rw-r--r--fs/partitions/ibm.c17
-rw-r--r--fs/partitions/karma.c2
-rw-r--r--fs/partitions/ldm.c6
-rw-r--r--fs/partitions/ldm.h2
-rw-r--r--fs/partitions/mac.c4
-rw-r--r--fs/partitions/msdos.c67
-rw-r--r--fs/partitions/osf.c2
-rw-r--r--fs/partitions/sgi.c2
-rw-r--r--fs/partitions/sun.c2
-rw-r--r--fs/partitions/sysv68.c9
-rw-r--r--fs/partitions/ultrix.c2
-rw-r--r--fs/pipe.c13
-rw-r--r--fs/pnode.c11
-rw-r--r--fs/proc/Kconfig4
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/proc/base.c241
-rw-r--r--fs/proc/generic.c18
-rw-r--r--fs/proc/inode.c23
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/proc_sysctl.c18
-rw-r--r--fs/proc/root.c17
-rw-r--r--fs/proc/softirqs.c4
-rw-r--r--fs/proc/stat.c14
-rw-r--r--fs/proc/task_mmu.c20
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/qnx4/dir.c4
-rw-r--r--fs/qnx4/inode.c26
-rw-r--r--fs/qnx4/namei.c4
-rw-r--r--fs/quota/Kconfig4
-rw-r--r--fs/quota/dquot.c111
-rw-r--r--fs/quota/quota_tree.c85
-rw-r--r--fs/quota/quota_tree.h6
-rw-r--r--fs/quota/quota_v1.c3
-rw-r--r--fs/quota/quota_v2.c11
-rw-r--r--fs/ramfs/file-nommu.c7
-rw-r--r--fs/ramfs/inode.c18
-rw-r--r--fs/read_write.c103
-rw-r--r--fs/readdir.c8
-rw-r--r--fs/reiserfs/Kconfig6
-rw-r--r--fs/reiserfs/README2
-rw-r--r--fs/reiserfs/file.c53
-rw-r--r--fs/reiserfs/inode.c163
-rw-r--r--fs/reiserfs/ioctl.c13
-rw-r--r--fs/reiserfs/journal.c109
-rw-r--r--fs/reiserfs/namei.c2
-rw-r--r--fs/reiserfs/super.c19
-rw-r--r--fs/reiserfs/xattr.c7
-rw-r--r--fs/romfs/super.c18
-rw-r--r--fs/select.c6
-rw-r--r--fs/seq_file.c8
-rw-r--r--fs/signalfd.c13
-rw-r--r--fs/smbfs/Kconfig55
-rw-r--r--fs/smbfs/Makefile18
-rw-r--r--fs/smbfs/cache.c208
-rw-r--r--fs/smbfs/dir.c702
-rw-r--r--fs/smbfs/file.c454
-rw-r--r--fs/smbfs/getopt.c64
-rw-r--r--fs/smbfs/getopt.h14
-rw-r--r--fs/smbfs/inode.c841
-rw-r--r--fs/smbfs/ioctl.c69
-rw-r--r--fs/smbfs/proc.c3507
-rw-r--r--fs/smbfs/proto.h87
-rw-r--r--fs/smbfs/request.c818
-rw-r--r--fs/smbfs/request.h70
-rw-r--r--fs/smbfs/smb_debug.h34
-rw-r--r--fs/smbfs/smbiod.c344
-rw-r--r--fs/smbfs/sock.c386
-rw-r--r--fs/smbfs/symlink.c68
-rw-r--r--fs/splice.c14
-rw-r--r--fs/squashfs/Kconfig25
-rw-r--r--fs/squashfs/Makefile4
-rw-r--r--fs/squashfs/decompressor.c6
-rw-r--r--fs/squashfs/dir.c3
-rw-r--r--fs/squashfs/lzo_wrapper.c136
-rw-r--r--fs/squashfs/squashfs.h3
-rw-r--r--fs/squashfs/squashfs_fs.h20
-rw-r--r--fs/squashfs/super.c15
-rw-r--r--fs/squashfs/xattr.c13
-rw-r--r--fs/squashfs/xattr.h6
-rw-r--r--fs/squashfs/xattr_id.c1
-rw-r--r--fs/stat.c29
-rw-r--r--fs/statfs.c95
-rw-r--r--fs/super.c188
-rw-r--r--fs/sync.c25
-rw-r--r--fs/sysfs/bin.c68
-rw-r--r--fs/sysfs/file.c5
-rw-r--r--fs/sysfs/group.c59
-rw-r--r--fs/sysfs/inode.c8
-rw-r--r--fs/sysfs/mount.c34
-rw-r--r--fs/sysfs/sysfs.h2
-rw-r--r--fs/sysv/dir.c21
-rw-r--r--fs/sysv/file.c22
-rw-r--r--fs/sysv/ialloc.c1
-rw-r--r--fs/sysv/inode.c19
-rw-r--r--fs/sysv/itree.c19
-rw-r--r--fs/sysv/namei.c2
-rw-r--r--fs/sysv/super.c91
-rw-r--r--fs/sysv/sysv.h4
-rw-r--r--fs/timerfd.c1
-rw-r--r--fs/ubifs/commit.c4
-rw-r--r--fs/ubifs/debug.c157
-rw-r--r--fs/ubifs/debug.h4
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c30
-rw-r--r--fs/ubifs/gc.c82
-rw-r--r--fs/ubifs/io.c20
-rw-r--r--fs/ubifs/journal.c3
-rw-r--r--fs/ubifs/key.h14
-rw-r--r--fs/ubifs/log.c6
-rw-r--r--fs/ubifs/lpt.c21
-rw-r--r--fs/ubifs/lpt_commit.c5
-rw-r--r--fs/ubifs/master.c3
-rw-r--r--fs/ubifs/misc.h9
-rw-r--r--fs/ubifs/recovery.c34
-rw-r--r--fs/ubifs/replay.c20
-rw-r--r--fs/ubifs/sb.c9
-rw-r--r--fs/ubifs/scan.c6
-rw-r--r--fs/ubifs/shrinker.c2
-rw-r--r--fs/ubifs/super.c109
-rw-r--r--fs/ubifs/tnc.c5
-rw-r--r--fs/ubifs/ubifs.h25
-rw-r--r--fs/udf/Kconfig1
-rw-r--r--fs/udf/file.c23
-rw-r--r--fs/udf/ialloc.c2
-rw-r--r--fs/udf/inode.c61
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/udf/super.c22
-rw-r--r--fs/udf/udfdecl.h3
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/dir.c13
-rw-r--r--fs/ufs/ialloc.c20
-rw-r--r--fs/ufs/inode.c63
-rw-r--r--fs/ufs/namei.c2
-rw-r--r--fs/ufs/super.c15
-rw-r--r--fs/ufs/truncate.c34
-rw-r--r--fs/ufs/ufs.h2
-rw-r--r--fs/ufs/util.c20
-rw-r--r--fs/ufs/util.h7
-rw-r--r--fs/utimes.c7
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile4
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c649
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c303
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h179
-rw-r--r--fs/xfs/linux-2.6/xfs_cred.h28
-rw-r--r--fs/xfs/linux-2.6/xfs_dmapi_priv.h28
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c104
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c35
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.h25
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.h23
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c49
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.h6
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c77
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c227
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c521
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h7
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h133
-rw-r--r--fs/xfs/linux-2.6/xfs_version.h29
-rw-r--r--fs/xfs/quota/xfs_dquot.c276
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c301
-rw-r--r--fs/xfs/quota/xfs_qm.c228
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c12
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c10
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c133
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c35
-rw-r--r--fs/xfs/support/debug.c1
-rw-r--r--fs/xfs/xfs_ag.h9
-rw-r--r--fs/xfs/xfs_alloc.c19
-rw-r--r--fs/xfs/xfs_alloc.h20
-rw-r--r--fs/xfs/xfs_alloc_btree.c38
-rw-r--r--fs/xfs/xfs_attr.c128
-rw-r--r--fs/xfs/xfs_attr_leaf.c5
-rw-r--r--fs/xfs/xfs_bmap.c383
-rw-r--r--fs/xfs/xfs_bmap.h44
-rw-r--r--fs/xfs/xfs_bmap_btree.c5
-rw-r--r--fs/xfs/xfs_btree.c61
-rw-r--r--fs/xfs/xfs_btree.h14
-rw-r--r--fs/xfs/xfs_buf_item.c235
-rw-r--r--fs/xfs/xfs_buf_item.h2
-rw-r--r--fs/xfs/xfs_da_btree.c22
-rw-r--r--fs/xfs/xfs_dfrag.c16
-rw-r--r--fs/xfs/xfs_dinode.h5
-rw-r--r--fs/xfs/xfs_dir2.c11
-rw-r--r--fs/xfs/xfs_dir2_block.c8
-rw-r--r--fs/xfs/xfs_dir2_data.c2
-rw-r--r--fs/xfs/xfs_dir2_leaf.c6
-rw-r--r--fs/xfs/xfs_dir2_node.c2
-rw-r--r--fs/xfs/xfs_dir2_sf.c2
-rw-r--r--fs/xfs/xfs_dmapi.h170
-rw-r--r--fs/xfs/xfs_dmops.c55
-rw-r--r--fs/xfs/xfs_error.c4
-rw-r--r--fs/xfs/xfs_extfree_item.c278
-rw-r--r--fs/xfs/xfs_filestream.c84
-rw-r--r--fs/xfs/xfs_filestream.h82
-rw-r--r--fs/xfs/xfs_fs.h11
-rw-r--r--fs/xfs/xfs_fsops.c50
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_ialloc.c22
-rw-r--r--fs/xfs/xfs_ialloc_btree.c37
-rw-r--r--fs/xfs/xfs_iget.c112
-rw-r--r--fs/xfs/xfs_inode.c126
-rw-r--r--fs/xfs/xfs_inode.h36
-rw-r--r--fs/xfs/xfs_inode_item.c282
-rw-r--r--fs/xfs/xfs_inode_item.h12
-rw-r--r--fs/xfs/xfs_iomap.c76
-rw-r--r--fs/xfs/xfs_iomap.h22
-rw-r--r--fs/xfs/xfs_itable.c11
-rw-r--r--fs/xfs/xfs_log.c41
-rw-r--r--fs/xfs/xfs_log.h11
-rw-r--r--fs/xfs/xfs_log_cil.c481
-rw-r--r--fs/xfs/xfs_log_priv.h50
-rw-r--r--fs/xfs/xfs_log_recover.c67
-rw-r--r--fs/xfs/xfs_mount.c311
-rw-r--r--fs/xfs/xfs_mount.h78
-rw-r--r--fs/xfs/xfs_refcache.h52
-rw-r--r--fs/xfs/xfs_rename.c77
-rw-r--r--fs/xfs/xfs_rtalloc.c38
-rw-r--r--fs/xfs/xfs_rw.c15
-rw-r--r--fs/xfs/xfs_sb.h10
-rw-r--r--fs/xfs/xfs_trans.c305
-rw-r--r--fs/xfs/xfs_trans.h120
-rw-r--r--fs/xfs/xfs_trans_ail.c1
-rw-r--r--fs/xfs/xfs_trans_buf.c77
-rw-r--r--fs/xfs/xfs_trans_extfree.c23
-rw-r--r--fs/xfs/xfs_trans_inode.c98
-rw-r--r--fs/xfs/xfs_trans_item.c441
-rw-r--r--fs/xfs/xfs_trans_priv.h17
-rw-r--r--fs/xfs/xfs_types.h2
-rw-r--r--fs/xfs/xfs_utils.c96
-rw-r--r--fs/xfs/xfs_utils.h4
-rw-r--r--fs/xfs/xfs_vnodeops.c409
-rw-r--r--fs/xfs/xfs_vnodeops.h6
897 files changed, 40895 insertions, 47855 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 795233702a4..7e051147679 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -17,3 +17,16 @@ config 9P_FSCACHE
 	  Choose Y here to enable persistent, read-only local
 	  caching support for 9p clients using FS-Cache
 
+
+config 9P_FS_POSIX_ACL
+	bool "9P POSIX Access Control Lists"
+	depends on 9P_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 1a940ec7af6..f8ba37effd1 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,6 +8,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	vfs_dir.o \
 	vfs_dentry.o \
 	v9fs.o \
-	fid.o
+	fid.o \
+	xattr.o \
+	xattr_user.o
 
 9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 00000000000..12d602351db
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/posix_acl_xattr.h>
+#include "xattr.h"
+#include "acl.h"
+#include "v9fs_vfs.h"
+#include "v9fs.h"
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+	ssize_t size;
+	void *value = NULL;
+	struct posix_acl *acl = NULL;
+
+	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = v9fs_fid_xattr_get(fid, name, value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				goto err_out;
+		}
+	} else if (size == -ENODATA || size == 0 ||
+		   size == -ENOSYS || size == -EOPNOTSUPP) {
+		acl = NULL;
+	} else
+		acl = ERR_PTR(-EIO);
+
+err_out:
+	kfree(value);
+	return acl;
+}
+
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	int retval = 0;
+	struct posix_acl *pacl, *dacl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+		return 0;
+	}
+	/* get the default/access acl values and cache them */
+	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+
+	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+		posix_acl_release(dacl);
+		posix_acl_release(pacl);
+	} else
+		retval = -EIO;
+
+	return retval;
+}
+
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	/*
+	 * 9p always caches the acl value when
+	 * instantiating the inode (v9fs_inode_from_fid)
+	 */
+	acl = get_cached_acl(inode, type);
+	BUG_ON(acl == ACL_NOT_CACHED);
+	return acl;
+}
+
+int v9fs_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		/*
+		 * In modes other than access=client, the
+		 * acl checks are done on the server
+		 */
+		return 0;
+	}
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+	return -EAGAIN;
+}
+
+static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+{
+	int retval;
+	char *name;
+	size_t size;
+	void *buffer;
+	struct inode *inode = dentry->d_inode;
+
+	set_cached_acl(inode, type, acl);
+	/* Send a setxattr request to the server */
+	size = posix_acl_xattr_size(acl->a_count);
+	buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+	retval = posix_acl_to_xattr(acl, buffer, size);
+	if (retval < 0)
+		goto err_free_out;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+err_free_out:
+	kfree(buffer);
+	return retval;
+}
+
+int v9fs_acl_chmod(struct dentry *dentry)
+{
+	int retval = 0;
+	struct posix_acl *acl, *clone;
+	struct inode *inode = dentry->d_inode;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+	if (acl) {
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+		retval = posix_acl_chmod_masq(clone, inode->i_mode);
+		if (!retval)
+			retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone);
+		posix_acl_release(clone);
+	}
+	return retval;
+}
+
+int v9fs_set_create_acl(struct dentry *dentry,
+			struct posix_acl *dpacl, struct posix_acl *pacl)
+{
+	if (dpacl)
+		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
+	if (pacl)
+		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
+	posix_acl_release(dpacl);
+	posix_acl_release(pacl);
+	return 0;
+}
+
+int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+		  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	int retval = 0;
+	mode_t mode = *modep;
+	struct posix_acl *acl = NULL;
+
+	if (!S_ISLNK(mode)) {
+		acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			mode &= ~current_umask();
+	}
+	if (acl) {
+		struct posix_acl *clone;
+
+		if (S_ISDIR(mode))
+			*dpacl = acl;
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		retval = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		retval = posix_acl_create_masq(clone, &mode);
+		if (retval < 0) {
+			posix_acl_release(clone);
+			goto cleanup;
+		}
+		if (retval > 0)
+			*pacl = clone;
+	}
+	*modep = mode;
+	return 0;
+cleanup:
+	posix_acl_release(acl);
+	return retval;
+}
+
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+			       void *buffer, size_t size, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+			      void *buffer, size_t size, int type)
+{
+	struct v9fs_session_info *v9ses;
+	struct posix_acl *acl;
+	int error;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * We allow set/get/list of acl when access=client is not specified
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+
+	acl = v9fs_get_cached_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+			       const void *value, size_t size,
+			       int flags, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
+			      const void *value, size_t size,
+			      int flags, int type)
+{
+	int retval;
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+	struct inode *inode = dentry->d_inode;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * Set the attribute on the remote without even looking at the
+	 * xattr value; we leave it to the server to validate
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_set_acl(dentry, name,
+					   value, size, flags, type);
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+	if (value) {
+		/* update the cached acl value */
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			retval = posix_acl_valid(acl);
+			if (retval)
+				goto err_out;
+		}
+	} else
+		acl = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			retval = posix_acl_equiv_mode(acl, &mode);
+			if (retval < 0)
+				goto err_out;
+			else {
+				struct iattr iattr;
+				if (retval == 0) {
+					/*
+					 * The ACL can be represented
+					 * by the mode bits. So don't
+					 * update the ACL.
+					 */
+					acl = NULL;
+					value = NULL;
+					size = 0;
+				}
+				/* Update the mode bits */
+				iattr.ia_mode = ((mode & S_IALLUGO) |
+						 (inode->i_mode & ~S_IALLUGO));
+				iattr.ia_valid = ATTR_MODE;
+				/* FIXME: should we update ctime?
+				 * Will the following setxattr update
+				 * the mode as well?
+				 */
+				v9fs_vfs_setattr_dotl(dentry, &iattr);
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		if (!S_ISDIR(inode->i_mode)) {
+			retval = -EINVAL;
+			goto err_out;
+		}
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, value, size, flags);
+	if (!retval)
+		set_cached_acl(inode, type, acl);
+err_out:
+	posix_acl_release(acl);
+	return retval;
+}
+
+const struct xattr_handler v9fs_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
+
+const struct xattr_handler v9fs_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
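
A standalone sketch (simplified userspace C, not the kernel API) of the decision posix_acl_equiv_mode() makes for v9fs_xattr_set_acl() above: an access ACL that contains nothing beyond the owner/group/other entries collapses to plain mode bits, in which case the ACL xattr is dropped and only the mode update is sent. The entry tags and return convention here mirror the kernel's, but the types are invented for illustration.

#include <stdio.h>

enum { TAG_USER_OBJ, TAG_USER, TAG_GROUP_OBJ, TAG_GROUP, TAG_MASK, TAG_OTHER };

struct acl_entry { int tag; unsigned perm; /* rwx bits, 0-7 */ };

static int acl_equiv_mode(const struct acl_entry *e, int n, unsigned *mode)
{
	unsigned m = 0;
	int i;

	for (i = 0; i < n; i++) {
		switch (e[i].tag) {
		case TAG_USER_OBJ:  m |= e[i].perm << 6; break;
		case TAG_GROUP_OBJ: m |= e[i].perm << 3; break;
		case TAG_OTHER:     m |= e[i].perm;      break;
		case TAG_USER:
		case TAG_GROUP:
		case TAG_MASK:
			return 1;	/* extended ACL: keep the xattr */
		default:
			return -1;	/* malformed entry */
		}
	}
	*mode = m;
	return 0;	/* representable by mode bits: drop the ACL */
}

int main(void)
{
	struct acl_entry plain[] = {
		{ TAG_USER_OBJ, 7 }, { TAG_GROUP_OBJ, 5 }, { TAG_OTHER, 0 },
	};
	unsigned mode;

	if (acl_equiv_mode(plain, 3, &mode) == 0)
		printf("equivalent to mode %03o\n", mode); /* prints 750 */
	return 0;
}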
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
new file mode 100644
index 00000000000..59e18c2e8c7
--- /dev/null
+++ b/fs/9p/acl.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_ACL_H
+#define FS_9P_ACL_H
+
+#ifdef CONFIG_9P_FS_POSIX_ACL
+extern int v9fs_get_acl(struct inode *, struct p9_fid *);
+extern int v9fs_check_acl(struct inode *inode, int mask);
+extern int v9fs_acl_chmod(struct dentry *);
+extern int v9fs_set_create_acl(struct dentry *,
+			       struct posix_acl *, struct posix_acl *);
+extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+			 struct posix_acl **dpacl, struct posix_acl **pacl);
+#else
+#define v9fs_check_acl NULL
+static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	return 0;
+}
+static inline int v9fs_acl_chmod(struct dentry *dentry)
+{
+	return 0;
+}
+static inline int v9fs_set_create_acl(struct dentry *dentry,
+				      struct posix_acl *dpacl,
+				      struct posix_acl *pacl)
+{
+	return 0;
+}
+static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+				struct posix_acl **dpacl,
+				struct posix_acl **pacl)
+{
+	return 0;
+}
+
+#endif
+#endif /* FS_9P_ACL_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 7317b39b281..b00223c99d7 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -97,6 +97,34 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
 	return ret;
 }
 
+/*
+ * We need to hold v9ses->rename_sem as long as we hold references
+ * to the returned path array. Array elements contain pointers to
+ * dentry names.
+ */
+static int build_path_from_dentry(struct v9fs_session_info *v9ses,
+				  struct dentry *dentry, char ***names)
+{
+	int n = 0, i;
+	char **wnames;
+	struct dentry *ds;
+
+	for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
+		n++;
+
+	wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
+	if (!wnames)
+		goto err_out;
+
+	for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent)
+		wnames[i] = (char *)ds->d_name.name;
+
+	*names = wnames;
+	return n;
+err_out:
+	return -ENOMEM;
+}
+
 /**
  * v9fs_fid_lookup - lookup for a fid, try to walk if not found
  * @dentry: dentry to look for fid in
@@ -112,7 +140,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	int i, n, l, clone, any, access;
 	u32 uid;
 	struct p9_fid *fid, *old_fid = NULL;
-	struct dentry *d, *ds;
+	struct dentry *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
 
@@ -121,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	switch (access) {
 	case V9FS_ACCESS_SINGLE:
 	case V9FS_ACCESS_USER:
+	case V9FS_ACCESS_CLIENT:
 		uid = current_fsuid();
 		any = 0;
 		break;
@@ -139,49 +168,62 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	fid = v9fs_fid_find(dentry, uid, any);
 	if (fid)
 		return fid;
-
+	/*
+	 * we don't have a matching fid. To do a TWALK we need
+	 * parent fid. We need to prevent rename when we want to
+	 * look at the parent.
+	 */
+	down_read(&v9ses->rename_sem);
 	ds = dentry->d_parent;
 	fid = v9fs_fid_find(ds, uid, any);
-	if (!fid) { /* walk from the root */
-		n = 0;
-		for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
-			n++;
-
-		fid = v9fs_fid_find(ds, uid, any);
-		if (!fid) { /* the user is not attached to the fs yet */
-			if (access == V9FS_ACCESS_SINGLE)
-				return ERR_PTR(-EPERM);
-
-			if (v9fs_proto_dotu(v9ses))
-				uname = NULL;
-			else
-				uname = v9ses->uname;
-
-			fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
-					       v9ses->aname);
-
-			if (IS_ERR(fid))
-				return fid;
-
-			v9fs_fid_add(ds, fid);
-		}
-	} else /* walk from the parent */
-		n = 1;
+	if (fid) {
+		/* Found the parent fid; do a lookup with that */
+		fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1);
+		goto fid_out;
+	}
+	up_read(&v9ses->rename_sem);
 
-	if (ds == dentry)
+	/* start from the root and try to do a lookup */
+	fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
+	if (!fid) {
+		/* the user is not attached to the fs yet */
+		if (access == V9FS_ACCESS_SINGLE)
+			return ERR_PTR(-EPERM);
+
+		if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
+			uname = NULL;
+		else
+			uname = v9ses->uname;
+
+		fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
+				       v9ses->aname);
+		if (IS_ERR(fid))
+			return fid;
+
+		v9fs_fid_add(dentry->d_sb->s_root, fid);
+	}
+	/* If we are root ourself just return that */
+	if (dentry->d_sb->s_root == dentry)
 		return fid;
-
-	wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
-	if (!wnames)
-		return ERR_PTR(-ENOMEM);
-
-	for (d = dentry, i = (n-1); i >= 0; i--, d = d->d_parent)
-		wnames[i] = (char *) d->d_name.name;
-
+	/*
+	 * Do a multipath walk with attached root.
+	 * When walking parent we need to make sure we
+	 * don't have a parallel rename happening
+	 */
+	down_read(&v9ses->rename_sem);
+	n = build_path_from_dentry(v9ses, dentry, &wnames);
+	if (n < 0) {
+		fid = ERR_PTR(n);
+		goto err_out;
+	}
 	clone = 1;
 	i = 0;
 	while (i < n) {
 		l = min(n - i, P9_MAXWELEM);
+		/*
+		 * We need to hold the rename lock when doing a multipath
+		 * walk to ensure none of the path components change
+		 */
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
 			if (old_fid) {
@@ -193,15 +235,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 				p9_client_clunk(old_fid);
 			}
 			kfree(wnames);
-			return fid;
+			goto err_out;
 		}
 		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
-
 	kfree(wnames);
-	v9fs_fid_add(dentry, fid);
+fid_out:
+	if (!IS_ERR(fid))
+		v9fs_fid_add(dentry, fid);
+err_out:
+	up_read(&v9ses->rename_sem);
 	return fid;
 }
 
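
As a side note on the walk loop above, here is a standalone sketch (userspace C, with a made-up path length) of how v9fs_fid_lookup() batches an n-component path into TWALK requests of at most P9_MAXWELEM names each; P9_MAXWELEM is 16 in the 9P protocol, and everything else is simplified for illustration.

#include <stdio.h>

#define P9_MAXWELEM 16

static int min(int a, int b) { return a < b ? a : b; }

int main(void)
{
	int n = 37;	/* pretend the dentry path had 37 components */
	int i = 0, l, clone = 1;

	while (i < n) {
		l = min(n - i, P9_MAXWELEM);
		/* the first TWALK clones the attached root fid; later
		 * ones continue from the fid the previous walk returned */
		printf("TWALK wnames[%d..%d] clone=%d\n", i, i + l - 1, clone);
		i += l;
		clone = 0;
	}
	return 0;	/* prints batches 0..15, 16..31, 32..36 */
}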
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f8b86e92cd6..2f77cd33ba8 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			v9ses->flags |= V9FS_ACCESS_USER;
 		else if (strcmp(s, "any") == 0)
 			v9ses->flags |= V9FS_ACCESS_ANY;
-		else {
+		else if (strcmp(s, "client") == 0) {
+#ifdef CONFIG_9P_FS_POSIX_ACL
+			v9ses->flags |= V9FS_ACCESS_CLIENT;
+#else
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "access=client option not supported\n");
+			kfree(s);
+			ret = -EINVAL;
+			goto free_and_return;
+#endif
+		} else {
 			v9ses->flags |= V9FS_ACCESS_SINGLE;
 			v9ses->uid = simple_strtoul(s, &e, 10);
 			if (*e != '\0')
@@ -237,6 +247,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		__putname(v9ses->uname);
 		return ERR_PTR(-ENOMEM);
 	}
+	init_rwsem(&v9ses->rename_sem);
 
 	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
 	if (rc) {
@@ -277,8 +288,18 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
+	if (!v9fs_proto_dotl(v9ses) &&
+	    ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+		/*
+		 * We support ACCESS_CLIENT only for dotl.
+		 * Fall back to ACCESS_USER
+		 */
+		v9ses->flags &= ~V9FS_ACCESS_MASK;
+		v9ses->flags |= V9FS_ACCESS_USER;
+	}
+	/*FIXME !! */
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
-	if (!v9fs_proto_dotu(v9ses) &&
+	if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
 		((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
 
 		v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index bec4d0bcb45..cb6396855e2 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -33,13 +33,17 @@
  *
  * Session flags reflect options selected by users at mount time
  */
+#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
+			 V9FS_ACCESS_USER |   \
+			 V9FS_ACCESS_CLIENT)
+#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
+
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
 	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
-	V9FS_ACCESS_ANY		= 0x0C,
-	V9FS_ACCESS_MASK	= 0x0C,
+	V9FS_ACCESS_CLIENT	= 0x10
 };
 
 /* possible values of ->cache */
@@ -104,6 +108,7 @@ struct v9fs_session_info {
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
 	struct backing_dev_info bdi;
+	struct rw_semaphore rename_sem;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
@@ -112,8 +117,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
 void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
-#define V9FS_MAGIC	0x01021997
-
 /* other default globals */
 #define V9FS_PORT	564
 #define V9FS_DEFUSER	"nobody"
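
A quick standalone check (userspace C) of the new flag encoding above: V9FS_ACCESS_ANY is now the OR of the three access bits, 0x04 | 0x08 | 0x10 = 0x1c rather than the old hard-coded 0x0c, so masking a session's flags with V9FS_ACCESS_MASK still isolates exactly one access mode per session.

#include <stdio.h>

enum p9_session_flags {
	V9FS_PROTO_2000U	= 0x01,
	V9FS_PROTO_2000L	= 0x02,
	V9FS_ACCESS_SINGLE	= 0x04,
	V9FS_ACCESS_USER	= 0x08,
	V9FS_ACCESS_CLIENT	= 0x10
};

#define V9FS_ACCESS_ANY  (V9FS_ACCESS_SINGLE | V9FS_ACCESS_USER | \
			  V9FS_ACCESS_CLIENT)
#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY

int main(void)
{
	unsigned flags = V9FS_PROTO_2000L | V9FS_ACCESS_CLIENT;

	printf("V9FS_ACCESS_ANY = 0x%02x\n", V9FS_ACCESS_ANY); /* 0x1c */
	if ((flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
		printf("session is in access=client mode\n");
	return 0;
}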
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 32ef4009d03..bab0eac873f 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -52,9 +52,10 @@ void v9fs_destroy_inode(struct inode *inode);
 #endif
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
-void v9fs_clear_inode(struct inode *inode);
+void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
@@ -63,3 +64,7 @@ int v9fs_uflags2omode(int uflags, int extended);
 
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
+int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_file_fsync_dotl(struct file *filp, int datasync);
+
+#define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 90e38449f4b..b7f2a8e3863 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -154,10 +154,40 @@ static int v9fs_launder_page(struct page *page)
 	return 0;
 }
 
+/**
+ * v9fs_direct_IO - 9P address space operation for direct I/O
+ * @rw: direction (read or write)
+ * @iocb: target I/O control block
+ * @iov: array of vectors that define I/O buffer
+ * @pos: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * The presence of v9fs_direct_IO() in the address space ops vector
+ * allows open() O_DIRECT flags which would have failed otherwise.
+ *
+ * In the non-cached mode, we shunt off direct read and write requests
+ * before the VFS gets them, so this method should never be called.
+ *
+ * Direct IO is not yet supported in the cached mode. Hence when
+ * this routine is called through generic_file_aio_read(), the
+ * read/write fails with an error.
+ *
+ */
+ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+		       loff_t pos, unsigned long nr_segs)
+{
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
+		   "off/no(%lld/%lu) EINVAL\n",
+		   iocb->ki_filp->f_path.dentry->d_name.name,
+		   (long long)pos, nr_segs);
+
+	return -EINVAL;
+}
 const struct address_space_operations v9fs_addr_operations = {
 	.readpage = v9fs_vfs_readpage,
 	.readpages = v9fs_vfs_readpages,
 	.releasepage = v9fs_release_page,
 	.invalidatepage = v9fs_invalidate_page,
 	.launder_page = v9fs_launder_page,
+	.direct_IO = v9fs_direct_IO,
 };
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 36d961f342a..b84ebe8cefe 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -87,29 +87,19 @@ static void p9stat_init(struct p9_wstat *stbuf)
 }
 
 /**
- * v9fs_dir_readdir - read a directory
+ * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
  * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * @buflen: Length in bytes of buffer to allocate
  *
  */
 
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 {
-	int over;
-	struct p9_wstat st;
-	int err = 0;
-	struct p9_fid *fid;
-	int buflen;
-	int reclen = 0;
 	struct p9_rdir *rdir;
+	struct p9_fid *fid;
+	int err = 0;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
 	fid = filp->private_data;
-
-	buflen = fid->clnt->msize - P9_IOHDRSZ;
-
-	/* allocate rdir on demand */
 	if (!fid->rdir) {
 		rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
 
@@ -128,6 +118,36 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		spin_unlock(&filp->f_dentry->d_lock);
 		kfree(rdir);
 	}
+exit:
+	return err;
+}
+
+/**
+ * v9fs_dir_readdir - read a directory
+ * @filp: opened file structure
+ * @dirent: buffer to fill dirent structures
+ * @filldir: function to populate dirent structures
+ *
+ */
+
+static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	int over;
+	struct p9_wstat st;
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	int reclen = 0;
+	struct p9_rdir *rdir;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	fid = filp->private_data;
+
+	buflen = fid->clnt->msize - P9_IOHDRSZ;
+
+	err = v9fs_alloc_rdir_buf(filp, buflen);
+	if (err)
+		goto exit;
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
@@ -176,6 +196,89 @@ exit:
 	return err;
 }
 
+/**
+ * v9fs_dir_readdir_dotl - read a directory
+ * @filp: opened file structure
+ * @dirent: buffer to fill dirent structures
+ * @filldir: function to populate dirent structures
+ *
+ */
+static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
+				 filldir_t filldir)
+{
+	int over;
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	struct p9_rdir *rdir;
+	struct p9_dirent curdirent;
+	u64 oldoffset = 0;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	fid = filp->private_data;
+
+	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
+
+	err = v9fs_alloc_rdir_buf(filp, buflen);
+	if (err)
+		goto exit;
+	rdir = (struct p9_rdir *) fid->rdir;
+
+	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
+
+	while (err == 0) {
+		if (rdir->tail == rdir->head) {
+			err = p9_client_readdir(fid, rdir->buf, buflen,
+						filp->f_pos);
+			if (err <= 0)
+				goto unlock_and_exit;
+
+			rdir->head = 0;
+			rdir->tail = err;
+		}
+
+		while (rdir->head < rdir->tail) {
+
+			err = p9dirent_read(rdir->buf + rdir->head,
+					    rdir->tail - rdir->head,
+					    &curdirent,
+					    fid->clnt->proto_version);
+			if (err < 0) {
+				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+				err = -EIO;
+				goto unlock_and_exit;
+			}
+
+			/* d_off in the dirent structure tracks the offset of
+			 * the next dirent in the dir. However, filldir()
+			 * expects the offset of the current dirent. Hence
+			 * while calling filldir send the offset from the
+			 * previous dirent structure.
+			 */
+			over = filldir(dirent, curdirent.d_name,
+				       strlen(curdirent.d_name),
+				       oldoffset, v9fs_qid2ino(&curdirent.qid),
+				       curdirent.d_type);
+			oldoffset = curdirent.d_off;
+
+			if (over) {
+				err = 0;
+				goto unlock_and_exit;
+			}
+
+			filp->f_pos = curdirent.d_off;
+			rdir->head += err;
+		}
+	}
+
+unlock_and_exit:
+	mutex_unlock(&rdir->mutex);
+exit:
+	return err;
+}
+
 
 /**
  * v9fs_dir_release - close a directory
@@ -190,9 +293,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 
        fid = filp->private_data;
        P9_DPRINTK(P9_DEBUG_VFS,
-                       "inode: %p filp: %p fid: %d\n", inode, filp, fid->fid);
+               "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
+                       inode, filp, fid ? fid->fid : -1);
        filemap_write_and_wait(inode->i_mapping);
-       p9_client_clunk(fid);
+       if (fid)
+               p9_client_clunk(fid);
        return 0;
 }
 
@@ -207,7 +312,8 @@ const struct file_operations v9fs_dir_operations = {
 const struct file_operations v9fs_dir_operations_dotl = {
        .read = generic_read_dir,
        .llseek = generic_file_llseek,
-       .readdir = v9fs_dir_readdir,
+       .readdir = v9fs_dir_readdir_dotl,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
+       .fsync = v9fs_file_fsync_dotl,
 };
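
As the comment inside v9fs_dir_readdir_dotl() notes, d_off in a 9P dirent is the offset of the *next* entry, while filldir() expects the offset of the current one. A compact sketch of that convention, with next_dirent() as a hypothetical stand-in for the p9dirent_read() loop over rdir->buf shown above:

/* Illustrative sketch of the offset bookkeeping used above. */
static void fill_entries(struct file *filp, void *dirent, filldir_t filldir)
{
        struct p9_dirent de;
        u64 prev_off = 0;

        while (next_dirent(filp, &de) > 0) {    /* hypothetical helper */
                /* hand filldir the previous entry's offset, not de.d_off */
                if (filldir(dirent, de.d_name, strlen(de.d_name),
                            prev_off, v9fs_qid2ino(&de.qid), de.d_type))
                        break;                  /* caller's buffer is full */
                prev_off = de.d_off;
                filp->f_pos = de.d_off;         /* resume point next call */
        }
}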
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2bedc6c94fc..240c3067439 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -33,6 +33,7 @@
 #include <linux/inet.h>
 #include <linux/list.h>
 #include <linux/pagemap.h>
+#include <linux/utsname.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -44,6 +45,7 @@
 #include "cache.h"
 
 static const struct file_operations v9fs_cached_file_operations;
+static const struct file_operations v9fs_cached_file_operations_dotl;
 
 /**
  * v9fs_file_open - open a file (or directory)
@@ -59,9 +61,13 @@ int v9fs_file_open(struct inode *inode, struct file *file)
        struct p9_fid *fid;
        int omode;
 
-       P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
+       P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
        v9ses = v9fs_inode2v9ses(inode);
-       omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
+       if (v9fs_proto_dotl(v9ses))
+               omode = file->f_flags;
+       else
+               omode = v9fs_uflags2omode(file->f_flags,
+                                       v9fs_proto_dotu(v9ses));
        fid = file->private_data;
        if (!fid) {
                fid = v9fs_fid_clone(file->f_path.dentry);
@@ -73,11 +79,12 @@ int v9fs_file_open(struct inode *inode, struct file *file)
                        p9_client_clunk(fid);
                        return err;
                }
-               if (omode & P9_OTRUNC) {
+               if (file->f_flags & O_TRUNC) {
                        i_size_write(inode, 0);
                        inode->i_blocks = 0;
                }
-               if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
+               if ((file->f_flags & O_APPEND) &&
+                       (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
                        generic_file_llseek(file, 0, SEEK_END);
        }
 
@@ -87,6 +94,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
                /* enable cached file options */
                if(file->f_op == &v9fs_file_operations)
                        file->f_op = &v9fs_cached_file_operations;
+               else if (file->f_op == &v9fs_file_operations_dotl)
+                       file->f_op = &v9fs_cached_file_operations_dotl;
 
 #ifdef CONFIG_9P_FSCACHE
                v9fs_cache_inode_set_cookie(inode, file);
@@ -125,6 +134,206 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
        return res;
 }
 
+static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+       struct p9_flock flock;
+       struct p9_fid *fid;
+       uint8_t status;
+       int res = 0;
+       unsigned char fl_type;
+
+       fid = filp->private_data;
+       BUG_ON(fid == NULL);
+
+       if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
+               BUG();
+
+       res = posix_lock_file_wait(filp, fl);
+       if (res < 0)
+               goto out;
+
+       /* convert posix lock to p9 tlock args */
+       memset(&flock, 0, sizeof(flock));
+       flock.type = fl->fl_type;
+       flock.start = fl->fl_start;
+       if (fl->fl_end == OFFSET_MAX)
+               flock.length = 0;
+       else
+               flock.length = fl->fl_end - fl->fl_start + 1;
+       flock.proc_id = fl->fl_pid;
+       flock.client_id = utsname()->nodename;
+       if (IS_SETLKW(cmd))
+               flock.flags = P9_LOCK_FLAGS_BLOCK;
+
+       /*
+        * if its a blocked request and we get P9_LOCK_BLOCKED as the status
+        * for lock request, keep on trying
+        */
+       for (;;) {
+               res = p9_client_lock_dotl(fid, &flock, &status);
+               if (res < 0)
+                       break;
+
+               if (status != P9_LOCK_BLOCKED)
+                       break;
+               if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
+                       break;
+               schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
+       }
+
+       /* map 9p status to VFS status */
+       switch (status) {
+       case P9_LOCK_SUCCESS:
+               res = 0;
+               break;
+       case P9_LOCK_BLOCKED:
+               res = -EAGAIN;
+               break;
+       case P9_LOCK_ERROR:
+       case P9_LOCK_GRACE:
+               res = -ENOLCK;
+               break;
+       default:
+               BUG();
+       }
+
+       /*
+        * incase server returned error for lock request, revert
+        * it locally
+        */
+       if (res < 0 && fl->fl_type != F_UNLCK) {
+               fl_type = fl->fl_type;
+               fl->fl_type = F_UNLCK;
+               res = posix_lock_file_wait(filp, fl);
+               fl->fl_type = fl_type;
+       }
+out:
+       return res;
+}
+
+static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
+{
+       struct p9_getlock glock;
+       struct p9_fid *fid;
+       int res = 0;
+
+       fid = filp->private_data;
+       BUG_ON(fid == NULL);
+
+       posix_test_lock(filp, fl);
+       /*
+        * if we have a conflicting lock locally, no need to validate
+        * with server
+        */
+       if (fl->fl_type != F_UNLCK)
+               return res;
+
+       /* convert posix lock to p9 tgetlock args */
+       memset(&glock, 0, sizeof(glock));
+       glock.type = fl->fl_type;
+       glock.start = fl->fl_start;
+       if (fl->fl_end == OFFSET_MAX)
+               glock.length = 0;
+       else
+               glock.length = fl->fl_end - fl->fl_start + 1;
+       glock.proc_id = fl->fl_pid;
+       glock.client_id = utsname()->nodename;
+
+       res = p9_client_getlock_dotl(fid, &glock);
+       if (res < 0)
+               return res;
+       if (glock.type != F_UNLCK) {
+               fl->fl_type = glock.type;
+               fl->fl_start = glock.start;
+               if (glock.length == 0)
+                       fl->fl_end = OFFSET_MAX;
+               else
+                       fl->fl_end = glock.start + glock.length - 1;
+               fl->fl_pid = glock.proc_id;
+       } else
+               fl->fl_type = F_UNLCK;
+
+       return res;
+}
+
+/**
+ * v9fs_file_lock_dotl - lock a file (or directory)
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ */
+
+static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
+{
+       struct inode *inode = filp->f_path.dentry->d_inode;
+       int ret = -ENOLCK;
+
+       P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+               cmd, fl, filp->f_path.dentry->d_name.name);
+
+       /* No mandatory locks */
+       if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+               goto out_err;
+
+       if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+               filemap_write_and_wait(inode->i_mapping);
+               invalidate_mapping_pages(&inode->i_data, 0, -1);
+       }
+
+       if (IS_SETLK(cmd) || IS_SETLKW(cmd))
+               ret = v9fs_file_do_lock(filp, cmd, fl);
+       else if (IS_GETLK(cmd))
+               ret = v9fs_file_getlock(filp, fl);
+       else
+               ret = -EINVAL;
+out_err:
+       return ret;
+}
+
+/**
+ * v9fs_file_flock_dotl - lock a file
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ */
+
+static int v9fs_file_flock_dotl(struct file *filp, int cmd,
+       struct file_lock *fl)
+{
+       struct inode *inode = filp->f_path.dentry->d_inode;
+       int ret = -ENOLCK;
+
+       P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+               cmd, fl, filp->f_path.dentry->d_name.name);
+
+       /* No mandatory locks */
+       if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+               goto out_err;
+
+       if (!(fl->fl_flags & FL_FLOCK))
+               goto out_err;
+
+       if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+               filemap_write_and_wait(inode->i_mapping);
+               invalidate_mapping_pages(&inode->i_data, 0, -1);
+       }
+       /* Convert flock to posix lock */
+       fl->fl_owner = (fl_owner_t)filp;
+       fl->fl_start = 0;
+       fl->fl_end = OFFSET_MAX;
+       fl->fl_flags |= FL_POSIX;
+       fl->fl_flags ^= FL_FLOCK;
+
+       if (IS_SETLK(cmd) | IS_SETLKW(cmd))
+               ret = v9fs_file_do_lock(filp, cmd, fl);
+       else
+               ret = -EINVAL;
+out_err:
+       return ret;
+}
+
 /**
  * v9fs_file_readn - read from a file
  * @filp: file pointer to read
@@ -139,7 +348,7 @@ ssize_t
 v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
               u64 offset)
 {
-       int n, total;
+       int n, total, size;
        struct p9_fid *fid = filp->private_data;
 
        P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
@@ -147,6 +356,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 
        n = 0;
        total = 0;
+       size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
        do {
                n = p9_client_read(fid, data, udata, offset, count);
                if (n <= 0)
@@ -160,7 +370,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
                offset += n;
                count -= n;
                total += n;
-       } while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ));
+       } while (count > 0 && n == size);
 
        if (n < 0)
                total = n;
@@ -183,11 +393,13 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 {
        int ret;
        struct p9_fid *fid;
+       size_t size;
 
        P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
        fid = filp->private_data;
 
-       if (count > (fid->clnt->msize - P9_IOHDRSZ))
+       size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
+       if (count > size)
                ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
        else
                ret = p9_client_read(fid, NULL, udata, *offset, count);
@@ -211,7 +423,9 @@ static ssize_t
 v9fs_file_write(struct file *filp, const char __user * data,
                size_t count, loff_t * offset)
 {
-       int n, rsize, total = 0;
+       ssize_t retval;
+       size_t total = 0;
+       int n;
        struct p9_fid *fid;
        struct p9_client *clnt;
        struct inode *inode = filp->f_path.dentry->d_inode;
@@ -224,16 +438,19 @@ v9fs_file_write(struct file *filp, const char __user * data,
        fid = filp->private_data;
        clnt = fid->clnt;
 
-       rsize = fid->iounit;
-       if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
-               rsize = clnt->msize - P9_IOHDRSZ;
+       retval = generic_write_checks(filp, &origin, &count, 0);
+       if (retval)
+               goto out;
 
-       do {
-               if (count < rsize)
-                       rsize = count;
+       retval = -EINVAL;
+       if ((ssize_t) count < 0)
+               goto out;
+       retval = 0;
+       if (!count)
+               goto out;
 
-               n = p9_client_write(fid, NULL, data+total, origin+total,
-                               rsize);
+       do {
+               n = p9_client_write(fid, NULL, data+total, origin+total, count);
                if (n <= 0)
                        break;
                count -= n;
@@ -252,9 +469,11 @@ v9fs_file_write(struct file *filp, const char __user * data,
        }
 
        if (n < 0)
-               return n;
-
-       return total;
+               retval = n;
+       else
+               retval = total;
+out:
+       return retval;
 }
 
 static int v9fs_file_fsync(struct file *filp, int datasync)
@@ -272,6 +491,20 @@ static int v9fs_file_fsync(struct file *filp, int datasync)
        return retval;
 }
 
+int v9fs_file_fsync_dotl(struct file *filp, int datasync)
+{
+       struct p9_fid *fid;
+       int retval;
+
+       P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
+                       filp, datasync);
+
+       fid = filp->private_data;
+
+       retval = p9_client_fsync(fid, datasync);
+       return retval;
+}
+
 static const struct file_operations v9fs_cached_file_operations = {
        .llseek = generic_file_llseek,
        .read = do_sync_read,
@@ -284,6 +517,19 @@ static const struct file_operations v9fs_cached_file_operations = {
        .fsync = v9fs_file_fsync,
 };
 
+static const struct file_operations v9fs_cached_file_operations_dotl = {
+       .llseek = generic_file_llseek,
+       .read = do_sync_read,
+       .aio_read = generic_file_aio_read,
+       .write = v9fs_file_write,
+       .open = v9fs_file_open,
+       .release = v9fs_dir_release,
+       .lock = v9fs_file_lock_dotl,
+       .flock = v9fs_file_flock_dotl,
+       .mmap = generic_file_readonly_mmap,
+       .fsync = v9fs_file_fsync_dotl,
+};
+
 const struct file_operations v9fs_file_operations = {
        .llseek = generic_file_llseek,
        .read = v9fs_file_read,
@@ -301,7 +547,8 @@ const struct file_operations v9fs_file_operations_dotl = {
        .write = v9fs_file_write,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
-       .lock = v9fs_file_lock,
+       .lock = v9fs_file_lock_dotl,
+       .flock = v9fs_file_flock_dotl,
        .mmap = generic_file_readonly_mmap,
-       .fsync = v9fs_file_fsync,
+       .fsync = v9fs_file_fsync_dotl,
 };
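
Both new lock paths in vfs_file.c share one range conversion: a POSIX byte range [fl_start, fl_end] becomes a 9P (start, length) pair, where length 0 means "to end of file". The commit performs this inline in v9fs_file_do_lock() and v9fs_file_getlock(); factored out as a standalone sketch for clarity:

/* Sketch of the POSIX-to-9P lock range mapping used above. */
static void v9fs_range_to_9p(loff_t fl_start, loff_t fl_end,
                             u64 *start, u64 *length)
{
        *start = fl_start;
        if (fl_end == OFFSET_MAX)
                *length = 0;            /* open-ended: lock to end of file */
        else
                *length = fl_end - fl_start + 1;
}

The reverse mapping in v9fs_file_getlock() mirrors this: a returned length of 0 is turned back into fl_end = OFFSET_MAX.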
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4331b3b5ee1..34bf71b5654 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -35,6 +35,8 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -42,6 +44,8 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 #include "cache.h"
+#include "xattr.h"
+#include "acl.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
@@ -51,6 +55,10 @@ static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
 static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+                       dev_t rdev);
+
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
  * @v9ses: v9fs session information
@@ -236,6 +244,41 @@ void v9fs_destroy_inode(struct inode *inode)
 #endif
 
 /**
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * new file system object. This checks the S_ISGID to determine the owning
+ * group of the new file system object.
+ */
+
+static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+{
+       BUG_ON(dir_inode == NULL);
+
+       if (dir_inode->i_mode & S_ISGID) {
+               /* set_gid bit is set.*/
+               return dir_inode->i_gid;
+       }
+       return current_fsgid();
+}
+
+/**
+ * v9fs_dentry_from_dir_inode - helper function to get the dentry from
+ * dir inode.
+ *
+ */
+
+static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
+{
+       struct dentry *dentry;
+
+       spin_lock(&dcache_lock);
+       /* Directory should have only one entry. */
+       BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
+       dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+       spin_unlock(&dcache_lock);
+       return dentry;
+}
+
+/**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
  * @mode: mode to setup inode with
@@ -267,7 +310,13 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
        case S_IFBLK:
        case S_IFCHR:
        case S_IFSOCK:
-               if (!v9fs_proto_dotu(v9ses)) {
+               if (v9fs_proto_dotl(v9ses)) {
+                       inode->i_op = &v9fs_file_inode_operations_dotl;
+                       inode->i_fop = &v9fs_file_operations_dotl;
+               } else if (v9fs_proto_dotu(v9ses)) {
+                       inode->i_op = &v9fs_file_inode_operations;
+                       inode->i_fop = &v9fs_file_operations;
+               } else {
                        P9_DPRINTK(P9_DEBUG_ERROR,
                                   "special files without extended mode\n");
                        err = -EINVAL;
@@ -387,8 +436,10 @@ error:
  * @inode: inode to release
  *
  */
-void v9fs_clear_inode(struct inode *inode)
+void v9fs_evict_inode(struct inode *inode)
 {
+       truncate_inode_pages(inode->i_mapping, 0);
+       end_writeback(inode);
        filemap_fdatawrite(inode->i_mapping);
 
 #ifdef CONFIG_9P_FSCACHE
@@ -396,23 +447,14 @@ void v9fs_clear_inode(struct inode *inode)
 #endif
 }
 
-/**
- * v9fs_inode_from_fid - populate an inode by issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-
 static struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
            struct super_block *sb)
 {
        int err, umode;
-       struct inode *ret;
+       struct inode *ret = NULL;
        struct p9_wstat *st;
 
-       ret = NULL;
        st = p9_client_stat(fid);
        if (IS_ERR(st))
                return ERR_CAST(st);
@@ -433,15 +475,67 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 #endif
        p9stat_free(st);
        kfree(st);
-
        return ret;
-
 error:
        p9stat_free(st);
        kfree(st);
        return ERR_PTR(err);
 }
 
+static struct inode *
+v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+       struct super_block *sb)
+{
+       struct inode *ret = NULL;
+       int err;
+       struct p9_stat_dotl *st;
+
+       st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+       if (IS_ERR(st))
+               return ERR_CAST(st);
+
+       ret = v9fs_get_inode(sb, st->st_mode);
+       if (IS_ERR(ret)) {
+               err = PTR_ERR(ret);
+               goto error;
+       }
+
+       v9fs_stat2inode_dotl(st, ret);
+       ret->i_ino = v9fs_qid2ino(&st->qid);
+#ifdef CONFIG_9P_FSCACHE
+       v9fs_vcookie_set_qid(ret, &st->qid);
+       v9fs_cache_inode_get_cookie(ret);
+#endif
+       err = v9fs_get_acl(ret, fid);
+       if (err) {
+               iput(ret);
+               goto error;
+       }
+       kfree(st);
+       return ret;
+error:
+       kfree(st);
+       return ERR_PTR(err);
+}
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+                   struct super_block *sb)
+{
+       if (v9fs_proto_dotl(v9ses))
+               return v9fs_inode_dotl(v9ses, fid, sb);
+       else
+               return v9fs_inode(v9ses, fid, sb);
+}
+
 /**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
@@ -470,13 +564,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
        return retval;
 }
 
-static int
-v9fs_open_created(struct inode *inode, struct file *file)
-{
-       return 0;
-}
-
-
 /**
  * v9fs_create - Create a file
  * @v9ses: session information
@@ -563,6 +650,144 @@ error:
 }
 
 /**
+ * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
+ * @dir: directory inode that is being created
+ * @dentry: dentry that is being deleted
+ * @mode: create permissions
+ * @nd: path information
+ *
+ */
+
+static int
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
+               struct nameidata *nd)
+{
+       int err = 0;
+       char *name = NULL;
+       gid_t gid;
+       int flags;
+       mode_t mode;
+       struct v9fs_session_info *v9ses;
+       struct p9_fid *fid = NULL;
+       struct p9_fid *dfid, *ofid;
+       struct file *filp;
+       struct p9_qid qid;
+       struct inode *inode;
+       struct posix_acl *pacl = NULL, *dacl = NULL;
+
+       v9ses = v9fs_inode2v9ses(dir);
+       if (nd && nd->flags & LOOKUP_OPEN)
+               flags = nd->intent.open.flags - 1;
+       else {
+               /*
+                * create call without LOOKUP_OPEN is due
+                * to mknod of regular files. So use mknod
+                * operation.
+                */
+               return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+       }
+
+       name = (char *) dentry->d_name.name;
+       P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
+                       "mode:0x%x\n", name, flags, omode);
+
+       dfid = v9fs_fid_lookup(dentry->d_parent);
+       if (IS_ERR(dfid)) {
+               err = PTR_ERR(dfid);
+               P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+               return err;
+       }
+
+       /* clone a fid to use for creation */
+       ofid = p9_client_walk(dfid, 0, NULL, 1);
+       if (IS_ERR(ofid)) {
+               err = PTR_ERR(ofid);
+               P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+               return err;
+       }
+
+       gid = v9fs_get_fsgid_for_create(dir);
+
+       mode = omode;
+       /* Update mode based on ACL value */
+       err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+       if (err) {
+               P9_DPRINTK(P9_DEBUG_VFS,
+                          "Failed to get acl values in creat %d\n", err);
+               goto error;
+       }
+       err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+       if (err < 0) {
+               P9_DPRINTK(P9_DEBUG_VFS,
+                          "p9_client_open_dotl failed in creat %d\n",
+                          err);
+               goto error;
+       }
+       /* instantiate inode and assign the unopened fid to the dentry */
+       if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
+           (nd && nd->flags & LOOKUP_OPEN)) {
+               fid = p9_client_walk(dfid, 1, &name, 1);
+               if (IS_ERR(fid)) {
+                       err = PTR_ERR(fid);
+                       P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                               err);
+                       fid = NULL;
+                       goto error;
+               }
+
+               inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+               if (IS_ERR(inode)) {
+                       err = PTR_ERR(inode);
+                       P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                               err);
+                       goto error;
+               }
+               dentry->d_op = &v9fs_cached_dentry_operations;
+               d_instantiate(dentry, inode);
+               err = v9fs_fid_add(dentry, fid);
+               if (err < 0)
+                       goto error;
+               /* The fid would get clunked via a dput */
+               fid = NULL;
+       } else {
+               /*
+                * Not in cached mode. No need to populate
+                * inode with stat. We need to get an inode
+                * so that we can set the acl with dentry
+                */
+               inode = v9fs_get_inode(dir->i_sb, mode);
+               if (IS_ERR(inode)) {
+                       err = PTR_ERR(inode);
+                       goto error;
+               }
+               dentry->d_op = &v9fs_dentry_operations;
+               d_instantiate(dentry, inode);
+       }
+       /* Now set the ACL based on the default value */
+       v9fs_set_create_acl(dentry, dacl, pacl);
+
+       /* if we are opening a file, assign the open fid to the file */
+       if (nd && nd->flags & LOOKUP_OPEN) {
+               filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
+               if (IS_ERR(filp)) {
+                       p9_client_clunk(ofid);
+                       return PTR_ERR(filp);
+               }
+               filp->private_data = ofid;
+       } else
+               p9_client_clunk(ofid);
+
+       return 0;
+
+error:
+       if (ofid)
+               p9_client_clunk(ofid);
+       if (fid)
+               p9_client_clunk(fid);
+       return err;
+}
+
+/**
  * v9fs_vfs_create - VFS hook to create files
  * @dir: directory inode that is being created
  * @dentry: dentry that is being deleted
@@ -602,7 +827,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 
        /* if we are opening a file, assign the open fid to the file */
        if (nd && nd->flags & LOOKUP_OPEN) {
-               filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+               filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
                if (IS_ERR(filp)) {
                        err = PTR_ERR(filp);
                        goto error;
@@ -652,6 +877,107 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        return err;
 }
 
+
+/**
+ * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
+ * @dir: inode that is being unlinked
+ * @dentry: dentry that is being unlinked
+ * @mode: mode for new directory
+ *
+ */
+
+static int v9fs_vfs_mkdir_dotl(struct inode *dir,
+                       struct dentry *dentry, int omode)
+{
+       int err;
+       struct v9fs_session_info *v9ses;
+       struct p9_fid *fid = NULL, *dfid = NULL;
+       gid_t gid;
+       char *name;
+       mode_t mode;
+       struct inode *inode;
+       struct p9_qid qid;
+       struct dentry *dir_dentry;
+       struct posix_acl *dacl = NULL, *pacl = NULL;
+
+       P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+       err = 0;
+       v9ses = v9fs_inode2v9ses(dir);
+
+       omode |= S_IFDIR;
+       if (dir->i_mode & S_ISGID)
+               omode |= S_ISGID;
+
+       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dfid = v9fs_fid_lookup(dir_dentry);
+       if (IS_ERR(dfid)) {
+               err = PTR_ERR(dfid);
+               P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+               dfid = NULL;
+               goto error;
+       }
+
+       gid = v9fs_get_fsgid_for_create(dir);
+       mode = omode;
+       /* Update mode based on ACL value */
+       err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+       if (err) {
+               P9_DPRINTK(P9_DEBUG_VFS,
+                          "Failed to get acl values in mkdir %d\n", err);
+               goto error;
+       }
+       name = (char *) dentry->d_name.name;
+       err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
+       if (err < 0)
+               goto error;
+
+       /* instantiate inode and assign the unopened fid to the dentry */
+       if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+               fid = p9_client_walk(dfid, 1, &name, 1);
+               if (IS_ERR(fid)) {
+                       err = PTR_ERR(fid);
+                       P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                               err);
+                       fid = NULL;
+                       goto error;
+               }
+
+               inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+               if (IS_ERR(inode)) {
+                       err = PTR_ERR(inode);
+                       P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                               err);
+                       goto error;
+               }
+               dentry->d_op = &v9fs_cached_dentry_operations;
+               d_instantiate(dentry, inode);
+               err = v9fs_fid_add(dentry, fid);
+               if (err < 0)
+                       goto error;
+               fid = NULL;
+       } else {
+               /*
+                * Not in cached mode. No need to populate
+                * inode with stat. We need to get an inode
+                * so that we can set the acl with dentry
+                */
+               inode = v9fs_get_inode(dir->i_sb, mode);
+               if (IS_ERR(inode)) {
+                       err = PTR_ERR(inode);
+                       goto error;
+               }
+               dentry->d_op = &v9fs_dentry_operations;
+               d_instantiate(dentry, inode);
+       }
+       /* Now set the ACL based on the default value */
+       v9fs_set_create_acl(dentry, dacl, pacl);
+
+error:
+       if (fid)
+               p9_client_clunk(fid);
+       return err;
+}
+
 /**
  * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
  * @dir: inode that is being walked from
@@ -678,6 +1004,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
        sb = dir->i_sb;
        v9ses = v9fs_inode2v9ses(dir);
+       /* We can walk d_parent because we hold the dir->i_mutex */
        dfid = v9fs_fid_lookup(dentry->d_parent);
        if (IS_ERR(dfid))
                return ERR_CAST(dfid);
@@ -703,7 +1030,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
        result = v9fs_fid_add(dentry, fid);
        if (result < 0)
-               goto error;
+               goto error_iput;
 
 inst_out:
        if (v9ses->cache)
@@ -714,6 +1041,8 @@ inst_out:
        d_add(dentry, inode);
        return NULL;
 
+error_iput:
+       iput(inode);
 error:
        p9_client_clunk(fid);
 
@@ -785,27 +1114,33 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                goto clunk_olddir;
        }
 
+       down_write(&v9ses->rename_sem);
        if (v9fs_proto_dotl(v9ses)) {
                retval = p9_client_rename(oldfid, newdirfid,
                                        (char *) new_dentry->d_name.name);
                if (retval != -ENOSYS)
                        goto clunk_newdir;
        }
+       if (old_dentry->d_parent != new_dentry->d_parent) {
+               /*
+                * 9P .u can only handle file rename in the same directory
+                */
 
-       /* 9P can only handle file rename in the same directory */
-       if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
                P9_DPRINTK(P9_DEBUG_ERROR,
                                "old dir and new dir are different\n");
                retval = -EXDEV;
                goto clunk_newdir;
        }
-
        v9fs_blank_wstat(&wstat);
        wstat.muid = v9ses->uname;
        wstat.name = (char *) new_dentry->d_name.name;
        retval = p9_client_wstat(oldfid, &wstat);
 
 clunk_newdir:
+       if (!retval)
+               /* successful rename */
+               d_move(old_dentry, new_dentry);
+       up_write(&v9ses->rename_sem);
        p9_client_clunk(newdirfid);
 
 clunk_olddir:
@@ -849,6 +1184,43 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
        generic_fillattr(dentry->d_inode, stat);
 
+       p9stat_free(st);
+       kfree(st);
+       return 0;
+}
+
+static int
+v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
+                struct kstat *stat)
+{
+       int err;
+       struct v9fs_session_info *v9ses;
+       struct p9_fid *fid;
+       struct p9_stat_dotl *st;
+
+       P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+       err = -EPERM;
+       v9ses = v9fs_inode2v9ses(dentry->d_inode);
+       if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+               return simple_getattr(mnt, dentry, stat);
+
+       fid = v9fs_fid_lookup(dentry);
+       if (IS_ERR(fid))
+               return PTR_ERR(fid);
+
+       /* Ask for all the fields in stat structure. Server will return
+        * whatever it supports
+        */
+
+       st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+       if (IS_ERR(st))
+               return PTR_ERR(st);
+
+       v9fs_stat2inode_dotl(st, dentry->d_inode);
+       generic_fillattr(dentry->d_inode, stat);
+       /* Change block size to what the server returned */
+       stat->blksize = st->st_blksize;
+
        kfree(st);
        return 0;
 }
@@ -896,10 +1268,77 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
        }
 
        retval = p9_client_wstat(fid, &wstat);
-       if (retval >= 0)
-               retval = inode_setattr(dentry->d_inode, iattr);
+       if (retval < 0)
+               return retval;
+
+       if ((iattr->ia_valid & ATTR_SIZE) &&
+           iattr->ia_size != i_size_read(dentry->d_inode)) {
+               retval = vmtruncate(dentry->d_inode, iattr->ia_size);
+               if (retval)
+                       return retval;
+       }
 
-       return retval;
+       setattr_copy(dentry->d_inode, iattr);
+       mark_inode_dirty(dentry->d_inode);
+       return 0;
+}
+
+/**
+ * v9fs_vfs_setattr_dotl - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ */
+
+int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+{
+       int retval;
+       struct v9fs_session_info *v9ses;
+       struct p9_fid *fid;
+       struct p9_iattr_dotl p9attr;
+
+       P9_DPRINTK(P9_DEBUG_VFS, "\n");
+
+       retval = inode_change_ok(dentry->d_inode, iattr);
+       if (retval)
+               return retval;
+
+       p9attr.valid = iattr->ia_valid;
+       p9attr.mode = iattr->ia_mode;
+       p9attr.uid = iattr->ia_uid;
+       p9attr.gid = iattr->ia_gid;
+       p9attr.size = iattr->ia_size;
+       p9attr.atime_sec = iattr->ia_atime.tv_sec;
+       p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
+       p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
+       p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+
+       retval = -EPERM;
+       v9ses = v9fs_inode2v9ses(dentry->d_inode);
+       fid = v9fs_fid_lookup(dentry);
+       if (IS_ERR(fid))
+               return PTR_ERR(fid);
+
+       retval = p9_client_setattr(fid, &p9attr);
+       if (retval < 0)
+               return retval;
+
+       if ((iattr->ia_valid & ATTR_SIZE) &&
+           iattr->ia_size != i_size_read(dentry->d_inode)) {
+               retval = vmtruncate(dentry->d_inode, iattr->ia_size);
+               if (retval)
+                       return retval;
+       }
+
+       setattr_copy(dentry->d_inode, iattr);
+       mark_inode_dirty(dentry->d_inode);
+       if (iattr->ia_valid & ATTR_MODE) {
+               /* We also want to update ACL when we update mode bits */
+               retval = v9fs_acl_chmod(dentry);
+               if (retval < 0)
+                       return retval;
+       }
+       return 0;
 }
 
 /**
@@ -980,6 +1419,77 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 }
 
 /**
+ * v9fs_stat2inode_dotl - populate an inode structure with stat info
+ * @stat: stat structure
+ * @inode: inode to populate
+ * @sb: superblock of filesystem
+ *
+ */
+
+void
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+{
+
+       if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
+               inode->i_atime.tv_sec = stat->st_atime_sec;
+               inode->i_atime.tv_nsec = stat->st_atime_nsec;
+               inode->i_mtime.tv_sec = stat->st_mtime_sec;
+               inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+               inode->i_ctime.tv_sec = stat->st_ctime_sec;
+               inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+               inode->i_uid = stat->st_uid;
+               inode->i_gid = stat->st_gid;
+               inode->i_nlink = stat->st_nlink;
+               inode->i_mode = stat->st_mode;
+               inode->i_rdev = new_decode_dev(stat->st_rdev);
+
+               if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
+                       init_special_inode(inode, inode->i_mode, inode->i_rdev);
+
+               i_size_write(inode, stat->st_size);
+               inode->i_blocks = stat->st_blocks;
+       } else {
+               if (stat->st_result_mask & P9_STATS_ATIME) {
+                       inode->i_atime.tv_sec = stat->st_atime_sec;
+                       inode->i_atime.tv_nsec = stat->st_atime_nsec;
+               }
+               if (stat->st_result_mask & P9_STATS_MTIME) {
+                       inode->i_mtime.tv_sec = stat->st_mtime_sec;
+                       inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+               }
+               if (stat->st_result_mask & P9_STATS_CTIME) {
+                       inode->i_ctime.tv_sec = stat->st_ctime_sec;
+                       inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+               }
+               if (stat->st_result_mask & P9_STATS_UID)
+                       inode->i_uid = stat->st_uid;
+               if (stat->st_result_mask & P9_STATS_GID)
+                       inode->i_gid = stat->st_gid;
+               if (stat->st_result_mask & P9_STATS_NLINK)
+                       inode->i_nlink = stat->st_nlink;
+               if (stat->st_result_mask & P9_STATS_MODE) {
+                       inode->i_mode = stat->st_mode;
+                       if ((S_ISBLK(inode->i_mode)) ||
+                                               (S_ISCHR(inode->i_mode)))
+                               init_special_inode(inode, inode->i_mode,
+                                                               inode->i_rdev);
+               }
+               if (stat->st_result_mask & P9_STATS_RDEV)
+                       inode->i_rdev = new_decode_dev(stat->st_rdev);
+               if (stat->st_result_mask & P9_STATS_SIZE)
+                       i_size_write(inode, stat->st_size);
+               if (stat->st_result_mask & P9_STATS_BLOCKS)
+                       inode->i_blocks = stat->st_blocks;
+       }
+       if (stat->st_result_mask & P9_STATS_GEN)
+               inode->i_generation = stat->st_gen;
+
+       /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
+        * because the inode structure does not have fields for them.
+        */
+}
+
+/**
  * v9fs_qid2ino - convert qid into inode number
  * @qid: qid to hash
  *
@@ -1042,6 +1552,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 
        retval = strnlen(buffer, buflen);
 done:
+       p9stat_free(st);
        kfree(st);
        return retval;
 }
@@ -1128,6 +1639,94 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 }
 
 /**
+ * v9fs_vfs_symlink_dotl - helper function to create symlinks
+ * @dir: directory inode containing symlink
+ * @dentry: dentry for symlink
+ * @symname: symlink data
+ *
+ * See Also: 9P2000.L RFC for more information
+ *
+ */
+
+static int
+v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
+               const char *symname)
+{
+       struct v9fs_session_info *v9ses;
+       struct p9_fid *dfid;
+       struct p9_fid *fid = NULL;
+       struct inode *inode;
+       struct p9_qid qid;
+       char *name;
+       int err;
+       gid_t gid;
+
+       name = (char *) dentry->d_name.name;
+       P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
+                       dir->i_ino, name, symname);
+       v9ses = v9fs_inode2v9ses(dir);
+
+       dfid = v9fs_fid_lookup(dentry->d_parent);
+       if (IS_ERR(dfid)) {
+               err = PTR_ERR(dfid);
+               P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+               return err;
+       }
+
+       gid = v9fs_get_fsgid_for_create(dir);
+
+       /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
+       err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
+
+       if (err < 0) {
+               P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+               goto error;
+       }
+
+       if (v9ses->cache) {
+               /* Now walk from the parent so we can get an unopened fid. */
+               fid = p9_client_walk(dfid, 1, &name, 1);
+               if (IS_ERR(fid)) {
+                       err = PTR_ERR(fid);
+                       P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                       err);
+                       fid = NULL;
+                       goto error;
+               }
+
+               /* instantiate inode and assign the unopened fid to dentry */
+               inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+               if (IS_ERR(inode)) {
+                       err = PTR_ERR(inode);
+                       P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                       err);
+                       goto error;
+               }
+               dentry->d_op = &v9fs_cached_dentry_operations;
+               d_instantiate(dentry, inode);
+               err = v9fs_fid_add(dentry, fid);
+               if (err < 0)
+                       goto error;
+               fid = NULL;
+       } else {
+               /* Not in cached mode. No need to populate inode with stat */
+               inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+               if (IS_ERR(inode)) {
+                       err = PTR_ERR(inode);
+                       goto error;
+               }
+               dentry->d_op = &v9fs_dentry_operations;
+               d_instantiate(dentry, inode);
+       }
+
+error:
+       if (fid)
+               p9_client_clunk(fid);
+
+       return err;
+}
+
+/**
  * v9fs_vfs_symlink - helper function to create symlinks
  * @dir: directory inode containing symlink
  * @dentry: dentry for symlink
@@ -1186,6 +1785,77 @@ clunk_fid:
 }
 
 /**
+ * v9fs_vfs_link_dotl - create a hardlink for dotl
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ */
+
+static int
+v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
+               struct dentry *dentry)
+{
+       int err;
+       struct p9_fid *dfid, *oldfid;
+       char *name;
+       struct v9fs_session_info *v9ses;
+       struct dentry *dir_dentry;
+
+       P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
+                       dir->i_ino, old_dentry->d_name.name,
+                       dentry->d_name.name);
+
+       v9ses = v9fs_inode2v9ses(dir);
+       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dfid = v9fs_fid_lookup(dir_dentry);
+       if (IS_ERR(dfid))
+               return PTR_ERR(dfid);
+
+       oldfid = v9fs_fid_lookup(old_dentry);
+       if (IS_ERR(oldfid))
+               return PTR_ERR(oldfid);
+
+       name = (char *) dentry->d_name.name;
+
+       err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
+
+       if (err < 0) {
+               P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+               return err;
+       }
+
+       if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+               /* Get the latest stat info from server. */
+               struct p9_fid *fid;
+               struct p9_stat_dotl *st;
+
+               fid = v9fs_fid_lookup(old_dentry);
+               if (IS_ERR(fid))
+                       return PTR_ERR(fid);
+
+               st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+               if (IS_ERR(st))
+                       return PTR_ERR(st);
+
+               v9fs_stat2inode_dotl(st, old_dentry->d_inode);
+
+               kfree(st);
+       } else {
+               /* Caching disabled. No need to get upto date stat info.
+                * This dentry will be released immediately. So, just hold the
+                * inode
+                */
+               ihold(old_dentry->d_inode);
+       }
+
+       dentry->d_op = old_dentry->d_op;
+       d_instantiate(dentry, old_dentry->d_inode);
+
+       return err;
+}
+
+/**
  * v9fs_vfs_mknod - create a special file
  * @dir: inode destination for new link
  * @dentry: dentry for file
@@ -1230,6 +1900,160 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
        return retval;
 }
 
+/**
+ * v9fs_vfs_mknod_dotl - create a special file
+ * @dir: inode destination for new link
+ * @dentry: dentry for file
+ * @mode: mode for creation
+ * @rdev: device associated with special file
+ *
+ */
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+               dev_t rdev)
+{
+       int err;
+       char *name;
+       mode_t mode;
+       struct v9fs_session_info *v9ses;
+       struct p9_fid *fid = NULL, *dfid = NULL;
+       struct inode *inode;
+       gid_t gid;
+       struct p9_qid qid;
+       struct dentry *dir_dentry;
+       struct posix_acl *dacl = NULL, *pacl = NULL;
+
+       P9_DPRINTK(P9_DEBUG_VFS,
+               " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
+               dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
+
+       if (!new_valid_dev(rdev))
+               return -EINVAL;
+
+       v9ses = v9fs_inode2v9ses(dir);
+       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dfid = v9fs_fid_lookup(dir_dentry);
+       if (IS_ERR(dfid)) {
+               err = PTR_ERR(dfid);
+               P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+               dfid = NULL;
+               goto error;
+       }
+
+       gid = v9fs_get_fsgid_for_create(dir);
+       mode = omode;
+       /* Update mode based on ACL value */
+       err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+       if (err) {
+               P9_DPRINTK(P9_DEBUG_VFS,
+                          "Failed to get acl values in mknod %d\n", err);
+               goto error;
+       }
+       name = (char *) dentry->d_name.name;
+
+       err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
+       if (err < 0)
+               goto error;
+
+       /* instantiate inode and assign the unopened fid to the dentry */
+       if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+               fid = p9_client_walk(dfid, 1, &name, 1);
+               if (IS_ERR(fid)) {
+                       err = PTR_ERR(fid);
+                       P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                               err);
+                       fid = NULL;
+                       goto error;
+               }
+
+               inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+               if (IS_ERR(inode)) {
+                       err = PTR_ERR(inode);
+                       P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                               err);
+                       goto error;
+               }
+               dentry->d_op = &v9fs_cached_dentry_operations;
+               d_instantiate(dentry, inode);
+               err = v9fs_fid_add(dentry, fid);
+               if (err < 0)
+                       goto error;
+               fid = NULL;
+       } else {
+               /*
+                * Not in cached mode. No need to populate inode with stat.
+                * socket syscall returns a fd, so we need instantiate
+                */
+               inode = v9fs_get_inode(dir->i_sb, mode);
+               if (IS_ERR(inode)) {
+                       err = PTR_ERR(inode);
+                       goto error;
+               }
+               dentry->d_op = &v9fs_dentry_operations;
+               d_instantiate(dentry, inode);
+       }
+       /* Now set the ACL based on the default value */
+       v9fs_set_create_acl(dentry, dacl, pacl);
+error:
+       if (fid)
+               p9_client_clunk(fid);
+       return err;
+}
+
+static int
+v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
+{
+       int retval;
+       struct p9_fid *fid;
+       char *target = NULL;
+
+       P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+       retval = -EPERM;
+       fid = v9fs_fid_lookup(dentry);
+       if (IS_ERR(fid))
+               return PTR_ERR(fid);
+
+       retval = p9_client_readlink(fid, &target);
+       if (retval < 0)
+               return retval;
+
+       strncpy(buffer, target, buflen);
+       P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
+
+       retval = strnlen(buffer, buflen);
+       return retval;
+}
+
+/**
+ * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ */
+
+static void *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+{
+       int len = 0;
+       char *link = __getname();
+
+       P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
+
+       if (!link)
+               link = ERR_PTR(-ENOMEM);
+       else {
+               len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
+               if (len < 0) {
+                       __putname(link);
+                       link = ERR_PTR(len);
+               } else
+                       link[min(len, PATH_MAX-1)] = 0;
+       }
+       nd_set_link(nd, link);
+
+       return NULL;
+}
+
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
@@ -1245,17 +2069,22 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 };
 
 static const struct inode_operations v9fs_dir_inode_operations_dotl = {
-       .create = v9fs_vfs_create,
+       .create = v9fs_vfs_create_dotl,
        .lookup = v9fs_vfs_lookup,
-       .symlink = v9fs_vfs_symlink,
-       .link = v9fs_vfs_link,
+       .link = v9fs_vfs_link_dotl,
+       .symlink = v9fs_vfs_symlink_dotl,
        .unlink = v9fs_vfs_unlink,
-       .mkdir = v9fs_vfs_mkdir,
+       .mkdir = v9fs_vfs_mkdir_dotl,
        .rmdir = v9fs_vfs_rmdir,
-       .mknod = v9fs_vfs_mknod,
+       .mknod = v9fs_vfs_mknod_dotl,
        .rename = v9fs_vfs_rename,
-       .getattr = v9fs_vfs_getattr,
-       .setattr = v9fs_vfs_setattr,
+       .getattr = v9fs_vfs_getattr_dotl,
+       .setattr = v9fs_vfs_setattr_dotl,
+       .setxattr = generic_setxattr,
+       .getxattr = generic_getxattr,
+       .removexattr = generic_removexattr,
+       .listxattr = v9fs_listxattr,
+       .check_acl = v9fs_check_acl,
 };
 
 static const struct inode_operations v9fs_dir_inode_operations = {
@@ -1276,8 +2105,13 @@ static const struct inode_operations v9fs_file_inode_operations = {
 };
 
 static const struct inode_operations v9fs_file_inode_operations_dotl = {
-       .getattr = v9fs_vfs_getattr,
-       .setattr = v9fs_vfs_setattr,
+       .getattr = v9fs_vfs_getattr_dotl,
+       .setattr = v9fs_vfs_setattr_dotl,
+       .setxattr = generic_setxattr,
+       .getxattr = generic_getxattr,
+       .removexattr = generic_removexattr,
+       .listxattr = v9fs_listxattr,
+       .check_acl = v9fs_check_acl,
 };
 
 static const struct inode_operations v9fs_symlink_inode_operations = {
@@ -1289,9 +2123,13 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 };
 
 static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-       .readlink = generic_readlink,
-       .follow_link = v9fs_vfs_follow_link,
+       .readlink = v9fs_vfs_readlink_dotl,
+       .follow_link = v9fs_vfs_follow_link_dotl,
        .put_link = v9fs_vfs_put_link,
-       .getattr = v9fs_vfs_getattr,
-       .setattr = v9fs_vfs_setattr,
+       .getattr = v9fs_vfs_getattr_dotl,
+       .setattr = v9fs_vfs_setattr_dotl,
+       .setxattr = generic_setxattr,
+       .getxattr = generic_getxattr,
+       .removexattr = generic_removexattr,
+       .listxattr = v9fs_listxattr,
 };
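
One pattern worth noting in v9fs_stat2inode_dotl() above: a 9P2000.L getattr reply carries st_result_mask, and only fields whose bits are set are valid. The fast path verifies that the whole P9_STATS_BASIC set came back before bulk-copying; otherwise every field is gated individually, e.g. (illustrative extract of the same pattern):

/* Sketch of the per-field gating v9fs_stat2inode_dotl() relies on. */
static void v9fs_apply_ownership(struct inode *inode, struct p9_stat_dotl *st)
{
        if (st->st_result_mask & P9_STATS_UID)
                inode->i_uid = st->st_uid;
        if (st->st_result_mask & P9_STATS_GID)
                inode->i_gid = st->st_gid;
}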
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index be74d020436..c55c614500a 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -39,12 +39,15 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
+#include <linux/magic.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "fid.h"
+#include "xattr.h"
+#include "acl.h"
 
 static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
@@ -65,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
  * v9fs_fill_super - populate superblock with info
  * @sb: superblock
  * @v9ses: session information
- * @flags: flags propagated from v9fs_get_sb()
+ * @flags: flags propagated from v9fs_mount()
  *
  */
 
@@ -77,37 +80,40 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
        sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
        sb->s_blocksize = 1 << sb->s_blocksize_bits;
        sb->s_magic = V9FS_MAGIC;
-       if (v9fs_proto_dotl(v9ses))
+       if (v9fs_proto_dotl(v9ses)) {
                sb->s_op = &v9fs_super_ops_dotl;
-       else
+               sb->s_xattr = v9fs_xattr_handlers;
+       } else
                sb->s_op = &v9fs_super_ops;
        sb->s_bdi = &v9ses->bdi;
 
        sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
            MS_NOATIME;
 
+#ifdef CONFIG_9P_FS_POSIX_ACL
+       if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
+               sb->s_flags |= MS_POSIXACL;
+#endif
+
        save_mount_options(sb, data);
 }
 
 /**
- * v9fs_get_sb - mount a superblock
+ * v9fs_mount - mount a superblock
  * @fs_type: file system type
  * @flags: mount flags
  * @dev_name: device name that was mounted
  * @data: mount options
- * @mnt: mountpoint record to be instantiated
  *
  */
 
-static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
-                      const char *dev_name, void *data,
-                      struct vfsmount *mnt)
+static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
+                      const char *dev_name, void *data)
 {
        struct super_block *sb = NULL;
        struct inode *inode = NULL;
        struct dentry *root = NULL;
        struct v9fs_session_info *v9ses = NULL;
-       struct p9_wstat *st = NULL;
        int mode = S_IRWXUGO | S_ISVTX;
        struct p9_fid *fid;
        int retval = 0;
@@ -116,24 +122,22 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
116 122
117 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); 123 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
118 if (!v9ses) 124 if (!v9ses)
119 return -ENOMEM; 125 return ERR_PTR(-ENOMEM);
120 126
121 fid = v9fs_session_init(v9ses, dev_name, data); 127 fid = v9fs_session_init(v9ses, dev_name, data);
122 if (IS_ERR(fid)) { 128 if (IS_ERR(fid)) {
123 retval = PTR_ERR(fid); 129 retval = PTR_ERR(fid);
130 /*
131 * we need to call session_close to tear down some
 132 * of the data structures set up by session_init
133 */
124 goto close_session; 134 goto close_session;
125 } 135 }
126 136
127 st = p9_client_stat(fid);
128 if (IS_ERR(st)) {
129 retval = PTR_ERR(st);
130 goto clunk_fid;
131 }
132
133 sb = sget(fs_type, NULL, v9fs_set_super, v9ses); 137 sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
134 if (IS_ERR(sb)) { 138 if (IS_ERR(sb)) {
135 retval = PTR_ERR(sb); 139 retval = PTR_ERR(sb);
136 goto free_stat; 140 goto clunk_fid;
137 } 141 }
138 v9fs_fill_super(sb, v9ses, flags, data); 142 v9fs_fill_super(sb, v9ses, flags, data);
139 143
@@ -149,37 +153,56 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
149 retval = -ENOMEM; 153 retval = -ENOMEM;
150 goto release_sb; 154 goto release_sb;
151 } 155 }
152
153 sb->s_root = root; 156 sb->s_root = root;
154 root->d_inode->i_ino = v9fs_qid2ino(&st->qid); 157 if (v9fs_proto_dotl(v9ses)) {
158 struct p9_stat_dotl *st = NULL;
159 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
160 if (IS_ERR(st)) {
161 retval = PTR_ERR(st);
162 goto release_sb;
163 }
155 164
156 v9fs_stat2inode(st, root->d_inode, sb); 165 v9fs_stat2inode_dotl(st, root->d_inode);
166 kfree(st);
167 } else {
168 struct p9_wstat *st = NULL;
169 st = p9_client_stat(fid);
170 if (IS_ERR(st)) {
171 retval = PTR_ERR(st);
172 goto release_sb;
173 }
157 174
158 v9fs_fid_add(root, fid); 175 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
159 p9stat_free(st); 176 v9fs_stat2inode(st, root->d_inode, sb);
160 kfree(st);
161 177
162P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 178 p9stat_free(st);
163 simple_set_mnt(mnt, sb); 179 kfree(st);
164 return 0; 180 }
181 retval = v9fs_get_acl(inode, fid);
182 if (retval)
183 goto release_sb;
184 v9fs_fid_add(root, fid);
165 185
166free_stat: 186 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
167 p9stat_free(st); 187 return dget(sb->s_root);
168 kfree(st);
169 188
170clunk_fid: 189clunk_fid:
171 p9_client_clunk(fid); 190 p9_client_clunk(fid);
172
173close_session: 191close_session:
174 v9fs_session_close(v9ses); 192 v9fs_session_close(v9ses);
175 kfree(v9ses); 193 kfree(v9ses);
176 return retval; 194 return ERR_PTR(retval);
177 195
178release_sb: 196release_sb:
179 p9stat_free(st); 197 /*
180 kfree(st); 198 * we will do the session_close and root dentry release
 199 * in the call below. But we need to clunk the fid, because we haven't
 200 * attached the fid to the dentry, so it won't get clunked
201 * automatically.
202 */
203 p9_client_clunk(fid);
181 deactivate_locked_super(sb); 204 deactivate_locked_super(sb);
182 return retval; 205 return ERR_PTR(retval);
183} 206}
184 207
185/** 208/**
@@ -232,7 +255,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
232 if (v9fs_proto_dotl(v9ses)) { 255 if (v9fs_proto_dotl(v9ses)) {
233 res = p9_client_statfs(fid, &rs); 256 res = p9_client_statfs(fid, &rs);
234 if (res == 0) { 257 if (res == 0) {
235 buf->f_type = rs.type; 258 buf->f_type = V9FS_MAGIC;
236 buf->f_bsize = rs.bsize; 259 buf->f_bsize = rs.bsize;
237 buf->f_blocks = rs.blocks; 260 buf->f_blocks = rs.blocks;
238 buf->f_bfree = rs.bfree; 261 buf->f_bfree = rs.bfree;
@@ -257,7 +280,7 @@ static const struct super_operations v9fs_super_ops = {
257 .destroy_inode = v9fs_destroy_inode, 280 .destroy_inode = v9fs_destroy_inode,
258#endif 281#endif
259 .statfs = simple_statfs, 282 .statfs = simple_statfs,
260 .clear_inode = v9fs_clear_inode, 283 .evict_inode = v9fs_evict_inode,
261 .show_options = generic_show_options, 284 .show_options = generic_show_options,
262 .umount_begin = v9fs_umount_begin, 285 .umount_begin = v9fs_umount_begin,
263}; 286};
@@ -268,14 +291,15 @@ static const struct super_operations v9fs_super_ops_dotl = {
268 .destroy_inode = v9fs_destroy_inode, 291 .destroy_inode = v9fs_destroy_inode,
269#endif 292#endif
270 .statfs = v9fs_statfs, 293 .statfs = v9fs_statfs,
271 .clear_inode = v9fs_clear_inode, 294 .evict_inode = v9fs_evict_inode,
272 .show_options = generic_show_options, 295 .show_options = generic_show_options,
273 .umount_begin = v9fs_umount_begin, 296 .umount_begin = v9fs_umount_begin,
274}; 297};
275 298
276struct file_system_type v9fs_fs_type = { 299struct file_system_type v9fs_fs_type = {
277 .name = "9p", 300 .name = "9p",
278 .get_sb = v9fs_get_sb, 301 .mount = v9fs_mount,
279 .kill_sb = v9fs_kill_super, 302 .kill_sb = v9fs_kill_super,
280 .owner = THIS_MODULE, 303 .owner = THIS_MODULE,
304 .fs_flags = FS_RENAME_DOES_D_MOVE,
281}; 305};
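
The v9fs_get_sb() -> v9fs_mount() change above follows the VFS-wide conversion of file_system_type: the new ->mount hook returns the root dentry (or an ERR_PTR) instead of filling in a struct vfsmount. A minimal sketch of the pattern for an sget()-based filesystem follows; example_fill_super() is a hypothetical stand-in, not part of this patch.

#include <linux/fs.h>
#include <linux/err.h>
#include <linux/dcache.h>

static int example_fill_super(struct super_block *sb, void *data, int silent);

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name,
				    void *data)
{
	struct super_block *sb;
	int err;

	sb = sget(fs_type, NULL, set_anon_super, NULL);
	if (IS_ERR(sb))
		return ERR_CAST(sb);	/* errors now travel as ERR_PTR dentries */

	err = example_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
	if (err) {
		deactivate_locked_super(sb);
		return ERR_PTR(err);
	}
	sb->s_flags |= MS_ACTIVE;
	return dget(sb->s_root);	/* the VFS consumes this reference */
}

The dget(sb->s_root) at the end is what replaces the old simple_set_mnt() call: the caller now attaches the returned dentry to the vfsmount itself.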
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
new file mode 100644
index 00000000000..43ec7df8433
--- /dev/null
+++ b/fs/9p/xattr.c
@@ -0,0 +1,172 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/fs.h>
17#include <linux/sched.h>
18#include <net/9p/9p.h>
19#include <net/9p/client.h>
20
21#include "fid.h"
22#include "xattr.h"
23
24ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
25 void *buffer, size_t buffer_size)
26{
27 ssize_t retval;
28 int msize, read_count;
29 u64 offset = 0, attr_size;
30 struct p9_fid *attr_fid;
31
32 attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
33 if (IS_ERR(attr_fid)) {
34 retval = PTR_ERR(attr_fid);
35 P9_DPRINTK(P9_DEBUG_VFS,
36 "p9_client_attrwalk failed %zd\n", retval);
37 attr_fid = NULL;
38 goto error;
39 }
40 if (!buffer_size) {
41 /* request to get the attr_size */
42 retval = attr_size;
43 goto error;
44 }
45 if (attr_size > buffer_size) {
46 retval = -ERANGE;
47 goto error;
48 }
49 msize = attr_fid->clnt->msize;
50 while (attr_size) {
51 if (attr_size > (msize - P9_IOHDRSZ))
52 read_count = msize - P9_IOHDRSZ;
53 else
54 read_count = attr_size;
55 read_count = p9_client_read(attr_fid, ((char *)buffer)+offset,
56 NULL, offset, read_count);
57 if (read_count < 0) {
58 /* error in xattr read */
59 retval = read_count;
60 goto error;
61 }
62 offset += read_count;
63 attr_size -= read_count;
64 }
65 /* Total read xattr bytes */
66 retval = offset;
67error:
68 if (attr_fid)
69 p9_client_clunk(attr_fid);
70 return retval;
71
72}
73
74
75/*
76 * v9fs_xattr_get()
77 *
78 * Copy an extended attribute into the buffer
79 * provided, or compute the buffer size required.
 80 * A NULL buffer requests only the required size.
81 *
82 * Returns a negative error number on failure, or the number of bytes
83 * used / required on success.
84 */
85ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
86 void *buffer, size_t buffer_size)
87{
88 struct p9_fid *fid;
89
90 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
91 __func__, name, buffer_size);
92 fid = v9fs_fid_lookup(dentry);
93 if (IS_ERR(fid))
94 return PTR_ERR(fid);
95
96 return v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
97}
98
99/*
100 * v9fs_xattr_set()
101 *
102 * Create, replace or remove an extended attribute for this inode. Buffer
103 * is NULL to remove an existing extended attribute, and non-NULL to
104 * either replace an existing extended attribute, or create a new extended
105 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
106 * specify that an extended attribute must exist and must not exist
 107 * prior to the call, respectively.
108 *
109 * Returns 0, or a negative error number on failure.
110 */
111int v9fs_xattr_set(struct dentry *dentry, const char *name,
112 const void *value, size_t value_len, int flags)
113{
114 u64 offset = 0;
115 int retval, msize, write_count;
116 struct p9_fid *fid = NULL;
117
118 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n",
119 __func__, name, value_len, flags);
120
121 fid = v9fs_fid_clone(dentry);
122 if (IS_ERR(fid)) {
123 retval = PTR_ERR(fid);
124 fid = NULL;
125 goto error;
126 }
127 /*
128 * On success fid points to xattr
129 */
130 retval = p9_client_xattrcreate(fid, name, value_len, flags);
131 if (retval < 0) {
132 P9_DPRINTK(P9_DEBUG_VFS,
133 "p9_client_xattrcreate failed %d\n", retval);
134 goto error;
135 }
 136 msize = fid->clnt->msize;
137 while (value_len) {
138 if (value_len > (msize - P9_IOHDRSZ))
139 write_count = msize - P9_IOHDRSZ;
140 else
141 write_count = value_len;
142 write_count = p9_client_write(fid, ((char *)value)+offset,
143 NULL, offset, write_count);
144 if (write_count < 0) {
145 /* error in xattr write */
146 retval = write_count;
147 goto error;
148 }
149 offset += write_count;
150 value_len -= write_count;
151 }
 152 /* Total written xattr bytes */
153 retval = offset;
154error:
155 if (fid)
156 retval = p9_client_clunk(fid);
157 return retval;
158}
159
160ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
161{
162 return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
163}
164
165const struct xattr_handler *v9fs_xattr_handlers[] = {
166 &v9fs_xattr_user_handler,
167#ifdef CONFIG_9P_FS_POSIX_ACL
168 &v9fs_xattr_acl_access_handler,
169 &v9fs_xattr_acl_default_handler,
170#endif
171 NULL
172};
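
Both loops above move attribute data in message-sized pieces, because a single 9P RPC payload cannot exceed the negotiated msize minus the I/O header. Stripped of the 9P specifics, the chunking logic looks like this sketch, where do_io() stands in for p9_client_read()/p9_client_write():

#include <stddef.h>

typedef long (*io_fn)(char *buf, size_t offset, size_t count);

static long chunked_io(char *buf, size_t total, size_t msize,
		       size_t iohdrsz, io_fn do_io)
{
	size_t offset = 0;
	size_t count;
	long n;

	while (total) {
		count = total;
		if (count > msize - iohdrsz)
			count = msize - iohdrsz;	/* cap at one message */
		n = do_io(buf + offset, offset, count);
		if (n < 0)
			return n;		/* propagate the I/O error */
		offset += n;
		total -= n;
	}
	return offset;			/* total bytes transferred */
}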
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
new file mode 100644
index 00000000000..eaa837c53bd
--- /dev/null
+++ b/fs/9p/xattr.h
@@ -0,0 +1,33 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14#ifndef FS_9P_XATTR_H
15#define FS_9P_XATTR_H
16
17#include <linux/xattr.h>
18#include <net/9p/9p.h>
19#include <net/9p/client.h>
20
21extern const struct xattr_handler *v9fs_xattr_handlers[];
22extern struct xattr_handler v9fs_xattr_user_handler;
23extern const struct xattr_handler v9fs_xattr_acl_access_handler;
24extern const struct xattr_handler v9fs_xattr_acl_default_handler;
25
26extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
27 void *, size_t);
28extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
29 void *, size_t);
30extern int v9fs_xattr_set(struct dentry *, const char *,
31 const void *, size_t, int);
32extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
33#endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
new file mode 100644
index 00000000000..d0b701b7208
--- /dev/null
+++ b/fs/9p/xattr_user.c
@@ -0,0 +1,80 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/slab.h>
20#include "xattr.h"
21
22static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
23 void *buffer, size_t size, int type)
24{
25 int retval;
26 char *full_name;
27 size_t name_len;
28 size_t prefix_len = XATTR_USER_PREFIX_LEN;
29
30 if (name == NULL)
31 return -EINVAL;
32
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35
36 name_len = strlen(name);
 37 full_name = kmalloc(prefix_len + name_len + 1, GFP_KERNEL);
38 if (!full_name)
39 return -ENOMEM;
40 memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
41 memcpy(full_name+prefix_len, name, name_len);
42 full_name[prefix_len + name_len] = '\0';
43
44 retval = v9fs_xattr_get(dentry, full_name, buffer, size);
45 kfree(full_name);
46 return retval;
47}
48
49static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
50 const void *value, size_t size, int flags, int type)
51{
52 int retval;
53 char *full_name;
54 size_t name_len;
55 size_t prefix_len = XATTR_USER_PREFIX_LEN;
56
57 if (name == NULL)
58 return -EINVAL;
59
60 if (strcmp(name, "") == 0)
61 return -EINVAL;
62
63 name_len = strlen(name);
 64 full_name = kmalloc(prefix_len + name_len + 1, GFP_KERNEL);
65 if (!full_name)
66 return -ENOMEM;
67 memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
68 memcpy(full_name + prefix_len, name, name_len);
69 full_name[prefix_len + name_len] = '\0';
70
71 retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
72 kfree(full_name);
73 return retval;
74}
75
76struct xattr_handler v9fs_xattr_user_handler = {
77 .prefix = XATTR_USER_PREFIX,
78 .get = v9fs_xattr_user_get,
79 .set = v9fs_xattr_user_set,
80};
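
The VFS strips the "user." prefix before invoking a handler's get/set methods, so both functions above must rebuild the full on-wire name before calling the generic v9fs helpers. The concatenation on its own, as a runnable user-space sketch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define XATTR_USER_PREFIX	"user."
#define XATTR_USER_PREFIX_LEN	(sizeof(XATTR_USER_PREFIX) - 1)

static char *rebuild_name(const char *name)
{
	size_t name_len = strlen(name);
	char *full = malloc(XATTR_USER_PREFIX_LEN + name_len + 1);

	if (!full)
		return NULL;
	memcpy(full, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
	memcpy(full + XATTR_USER_PREFIX_LEN, name, name_len + 1); /* copies the NUL */
	return full;
}

int main(void)
{
	char *full = rebuild_name("mime_type");	/* -> "user.mime_type" */

	printf("%s\n", full ? full : "(alloc failed)");
	free(full);
	return 0;
}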
diff --git a/fs/Kconfig b/fs/Kconfig
index 5f85b594761..771f457402d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig"
47 47
48endif # BLOCK 48endif # BLOCK
49 49
50config EXPORTFS
51 tristate
52
50config FILE_LOCKING 53config FILE_LOCKING
51 bool "Enable POSIX file locking API" if EMBEDDED 54 bool "Enable POSIX file locking API" if EMBEDDED
52 default y 55 default y
@@ -59,12 +62,11 @@ source "fs/notify/Kconfig"
59 62
60source "fs/quota/Kconfig" 63source "fs/quota/Kconfig"
61 64
62source "fs/autofs/Kconfig"
63source "fs/autofs4/Kconfig" 65source "fs/autofs4/Kconfig"
64source "fs/fuse/Kconfig" 66source "fs/fuse/Kconfig"
65 67
66config CUSE 68config CUSE
67 tristate "Character device in Userpace support" 69 tristate "Character device in Userspace support"
68 depends on FUSE_FS 70 depends on FUSE_FS
69 help 71 help
70 This FUSE extension allows character devices to be 72 This FUSE extension allows character devices to be
@@ -221,9 +223,6 @@ config LOCKD_V4
221 depends on FILE_LOCKING 223 depends on FILE_LOCKING
222 default y 224 default y
223 225
224config EXPORTFS
225 tristate
226
227config NFS_ACL_SUPPORT 226config NFS_ACL_SUPPORT
228 tristate 227 tristate
229 select FS_POSIX_ACL 228 select FS_POSIX_ACL
@@ -234,7 +233,6 @@ config NFS_COMMON
234 default y 233 default y
235 234
236source "net/sunrpc/Kconfig" 235source "net/sunrpc/Kconfig"
237source "fs/smbfs/Kconfig"
238source "fs/ceph/Kconfig" 236source "fs/ceph/Kconfig"
239source "fs/cifs/Kconfig" 237source "fs/cifs/Kconfig"
240source "fs/ncpfs/Kconfig" 238source "fs/ncpfs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bb4cc5b8abc..79e2ca7973b 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
42 42
43config CORE_DUMP_DEFAULT_ELF_HEADERS 43config CORE_DUMP_DEFAULT_ELF_HEADERS
44 bool "Write ELF core dumps with partial segments" 44 bool "Write ELF core dumps with partial segments"
45 default n 45 default y
46 depends on BINFMT_ELF && ELF_CORE 46 depends on BINFMT_ELF && ELF_CORE
47 help 47 help
48 ELF core dump files describe each memory mapping of the crashed 48 ELF core dump files describe each memory mapping of the crashed
@@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
60 inherited. See Documentation/filesystems/proc.txt for details. 60 inherited. See Documentation/filesystems/proc.txt for details.
61 61
62 This config option changes the default setting of coredump_filter 62 This config option changes the default setting of coredump_filter
63 seen at boot time. If unsure, say N. 63 seen at boot time. If unsure, say Y.
64 64
65config BINFMT_FLAT 65config BINFMT_FLAT
66 bool "Kernel support for flat binaries" 66 bool "Kernel support for flat binaries"
diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1..a7f7cef0c0c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
29obj-$(CONFIG_AIO) += aio.o 29obj-$(CONFIG_AIO) += aio.o
30obj-$(CONFIG_FILE_LOCKING) += locks.o 30obj-$(CONFIG_FILE_LOCKING) += locks.o
31obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o 31obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
32 32obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o
33nfsd-$(CONFIG_NFSD) := nfsctl.o
34obj-y += $(nfsd-y) $(nfsd-m)
35
36obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o 33obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
37obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o 34obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
38obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o 35obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
@@ -91,7 +88,6 @@ obj-$(CONFIG_NFSD) += nfsd/
91obj-$(CONFIG_LOCKD) += lockd/ 88obj-$(CONFIG_LOCKD) += lockd/
92obj-$(CONFIG_NLS) += nls/ 89obj-$(CONFIG_NLS) += nls/
93obj-$(CONFIG_SYSV_FS) += sysv/ 90obj-$(CONFIG_SYSV_FS) += sysv/
94obj-$(CONFIG_SMB_FS) += smbfs/
95obj-$(CONFIG_CIFS) += cifs/ 91obj-$(CONFIG_CIFS) += cifs/
96obj-$(CONFIG_NCP_FS) += ncpfs/ 92obj-$(CONFIG_NCP_FS) += ncpfs/
97obj-$(CONFIG_HPFS_FS) += hpfs/ 93obj-$(CONFIG_HPFS_FS) += hpfs/
@@ -104,7 +100,6 @@ obj-$(CONFIG_UBIFS_FS) += ubifs/
104obj-$(CONFIG_AFFS_FS) += affs/ 100obj-$(CONFIG_AFFS_FS) += affs/
105obj-$(CONFIG_ROMFS_FS) += romfs/ 101obj-$(CONFIG_ROMFS_FS) += romfs/
106obj-$(CONFIG_QNX4FS_FS) += qnx4/ 102obj-$(CONFIG_QNX4FS_FS) += qnx4/
107obj-$(CONFIG_AUTOFS_FS) += autofs/
108obj-$(CONFIG_AUTOFS4_FS) += autofs4/ 103obj-$(CONFIG_AUTOFS4_FS) += autofs4/
109obj-$(CONFIG_ADFS_FS) += adfs/ 104obj-$(CONFIG_ADFS_FS) += adfs/
110obj-$(CONFIG_FUSE_FS) += fuse/ 105obj-$(CONFIG_FUSE_FS) += fuse/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a7460..1dd5f34b3cf 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
4 help 5 help
5 The Acorn Disc Filing System is the standard file system of the 6 The Acorn Disc Filing System is the standard file system of the
6 RiscOS operating system which runs on Acorn's ARM-based Risc PC 7 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 6f850b06ab6..65794b8fe79 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -50,10 +50,19 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
50 loff_t pos, unsigned len, unsigned flags, 50 loff_t pos, unsigned len, unsigned flags,
51 struct page **pagep, void **fsdata) 51 struct page **pagep, void **fsdata)
52{ 52{
53 int ret;
54
53 *pagep = NULL; 55 *pagep = NULL;
54 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 56 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
55 adfs_get_block, 57 adfs_get_block,
56 &ADFS_I(mapping->host)->mmu_private); 58 &ADFS_I(mapping->host)->mmu_private);
59 if (unlikely(ret)) {
60 loff_t isize = mapping->host->i_size;
61 if (pos + len > isize)
62 vmtruncate(mapping->host, isize);
63 }
64
65 return ret;
57} 66}
58 67
59static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) 68static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
@@ -324,10 +333,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
324 333
325 /* XXX: this is missing some actual on-disk truncation.. */ 334 /* XXX: this is missing some actual on-disk truncation.. */
326 if (ia_valid & ATTR_SIZE) 335 if (ia_valid & ATTR_SIZE)
327 error = simple_setsize(inode, attr->ia_size); 336 truncate_setsize(inode, attr->ia_size);
328
329 if (error)
330 goto out;
331 337
332 if (ia_valid & ATTR_MTIME) { 338 if (ia_valid & ATTR_MTIME) {
333 inode->i_mtime = attr->ia_mtime; 339 inode->i_mtime = attr->ia_mtime;
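
The error path added here (and repeated in the matching affs hunk below) covers cont_write_begin() failing after it has already instantiated blocks beyond the old end of file; without the trim, those blocks would be left dangling past i_size. The pattern in isolation, as a sketch:

#include <linux/fs.h>

/* undo a failed write_begin that may have extended the file (sketch) */
static void write_begin_failed(struct address_space *mapping,
			       loff_t pos, unsigned int len)
{
	struct inode *inode = mapping->host;

	if (pos + len > inode->i_size)
		vmtruncate(inode, inode->i_size);
}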
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4a3af7075c1..959dbff2d42 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -352,11 +352,15 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
352 struct adfs_sb_info *asb; 352 struct adfs_sb_info *asb;
353 struct inode *root; 353 struct inode *root;
354 354
355 lock_kernel();
356
355 sb->s_flags |= MS_NODIRATIME; 357 sb->s_flags |= MS_NODIRATIME;
356 358
357 asb = kzalloc(sizeof(*asb), GFP_KERNEL); 359 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
358 if (!asb) 360 if (!asb) {
361 unlock_kernel();
359 return -ENOMEM; 362 return -ENOMEM;
363 }
360 sb->s_fs_info = asb; 364 sb->s_fs_info = asb;
361 365
362 /* set default options */ 366 /* set default options */
@@ -474,6 +478,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
474 goto error; 478 goto error;
475 } else 479 } else
476 sb->s_root->d_op = &adfs_dentry_operations; 480 sb->s_root->d_op = &adfs_dentry_operations;
481 unlock_kernel();
477 return 0; 482 return 0;
478 483
479error_free_bh: 484error_free_bh:
@@ -481,20 +486,20 @@ error_free_bh:
481error: 486error:
482 sb->s_fs_info = NULL; 487 sb->s_fs_info = NULL;
483 kfree(asb); 488 kfree(asb);
489 unlock_kernel();
484 return -EINVAL; 490 return -EINVAL;
485} 491}
486 492
487static int adfs_get_sb(struct file_system_type *fs_type, 493static struct dentry *adfs_mount(struct file_system_type *fs_type,
488 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 494 int flags, const char *dev_name, void *data)
489{ 495{
490 return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super, 496 return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
491 mnt);
492} 497}
493 498
494static struct file_system_type adfs_fs_type = { 499static struct file_system_type adfs_fs_type = {
495 .owner = THIS_MODULE, 500 .owner = THIS_MODULE,
496 .name = "adfs", 501 .name = "adfs",
497 .get_sb = adfs_get_sb, 502 .mount = adfs_mount,
498 .kill_sb = kill_block_super, 503 .kill_sb = kill_block_super,
499 .fs_flags = FS_REQUIRES_DEV, 504 .fs_flags = FS_REQUIRES_DEV,
500}; 505};
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index f05b6155ccc..a8cbdeb3402 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -171,8 +171,7 @@ extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry,
171extern unsigned long affs_parent_ino(struct inode *dir); 171extern unsigned long affs_parent_ino(struct inode *dir);
172extern struct inode *affs_new_inode(struct inode *dir); 172extern struct inode *affs_new_inode(struct inode *dir);
173extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); 173extern int affs_notify_change(struct dentry *dentry, struct iattr *attr);
174extern void affs_delete_inode(struct inode *inode); 174extern void affs_evict_inode(struct inode *inode);
175extern void affs_clear_inode(struct inode *inode);
176extern struct inode *affs_iget(struct super_block *sb, 175extern struct inode *affs_iget(struct super_block *sb,
177 unsigned long ino); 176 unsigned long ino);
178extern int affs_write_inode(struct inode *inode, 177extern int affs_write_inode(struct inode *inode,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 322710c3eed..0a90dcd46de 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,10 +406,19 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
406 loff_t pos, unsigned len, unsigned flags, 406 loff_t pos, unsigned len, unsigned flags,
407 struct page **pagep, void **fsdata) 407 struct page **pagep, void **fsdata)
408{ 408{
409 int ret;
410
409 *pagep = NULL; 411 *pagep = NULL;
410 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 412 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
411 affs_get_block, 413 affs_get_block,
412 &AFFS_I(mapping->host)->mmu_private); 414 &AFFS_I(mapping->host)->mmu_private);
415 if (unlikely(ret)) {
416 loff_t isize = mapping->host->i_size;
417 if (pos + len > isize)
418 vmtruncate(mapping->host, isize);
419 }
420
421 return ret;
413} 422}
414 423
415static sector_t _affs_bmap(struct address_space *mapping, sector_t block) 424static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
@@ -885,9 +894,9 @@ affs_truncate(struct inode *inode)
885 if (AFFS_SB(sb)->s_flags & SF_OFS) { 894 if (AFFS_SB(sb)->s_flags & SF_OFS) {
886 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); 895 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
887 u32 tmp; 896 u32 tmp;
888 if (IS_ERR(ext_bh)) { 897 if (IS_ERR(bh)) {
889 affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", 898 affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
890 ext, PTR_ERR(ext_bh)); 899 ext, PTR_ERR(bh));
891 return; 900 return;
892 } 901 }
893 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); 902 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index f4b2a4ee4f9..5d828903ac6 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -235,31 +235,36 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
235 goto out; 235 goto out;
236 } 236 }
237 237
238 error = inode_setattr(inode, attr); 238 if ((attr->ia_valid & ATTR_SIZE) &&
239 if (!error && (attr->ia_valid & ATTR_MODE)) 239 attr->ia_size != i_size_read(inode)) {
240 error = vmtruncate(inode, attr->ia_size);
241 if (error)
242 return error;
243 }
244
245 setattr_copy(inode, attr);
246 mark_inode_dirty(inode);
247
248 if (attr->ia_valid & ATTR_MODE)
240 mode_to_prot(inode); 249 mode_to_prot(inode);
241out: 250out:
242 return error; 251 return error;
243} 252}
244 253
245void 254void
246affs_delete_inode(struct inode *inode) 255affs_evict_inode(struct inode *inode)
247{
248 pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
249 truncate_inode_pages(&inode->i_data, 0);
250 inode->i_size = 0;
251 affs_truncate(inode);
252 clear_inode(inode);
253 affs_free_block(inode->i_sb, inode->i_ino);
254}
255
256void
257affs_clear_inode(struct inode *inode)
258{ 256{
259 unsigned long cache_page; 257 unsigned long cache_page;
258 pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
259 truncate_inode_pages(&inode->i_data, 0);
260 260
261 pr_debug("AFFS: clear_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); 261 if (!inode->i_nlink) {
262 inode->i_size = 0;
263 affs_truncate(inode);
264 }
262 265
266 invalidate_inode_buffers(inode);
267 end_writeback(inode);
263 affs_free_prealloc(inode); 268 affs_free_prealloc(inode);
264 cache_page = (unsigned long)AFFS_I(inode)->i_lc; 269 cache_page = (unsigned long)AFFS_I(inode)->i_lc;
265 if (cache_page) { 270 if (cache_page) {
@@ -271,6 +276,9 @@ affs_clear_inode(struct inode *inode)
271 affs_brelse(AFFS_I(inode)->i_ext_bh); 276 affs_brelse(AFFS_I(inode)->i_ext_bh);
272 AFFS_I(inode)->i_ext_last = ~1; 277 AFFS_I(inode)->i_ext_last = ~1;
273 AFFS_I(inode)->i_ext_bh = NULL; 278 AFFS_I(inode)->i_ext_bh = NULL;
279
280 if (!inode->i_nlink)
281 affs_free_block(inode->i_sb, inode->i_ino);
274} 282}
275 283
276struct inode * 284struct inode *
@@ -380,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
380 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); 388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
381 mark_buffer_dirty_inode(inode_bh, inode); 389 mark_buffer_dirty_inode(inode_bh, inode);
382 inode->i_nlink = 2; 390 inode->i_nlink = 2;
383 atomic_inc(&inode->i_count); 391 ihold(inode);
384 } 392 }
385 affs_fix_checksum(sb, bh); 393 affs_fix_checksum(sb, bh);
386 mark_buffer_dirty_inode(bh, inode); 394 mark_buffer_dirty_inode(bh, inode);
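
The merge above is the standard ->delete_inode/->clear_inode to ->evict_inode conversion: a single method now covers both final unlink (guarded by i_nlink == 0) and plain cache eviction, with end_writeback() taking over from the old clear_inode() call. Its general skeleton, sketched with fs_truncate() and fs_free_block() as stand-ins:

#include <linux/fs.h>
#include <linux/mm.h>

static void fs_truncate(struct inode *inode);		/* stand-in */
static void fs_free_block(struct super_block *sb, unsigned long block);

static void example_evict_inode(struct inode *inode)
{
	truncate_inode_pages(&inode->i_data, 0);

	if (!inode->i_nlink) {		/* the old delete_inode half */
		inode->i_size = 0;
		fs_truncate(inode);
	}

	invalidate_inode_buffers(inode);
	end_writeback(inode);		/* marks the inode clean */

	if (!inode->i_nlink)		/* on-disk release comes last */
		fs_free_block(inode->i_sb, inode->i_ino);
}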
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 16a3e4765f6..0cf7f4384cb 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,7 +16,6 @@
16#include <linux/parser.h> 16#include <linux/parser.h>
17#include <linux/magic.h> 17#include <linux/magic.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/smp_lock.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include "affs.h" 20#include "affs.h"
22 21
@@ -26,7 +25,7 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
26static int affs_remount (struct super_block *sb, int *flags, char *data); 25static int affs_remount (struct super_block *sb, int *flags, char *data);
27 26
28static void 27static void
29affs_commit_super(struct super_block *sb, int clean) 28affs_commit_super(struct super_block *sb, int wait, int clean)
30{ 29{
31 struct affs_sb_info *sbi = AFFS_SB(sb); 30 struct affs_sb_info *sbi = AFFS_SB(sb);
32 struct buffer_head *bh = sbi->s_root_bh; 31 struct buffer_head *bh = sbi->s_root_bh;
@@ -36,6 +35,8 @@ affs_commit_super(struct super_block *sb, int clean)
36 secs_to_datestamp(get_seconds(), &tail->disk_change); 35 secs_to_datestamp(get_seconds(), &tail->disk_change);
37 affs_fix_checksum(sb, bh); 36 affs_fix_checksum(sb, bh);
38 mark_buffer_dirty(bh); 37 mark_buffer_dirty(bh);
38 if (wait)
39 sync_dirty_buffer(bh);
39} 40}
40 41
41static void 42static void
@@ -44,44 +45,33 @@ affs_put_super(struct super_block *sb)
44 struct affs_sb_info *sbi = AFFS_SB(sb); 45 struct affs_sb_info *sbi = AFFS_SB(sb);
45 pr_debug("AFFS: put_super()\n"); 46 pr_debug("AFFS: put_super()\n");
46 47
47 lock_kernel(); 48 if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
48 49 affs_commit_super(sb, 1, 1);
49 if (!(sb->s_flags & MS_RDONLY))
50 affs_commit_super(sb, 1);
51 50
52 kfree(sbi->s_prefix); 51 kfree(sbi->s_prefix);
53 affs_free_bitmap(sb); 52 affs_free_bitmap(sb);
54 affs_brelse(sbi->s_root_bh); 53 affs_brelse(sbi->s_root_bh);
55 kfree(sbi); 54 kfree(sbi);
56 sb->s_fs_info = NULL; 55 sb->s_fs_info = NULL;
57
58 unlock_kernel();
59} 56}
60 57
61static void 58static void
62affs_write_super(struct super_block *sb) 59affs_write_super(struct super_block *sb)
63{ 60{
64 int clean = 2;
65
66 lock_super(sb); 61 lock_super(sb);
67 if (!(sb->s_flags & MS_RDONLY)) { 62 if (!(sb->s_flags & MS_RDONLY))
68 // if (sbi->s_bitmap[i].bm_bh) { 63 affs_commit_super(sb, 1, 2);
69 // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { 64 sb->s_dirt = 0;
70 // clean = 0;
71 affs_commit_super(sb, clean);
72 sb->s_dirt = !clean; /* redo until bitmap synced */
73 } else
74 sb->s_dirt = 0;
75 unlock_super(sb); 65 unlock_super(sb);
76 66
77 pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); 67 pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds());
78} 68}
79 69
80static int 70static int
81affs_sync_fs(struct super_block *sb, int wait) 71affs_sync_fs(struct super_block *sb, int wait)
82{ 72{
83 lock_super(sb); 73 lock_super(sb);
84 affs_commit_super(sb, 2); 74 affs_commit_super(sb, wait, 2);
85 sb->s_dirt = 0; 75 sb->s_dirt = 0;
86 unlock_super(sb); 76 unlock_super(sb);
87 return 0; 77 return 0;
@@ -114,8 +104,8 @@ static void init_once(void *foo)
114{ 104{
115 struct affs_inode_info *ei = (struct affs_inode_info *) foo; 105 struct affs_inode_info *ei = (struct affs_inode_info *) foo;
116 106
117 init_MUTEX(&ei->i_link_lock); 107 sema_init(&ei->i_link_lock, 1);
118 init_MUTEX(&ei->i_ext_lock); 108 sema_init(&ei->i_ext_lock, 1);
119 inode_init_once(&ei->vfs_inode); 109 inode_init_once(&ei->vfs_inode);
120} 110}
121 111
@@ -140,8 +130,7 @@ static const struct super_operations affs_sops = {
140 .alloc_inode = affs_alloc_inode, 130 .alloc_inode = affs_alloc_inode,
141 .destroy_inode = affs_destroy_inode, 131 .destroy_inode = affs_destroy_inode,
142 .write_inode = affs_write_inode, 132 .write_inode = affs_write_inode,
143 .delete_inode = affs_delete_inode, 133 .evict_inode = affs_evict_inode,
144 .clear_inode = affs_clear_inode,
145 .put_super = affs_put_super, 134 .put_super = affs_put_super,
146 .write_super = affs_write_super, 135 .write_super = affs_write_super,
147 .sync_fs = affs_sync_fs, 136 .sync_fs = affs_sync_fs,
@@ -308,6 +297,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
308 sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); 297 sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
309 if (!sbi) 298 if (!sbi)
310 return -ENOMEM; 299 return -ENOMEM;
300
311 sb->s_fs_info = sbi; 301 sb->s_fs_info = sbi;
312 mutex_init(&sbi->s_bmlock); 302 mutex_init(&sbi->s_bmlock);
313 spin_lock_init(&sbi->symlink_lock); 303 spin_lock_init(&sbi->symlink_lock);
@@ -533,7 +523,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
533 kfree(new_opts); 523 kfree(new_opts);
534 return -EINVAL; 524 return -EINVAL;
535 } 525 }
536 lock_kernel(); 526
537 replace_mount_options(sb, new_opts); 527 replace_mount_options(sb, new_opts);
538 528
539 sbi->s_flags = mount_flags; 529 sbi->s_flags = mount_flags;
@@ -549,19 +539,15 @@ affs_remount(struct super_block *sb, int *flags, char *data)
549 memcpy(sbi->s_volume, volume, 32); 539 memcpy(sbi->s_volume, volume, 32);
550 spin_unlock(&sbi->symlink_lock); 540 spin_unlock(&sbi->symlink_lock);
551 541
552 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 542 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
553 unlock_kernel();
554 return 0; 543 return 0;
555 } 544
556 if (*flags & MS_RDONLY) { 545 if (*flags & MS_RDONLY) {
557 sb->s_dirt = 1; 546 affs_write_super(sb);
558 while (sb->s_dirt)
559 affs_write_super(sb);
560 affs_free_bitmap(sb); 547 affs_free_bitmap(sb);
561 } else 548 } else
562 res = affs_init_bitmap(sb, flags); 549 res = affs_init_bitmap(sb, flags);
563 550
564 unlock_kernel();
565 return res; 551 return res;
566} 552}
567 553
@@ -587,17 +573,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
587 return 0; 573 return 0;
588} 574}
589 575
590static int affs_get_sb(struct file_system_type *fs_type, 576static struct dentry *affs_mount(struct file_system_type *fs_type,
591 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 577 int flags, const char *dev_name, void *data)
592{ 578{
593 return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super, 579 return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
594 mnt);
595} 580}
596 581
597static struct file_system_type affs_fs_type = { 582static struct file_system_type affs_fs_type = {
598 .owner = THIS_MODULE, 583 .owner = THIS_MODULE,
599 .name = "affs", 584 .name = "affs",
600 .get_sb = affs_get_sb, 585 .mount = affs_mount,
601 .kill_sb = kill_block_super, 586 .kill_sb = kill_block_super,
602 .fs_flags = FS_REQUIRES_DEV, 587 .fs_flags = FS_REQUIRES_DEV,
603}; 588};
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 5c4e61d3c77..8f975f25b48 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -2,6 +2,7 @@ config AFS_FS
2 tristate "Andrew File System support (AFS) (EXPERIMENTAL)" 2 tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select AF_RXRPC 4 select AF_RXRPC
5 select DNS_RESOLVER
5 help 6 help
6 If you say Y here, you will get an experimental Andrew File System 7 If you say Y here, you will get an experimental Andrew File System
7 driver. It currently only supports unsecured read-only AFS access. 8 driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index e19c13f059e..0d5eeadf612 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -13,6 +13,7 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/key.h> 14#include <linux/key.h>
15#include <linux/ctype.h> 15#include <linux/ctype.h>
16#include <linux/dns_resolver.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <keys/rxrpc-type.h> 18#include <keys/rxrpc-type.h>
18#include "internal.h" 19#include "internal.h"
@@ -30,21 +31,24 @@ static struct afs_cell *afs_cell_root;
30 * allocate a cell record and fill in its name, VL server address list and 31 * allocate a cell record and fill in its name, VL server address list and
31 * allocate an anonymous key 32 * allocate an anonymous key
32 */ 33 */
33static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) 34static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
35 char *vllist)
34{ 36{
35 struct afs_cell *cell; 37 struct afs_cell *cell;
36 struct key *key; 38 struct key *key;
37 size_t namelen;
38 char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; 39 char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
40 char *dvllist = NULL, *_vllist = NULL;
41 char delimiter = ':';
39 int ret; 42 int ret;
40 43
41 _enter("%s,%s", name, vllist); 44 _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist);
42 45
43 BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ 46 BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
44 47
45 namelen = strlen(name); 48 if (namelen > AFS_MAXCELLNAME) {
46 if (namelen > AFS_MAXCELLNAME) 49 _leave(" = -ENAMETOOLONG");
47 return ERR_PTR(-ENAMETOOLONG); 50 return ERR_PTR(-ENAMETOOLONG);
51 }
48 52
49 /* allocate and initialise a cell record */ 53 /* allocate and initialise a cell record */
50 cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); 54 cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
@@ -64,15 +68,35 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
64 INIT_LIST_HEAD(&cell->vl_list); 68 INIT_LIST_HEAD(&cell->vl_list);
65 spin_lock_init(&cell->vl_lock); 69 spin_lock_init(&cell->vl_lock);
66 70
 71 /* if the IP address is invalid, try a DNS query */
72 if (!vllist || strlen(vllist) < 7) {
73 ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
74 if (ret < 0) {
75 if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY)
76 /* translate these errors into something
77 * userspace might understand */
78 ret = -EDESTADDRREQ;
79 _leave(" = %d", ret);
80 return ERR_PTR(ret);
81 }
82 _vllist = dvllist;
83
84 /* change the delimiter for user-space reply */
85 delimiter = ',';
86
87 } else {
88 _vllist = vllist;
89 }
90
67 /* fill in the VL server list from the rest of the string */ 91 /* fill in the VL server list from the rest of the string */
68 do { 92 do {
69 unsigned a, b, c, d; 93 unsigned a, b, c, d;
70 94
71 next = strchr(vllist, ':'); 95 next = strchr(_vllist, delimiter);
72 if (next) 96 if (next)
73 *next++ = 0; 97 *next++ = 0;
74 98
75 if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) 99 if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
76 goto bad_address; 100 goto bad_address;
77 101
78 if (a > 255 || b > 255 || c > 255 || d > 255) 102 if (a > 255 || b > 255 || c > 255 || d > 255)
@@ -81,7 +105,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
81 cell->vl_addrs[cell->vl_naddrs++].s_addr = 105 cell->vl_addrs[cell->vl_naddrs++].s_addr =
82 htonl((a << 24) | (b << 16) | (c << 8) | d); 106 htonl((a << 24) | (b << 16) | (c << 8) | d);
83 107
84 } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next)); 108 } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next));
85 109
86 /* create a key to represent an anonymous user */ 110 /* create a key to represent an anonymous user */
87 memcpy(keyname, "afs@", 4); 111 memcpy(keyname, "afs@", 4);
@@ -110,32 +134,36 @@ bad_address:
110 ret = -EINVAL; 134 ret = -EINVAL;
111error: 135error:
112 key_put(cell->anonymous_key); 136 key_put(cell->anonymous_key);
137 kfree(dvllist);
113 kfree(cell); 138 kfree(cell);
114 _leave(" = %d", ret); 139 _leave(" = %d", ret);
115 return ERR_PTR(ret); 140 return ERR_PTR(ret);
116} 141}
117 142
118/* 143/*
119 * create a cell record 144 * afs_cell_create() - create a cell record
120 * - "name" is the name of the cell 145 * @name: is the name of the cell.
121 * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format 146 * @namsesz: is the strlen of the cell name.
147 * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format.
148 * @retref: is T to return the cell reference when the cell exists.
122 */ 149 */
123struct afs_cell *afs_cell_create(const char *name, char *vllist) 150struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
151 char *vllist, bool retref)
124{ 152{
125 struct afs_cell *cell; 153 struct afs_cell *cell;
126 int ret; 154 int ret;
127 155
128 _enter("%s,%s", name, vllist); 156 _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
129 157
130 down_write(&afs_cells_sem); 158 down_write(&afs_cells_sem);
131 read_lock(&afs_cells_lock); 159 read_lock(&afs_cells_lock);
132 list_for_each_entry(cell, &afs_cells, link) { 160 list_for_each_entry(cell, &afs_cells, link) {
133 if (strcasecmp(cell->name, name) == 0) 161 if (strncasecmp(cell->name, name, namesz) == 0)
134 goto duplicate_name; 162 goto duplicate_name;
135 } 163 }
136 read_unlock(&afs_cells_lock); 164 read_unlock(&afs_cells_lock);
137 165
138 cell = afs_cell_alloc(name, vllist); 166 cell = afs_cell_alloc(name, namesz, vllist);
139 if (IS_ERR(cell)) { 167 if (IS_ERR(cell)) {
140 _leave(" = %ld", PTR_ERR(cell)); 168 _leave(" = %ld", PTR_ERR(cell));
141 up_write(&afs_cells_sem); 169 up_write(&afs_cells_sem);
@@ -175,8 +203,18 @@ error:
175 return ERR_PTR(ret); 203 return ERR_PTR(ret);
176 204
177duplicate_name: 205duplicate_name:
206 if (retref && !IS_ERR(cell))
207 afs_get_cell(cell);
208
178 read_unlock(&afs_cells_lock); 209 read_unlock(&afs_cells_lock);
179 up_write(&afs_cells_sem); 210 up_write(&afs_cells_sem);
211
212 if (retref) {
213 _leave(" = %p", cell);
214 return cell;
215 }
216
217 _leave(" = -EEXIST");
180 return ERR_PTR(-EEXIST); 218 return ERR_PTR(-EEXIST);
181} 219}
182 220
@@ -201,15 +239,13 @@ int afs_cell_init(char *rootcell)
201 } 239 }
202 240
203 cp = strchr(rootcell, ':'); 241 cp = strchr(rootcell, ':');
204 if (!cp) { 242 if (!cp)
205 printk(KERN_ERR "kAFS: no VL server IP addresses specified\n"); 243 _debug("kAFS: no VL server IP addresses specified");
206 _leave(" = -EINVAL"); 244 else
207 return -EINVAL; 245 *cp++ = 0;
208 }
209 246
210 /* allocate a cell record for the root cell */ 247 /* allocate a cell record for the root cell */
211 *cp++ = 0; 248 new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
212 new_root = afs_cell_create(rootcell, cp);
213 if (IS_ERR(new_root)) { 249 if (IS_ERR(new_root)) {
214 _leave(" = %ld", PTR_ERR(new_root)); 250 _leave(" = %ld", PTR_ERR(new_root));
215 return PTR_ERR(new_root); 251 return PTR_ERR(new_root);
@@ -229,11 +265,12 @@ int afs_cell_init(char *rootcell)
229/* 265/*
230 * lookup a cell record 266 * lookup a cell record
231 */ 267 */
232struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz) 268struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
269 bool dns_cell)
233{ 270{
234 struct afs_cell *cell; 271 struct afs_cell *cell;
235 272
236 _enter("\"%*.*s\",", namesz, namesz, name ? name : ""); 273 _enter("\"%*.*s\",", namesz, namesz, name ?: "");
237 274
238 down_read(&afs_cells_sem); 275 down_read(&afs_cells_sem);
239 read_lock(&afs_cells_lock); 276 read_lock(&afs_cells_lock);
@@ -247,6 +284,8 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
247 } 284 }
248 } 285 }
249 cell = ERR_PTR(-ENOENT); 286 cell = ERR_PTR(-ENOENT);
287 if (dns_cell)
288 goto create_cell;
250 found: 289 found:
251 ; 290 ;
252 } else { 291 } else {
@@ -269,6 +308,15 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
269 up_read(&afs_cells_sem); 308 up_read(&afs_cells_sem);
270 _leave(" = %p", cell); 309 _leave(" = %p", cell);
271 return cell; 310 return cell;
311
312create_cell:
313 read_unlock(&afs_cells_lock);
314 up_read(&afs_cells_sem);
315
316 cell = afs_cell_create(name, namesz, NULL, true);
317
318 _leave(" = %p", cell);
319 return cell;
272} 320}
273 321
274#if 0 322#if 0
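
Because the DNS resolver joins addresses with ',' while the mount option string uses ':', the parser above now takes the delimiter as a variable. The parsing loop on its own, extracted into a runnable user-space sketch (MAX_ADDRS stands in for AFS_CELL_MAX_ADDRS):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define MAX_ADDRS 15

static int parse_vllist(char *vllist, char delimiter,
			struct in_addr *addrs, int max)
{
	int n = 0;
	char *next;

	do {
		unsigned a, b, c, d;

		next = strchr(vllist, delimiter);
		if (next)
			*next++ = 0;
		if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4 ||
		    a > 255 || b > 255 || c > 255 || d > 255)
			return -1;		/* bad address */
		addrs[n++].s_addr = htonl((a << 24) | (b << 16) | (c << 8) | d);
	} while (n < max && (vllist = next));

	return n;
}

int main(void)
{
	struct in_addr addrs[MAX_ADDRS];
	char list[] = "10.0.0.1,10.0.0.2";	/* DNS-style reply */

	printf("parsed %d addresses\n",
	       parse_vllist(list, ',', addrs, MAX_ADDRS));
	return 0;
}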
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b42d5cc1d6d..5439e1bc9a8 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -477,6 +477,40 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
477} 477}
478 478
479/* 479/*
 480 * Try to auto-mount the mountpoint with a pseudo directory, if the autocell
 481 * option is set.
482 */
483static struct inode *afs_try_auto_mntpt(
484 int ret, struct dentry *dentry, struct inode *dir, struct key *key,
485 struct afs_fid *fid)
486{
487 const char *devname = dentry->d_name.name;
488 struct afs_vnode *vnode = AFS_FS_I(dir);
489 struct inode *inode;
490
491 _enter("%d, %p{%s}, {%x:%u}, %p",
492 ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key);
493
494 if (ret != -ENOENT ||
495 !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
496 goto out;
497
498 inode = afs_iget_autocell(dir, devname, strlen(devname), key);
499 if (IS_ERR(inode)) {
500 ret = PTR_ERR(inode);
501 goto out;
502 }
503
504 *fid = AFS_FS_I(inode)->fid;
 505 _leave(" = %p", inode);
506 return inode;
507
508out:
 509 _leave(" = %d", ret);
510 return ERR_PTR(ret);
511}
512
513/*
480 * look up an entry in a directory 514 * look up an entry in a directory
481 */ 515 */
482static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, 516static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -520,6 +554,13 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
520 554
521 ret = afs_do_lookup(dir, dentry, &fid, key); 555 ret = afs_do_lookup(dir, dentry, &fid, key);
522 if (ret < 0) { 556 if (ret < 0) {
557 inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid);
558 if (!IS_ERR(inode)) {
559 key_put(key);
560 goto success;
561 }
562
563 ret = PTR_ERR(inode);
523 key_put(key); 564 key_put(key);
524 if (ret == -ENOENT) { 565 if (ret == -ENOENT) {
525 d_add(dentry, NULL); 566 d_add(dentry, NULL);
@@ -539,6 +580,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
539 return ERR_CAST(inode); 580 return ERR_CAST(inode);
540 } 581 }
541 582
583success:
542 dentry->d_op = &afs_fs_dentry_operations; 584 dentry->d_op = &afs_fs_dentry_operations;
543 585
544 d_add(dentry, inode); 586 d_add(dentry, inode);
@@ -696,8 +738,9 @@ static int afs_d_delete(struct dentry *dentry)
696 goto zap; 738 goto zap;
697 739
698 if (dentry->d_inode && 740 if (dentry->d_inode &&
699 test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags)) 741 (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) ||
700 goto zap; 742 test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags)))
743 goto zap;
701 744
702 _leave(" = 0 [keep]"); 745 _leave(" = 0 [keep]");
703 return 0; 746 return 0;
@@ -1002,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
1002 if (ret < 0) 1045 if (ret < 0)
1003 goto link_error; 1046 goto link_error;
1004 1047
1005 atomic_inc(&vnode->vfs_inode.i_count); 1048 ihold(&vnode->vfs_inode);
1006 d_instantiate(dentry, &vnode->vfs_inode); 1049 d_instantiate(dentry, &vnode->vfs_inode);
1007 key_put(key); 1050 key_put(key);
1008 _leave(" = 0"); 1051 _leave(" = 0");
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 0931bc1325e..757d664575d 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/smp_lock.h>
13#include "internal.h" 12#include "internal.h"
14 13
15#define AFS_LOCK_GRANTED 0 14#define AFS_LOCK_GRANTED 0
@@ -274,7 +273,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
274 273
275 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; 274 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
276 275
277 lock_kernel(); 276 lock_flocks();
278 277
279 /* make sure we've got a callback on this file and that our view of the 278 /* make sure we've got a callback on this file and that our view of the
280 * data version is up to date */ 279 * data version is up to date */
@@ -421,7 +420,7 @@ given_lock:
421 afs_vnode_fetch_status(vnode, NULL, key); 420 afs_vnode_fetch_status(vnode, NULL, key);
422 421
423error: 422error:
424 unlock_kernel(); 423 unlock_flocks();
425 _leave(" = %d", ret); 424 _leave(" = %d", ret);
426 return ret; 425 return ret;
427 426
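
Swapping lock_kernel() for lock_flocks() is part of the Big Kernel Lock removal: the per-inode file-lock lists get a dedicated lock of their own instead of piggy-backing on the BKL. Usage shape, as a sketch:

#include <linux/fs.h>

static void walk_locks(struct inode *inode)
{
	struct file_lock *fl;

	lock_flocks();			/* was: lock_kernel() */
	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
		/* inspect lock state while the list is stable */
	}
	unlock_flocks();
}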
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d00b312e311..0747339011c 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/mount.h>
23#include <linux/namei.h>
22#include "internal.h" 24#include "internal.h"
23 25
24struct afs_iget_data { 26struct afs_iget_data {
@@ -102,6 +104,16 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
102} 104}
103 105
104/* 106/*
107 * iget5() comparator for inode created by autocell operations
108 *
109 * These pseudo inodes don't match anything.
110 */
111static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
112{
113 return 0;
114}
115
116/*
105 * iget5() inode initialiser 117 * iget5() inode initialiser
106 */ 118 */
107static int afs_iget5_set(struct inode *inode, void *opaque) 119static int afs_iget5_set(struct inode *inode, void *opaque)
@@ -118,6 +130,67 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
118} 130}
119 131
120/* 132/*
133 * inode retrieval for autocell
134 */
135struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
136 int namesz, struct key *key)
137{
138 struct afs_iget_data data;
139 struct afs_super_info *as;
140 struct afs_vnode *vnode;
141 struct super_block *sb;
142 struct inode *inode;
143 static atomic_t afs_autocell_ino;
144
145 _enter("{%x:%u},%*.*s,",
146 AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode,
147 namesz, namesz, dev_name ?: "");
148
149 sb = dir->i_sb;
150 as = sb->s_fs_info;
151 data.volume = as->volume;
152 data.fid.vid = as->volume->vid;
153 data.fid.unique = 0;
154 data.fid.vnode = 0;
155
156 inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
157 afs_iget5_autocell_test, afs_iget5_set,
158 &data);
159 if (!inode) {
160 _leave(" = -ENOMEM");
161 return ERR_PTR(-ENOMEM);
162 }
163
164 _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
165 inode, inode->i_ino, data.fid.vid, data.fid.vnode,
166 data.fid.unique);
167
168 vnode = AFS_FS_I(inode);
169
170 /* there shouldn't be an existing inode */
171 BUG_ON(!(inode->i_state & I_NEW));
172
173 inode->i_size = 0;
174 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
175 inode->i_op = &afs_autocell_inode_operations;
176 inode->i_nlink = 2;
177 inode->i_uid = 0;
178 inode->i_gid = 0;
179 inode->i_ctime.tv_sec = get_seconds();
180 inode->i_ctime.tv_nsec = 0;
181 inode->i_atime = inode->i_mtime = inode->i_ctime;
182 inode->i_blocks = 0;
183 inode->i_version = 0;
184 inode->i_generation = 0;
185
186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
187 inode->i_flags |= S_NOATIME;
188 unlock_new_inode(inode);
189 _leave(" = %p", inode);
190 return inode;
191}
192
193/*
121 * inode retrieval 194 * inode retrieval
122 */ 195 */
123struct inode *afs_iget(struct super_block *sb, struct key *key, 196struct inode *afs_iget(struct super_block *sb, struct key *key,
@@ -314,9 +387,22 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
314} 387}
315 388
316/* 389/*
390 * discard an AFS inode
391 */
392int afs_drop_inode(struct inode *inode)
393{
394 _enter("");
395
396 if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
397 return generic_delete_inode(inode);
398 else
399 return generic_drop_inode(inode);
400}
401
402/*
317 * clear an AFS inode 403 * clear an AFS inode
318 */ 404 */
319void afs_clear_inode(struct inode *inode) 405void afs_evict_inode(struct inode *inode)
320{ 406{
321 struct afs_permits *permits; 407 struct afs_permits *permits;
322 struct afs_vnode *vnode; 408 struct afs_vnode *vnode;
@@ -335,6 +421,9 @@ void afs_clear_inode(struct inode *inode)
335 421
336 ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); 422 ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
337 423
424 truncate_inode_pages(&inode->i_data, 0);
425 end_writeback(inode);
426
338 afs_give_up_callback(vnode); 427 afs_give_up_callback(vnode);
339 428
340 if (vnode->server) { 429 if (vnode->server) {
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5f679b77ce2..cca8eef736f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -42,6 +42,7 @@ typedef enum {
 struct afs_mount_params {
 	bool rwpath;		/* T if the parent should be considered R/W */
 	bool force;		/* T to force cell type */
+	bool autocell;		/* T if set auto mount operation */
 	afs_voltype_t type;	/* type of volume requested */
 	int volnamesz;		/* size of volume name */
 	const char *volname;	/* name of volume to mount */
@@ -358,6 +359,8 @@ struct afs_vnode {
 #define AFS_VNODE_READLOCKED	7	/* set if vnode is read-locked on the server */
 #define AFS_VNODE_WRITELOCKED	8	/* set if vnode is write-locked on the server */
 #define AFS_VNODE_UNLOCKING	9	/* set if vnode is being unlocked on the server */
+#define AFS_VNODE_AUTOCELL	10	/* set if Vnode is an auto mount point */
+#define AFS_VNODE_PSEUDODIR	11	/* set if Vnode is a pseudo directory */
 
 	long acl_order;		/* ACL check count (callback break count) */
 
@@ -468,8 +471,8 @@ extern struct list_head afs_proc_cells;
 
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
 extern int afs_cell_init(char *);
-extern struct afs_cell *afs_cell_create(const char *, char *);
-extern struct afs_cell *afs_cell_lookup(const char *, unsigned);
+extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
+extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
 extern struct afs_cell *afs_grab_cell(struct afs_cell *);
 extern void afs_put_cell(struct afs_cell *);
 extern void afs_cell_purge(void);
@@ -558,6 +561,8 @@ extern int afs_fs_release_lock(struct afs_server *, struct key *,
 /*
  * inode.c
  */
+extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
+				       struct key *);
 extern struct inode *afs_iget(struct super_block *, struct key *,
 			      struct afs_fid *, struct afs_file_status *,
 			      struct afs_callback *);
@@ -565,7 +570,8 @@ extern void afs_zap_data(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
 extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int afs_setattr(struct dentry *, struct iattr *);
-extern void afs_clear_inode(struct inode *);
+extern void afs_evict_inode(struct inode *);
+extern int afs_drop_inode(struct inode *);
 
 /*
  * main.c
@@ -581,6 +587,7 @@ extern int afs_abort_to_error(u32);
  * mntpt.c
  */
 extern const struct inode_operations afs_mntpt_inode_operations;
+extern const struct inode_operations afs_autocell_inode_operations;
 extern const struct file_operations afs_mntpt_file_operations;
 
 extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
@@ -752,12 +759,6 @@ extern unsigned afs_debug;
 #define dbgprintk(FMT,...) \
 	printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf,1,2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
 #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
 #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
@@ -792,9 +793,9 @@ do { \
 } while (0)
 
 #else
-#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
-#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
-#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__)
+#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__)
 #endif
 
 /*
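
The last hunk above swaps the driver-local _dbprintk() stub for the kernel's no_printk(). The point of both is that the format string and its arguments are still compiled and type-checked even when debugging is configured out. A minimal sketch of the pattern (the MYFS_DEBUG/my_debug names are hypothetical, not from this patch):

#include <linux/kernel.h>

#ifdef MYFS_DEBUG
#define my_debug(FMT, ...) printk(KERN_DEBUG "myfs: " FMT "\n", ##__VA_ARGS__)
#else
/* no_printk() emits nothing at runtime but keeps printf-format checking */
#define my_debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
#endif
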
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 66d54d348c5..cfd1cbe25b2 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -111,6 +111,8 @@ static int __init afs_init(void)
 
 	/* initialise the callback update process */
 	ret = afs_callback_update_init();
+	if (ret < 0)
+		goto error_callback_update_init;
 
 	/* create the RxRPC transport */
 	ret = afs_open_socket();
@@ -127,15 +129,16 @@ static int __init afs_init(void)
 error_fs:
 	afs_close_socket();
 error_open_socket:
+	afs_callback_update_kill();
+error_callback_update_init:
+	afs_vlocation_purge();
 error_vl_update_init:
+	afs_cell_purge();
 error_cell_init:
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
-	afs_callback_update_kill();
-	afs_vlocation_purge();
-	afs_cell_purge();
 	afs_proc_cleanup();
 	rcu_barrier();
 	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
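
The afs_init() change above also illustrates the kernel's usual error-unwinding idiom, which the relocated labels restore: each setup step gets a label, and a failure jumps to the label that tears down only what was already initialised, in reverse order. A hedged sketch with hypothetical setup_a()/setup_b()/teardown_a() helpers:

static int __init myfs_init(void)
{
	int ret;

	ret = setup_a();	/* hypothetical subsystem A */
	if (ret < 0)
		goto error_a;

	ret = setup_b();	/* hypothetical subsystem B */
	if (ret < 0)
		goto error_b;

	return 0;

error_b:
	teardown_a();		/* undo A, the only thing set up so far */
error_a:
	return ret;
}
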
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index a9e23039ea3..6153417caf5 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -29,6 +29,7 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 
 const struct file_operations afs_mntpt_file_operations = {
 	.open		= afs_mntpt_open,
+	.llseek		= noop_llseek,
 };
 
 const struct inode_operations afs_mntpt_inode_operations = {
@@ -38,6 +39,11 @@ const struct inode_operations afs_mntpt_inode_operations = {
 	.getattr	= afs_getattr,
 };
 
+const struct inode_operations afs_autocell_inode_operations = {
+	.follow_link	= afs_mntpt_follow_link,
+	.getattr	= afs_getattr,
+};
+
 static LIST_HEAD(afs_vfsmounts);
 static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
 
@@ -136,20 +142,16 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
 	struct afs_super_info *super;
 	struct vfsmount *mnt;
+	struct afs_vnode *vnode;
 	struct page *page;
-	size_t size;
-	char *buf, *devname, *options;
+	char *devname, *options;
+	bool rwpath = false;
 	int ret;
 
 	_enter("{%s}", mntpt->d_name.name);
 
 	BUG_ON(!mntpt->d_inode);
 
-	ret = -EINVAL;
-	size = mntpt->d_inode->i_size;
-	if (size > PAGE_SIZE - 1)
-		goto error_no_devname;
-
 	ret = -ENOMEM;
 	devname = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!devname)
@@ -159,28 +161,59 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	if (!options)
 		goto error_no_options;
 
-	/* read the contents of the AFS special symlink */
-	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
-	if (IS_ERR(page)) {
-		ret = PTR_ERR(page);
-		goto error_no_page;
+	vnode = AFS_FS_I(mntpt->d_inode);
+	if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
+		/* if the directory is a pseudo directory, use the d_name */
+		static const char afs_root_cell[] = ":root.cell.";
+		unsigned size = mntpt->d_name.len;
+
+		ret = -ENOENT;
+		if (size < 2 || size > AFS_MAXCELLNAME)
+			goto error_no_page;
+
+		if (mntpt->d_name.name[0] == '.') {
+			devname[0] = '#';
+			memcpy(devname + 1, mntpt->d_name.name, size - 1);
+			memcpy(devname + size, afs_root_cell,
+			       sizeof(afs_root_cell));
+			rwpath = true;
+		} else {
+			devname[0] = '%';
+			memcpy(devname + 1, mntpt->d_name.name, size);
+			memcpy(devname + size + 1, afs_root_cell,
+			       sizeof(afs_root_cell));
+		}
+	} else {
+		/* read the contents of the AFS special symlink */
+		loff_t size = i_size_read(mntpt->d_inode);
+		char *buf;
+
+		ret = -EINVAL;
+		if (size > PAGE_SIZE - 1)
+			goto error_no_page;
+
+		page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			goto error_no_page;
+		}
+
+		ret = -EIO;
+		if (PageError(page))
+			goto error;
+
+		buf = kmap_atomic(page, KM_USER0);
+		memcpy(devname, buf, size);
+		kunmap_atomic(buf, KM_USER0);
+		page_cache_release(page);
+		page = NULL;
 	}
 
-	ret = -EIO;
-	if (PageError(page))
-		goto error;
-
-	buf = kmap_atomic(page, KM_USER0);
-	memcpy(devname, buf, size);
-	kunmap_atomic(buf, KM_USER0);
-	page_cache_release(page);
-	page = NULL;
-
 	/* work out what options we want */
 	super = AFS_FS_S(mntpt->d_sb);
 	memcpy(options, "cell=", 5);
 	strcpy(options + 5, super->volume->cell->name);
-	if (super->volume->type == AFSVL_RWVOL)
+	if (super->volume->type == AFSVL_RWVOL || rwpath)
 		strcat(options, ",rwpath");
 
 	/* try and do the mount */
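
For the pseudo-directory case, the hunk above synthesizes the device name from the dentry name rather than reading a symlink body: a name with a leading '.' produces a "#cell:root.cell." (read/write path) device string, anything else a "%cell:root.cell." (read-only) one. An illustrative helper, not from the patch, that produces the same shape of string with snprintf() instead of open-coded memcpy() arithmetic (afs_build_devname is hypothetical):

static int afs_build_devname(char *buf, size_t buflen,
			     const char *cell, bool rwpath)
{
	/* e.g. "#grand.central.org:root.cell." or
	 *      "%grand.central.org:root.cell." */
	return snprintf(buf, buflen, "%c%s:root.cell.",
			rwpath ? '#' : '%', cell);
}
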
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 852739d262a..096b23f821a 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -294,7 +294,7 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 	if (strcmp(kbuf, "add") == 0) {
 		struct afs_cell *cell;
 
-		cell = afs_cell_create(name, args);
+		cell = afs_cell_create(name, strlen(name), args, false);
 		if (IS_ERR(cell)) {
 			ret = PTR_ERR(cell);
 			goto done;
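
Note that afs_cell_create() now takes an explicit name length, which pairs with the "%*.*s" printk change in super.c below: a cell name that is a slice of a larger buffer need not be NUL-terminated. A small sketch of that printing technique (the buffer and length here are made up):

static void print_cell_name(const char *name, int len)
{
	/* field width and precision both come from the argument list;
	 * the precision stops printk() reading past 'len' bytes */
	printk(KERN_INFO "kAFS: cell '%*.*s'\n", len, len, name);
}
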
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 67cf810e0fd..654d8fdbf01 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -100,6 +100,7 @@ int afs_open_socket(void)
 	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
 	if (ret < 0) {
 		sock_release(socket);
+		destroy_workqueue(afs_async_calls);
 		_leave(" = %d [bind]", ret);
 		return ret;
 	}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e932e5a3a0c..27201cffece 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -16,9 +16,9 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
@@ -29,9 +29,8 @@
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
 static void afs_i_init_once(void *foo);
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name,
-		      void *data, struct vfsmount *mnt);
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data);
 static struct inode *afs_alloc_inode(struct super_block *sb);
 static void afs_put_super(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
@@ -40,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
-	.get_sb		= afs_get_sb,
+	.mount		= afs_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= 0,
 };
@@ -48,8 +47,9 @@ struct file_system_type afs_fs_type = {
 static const struct super_operations afs_super_ops = {
 	.statfs		= afs_statfs,
 	.alloc_inode	= afs_alloc_inode,
+	.drop_inode	= afs_drop_inode,
 	.destroy_inode	= afs_destroy_inode,
-	.clear_inode	= afs_clear_inode,
+	.evict_inode	= afs_evict_inode,
 	.put_super	= afs_put_super,
 	.show_options	= generic_show_options,
 };
@@ -62,12 +62,14 @@ enum {
 	afs_opt_cell,
 	afs_opt_rwpath,
 	afs_opt_vol,
+	afs_opt_autocell,
 };
 
 static const match_table_t afs_options_list = {
 	{ afs_opt_cell,		"cell=%s"	},
 	{ afs_opt_rwpath,	"rwpath"	},
 	{ afs_opt_vol,		"vol=%s"	},
+	{ afs_opt_autocell,	"autocell"	},
 	{ afs_no_opt,		NULL		},
 };
 
@@ -151,7 +153,8 @@ static int afs_parse_options(struct afs_mount_params *params,
 		switch (token) {
 		case afs_opt_cell:
 			cell = afs_cell_lookup(args[0].from,
-					       args[0].to - args[0].from);
+					       args[0].to - args[0].from,
+					       false);
 			if (IS_ERR(cell))
 				return PTR_ERR(cell);
 			afs_put_cell(params->cell);
@@ -166,6 +169,10 @@ static int afs_parse_options(struct afs_mount_params *params,
 			*devname = args[0].from;
 			break;
 
+		case afs_opt_autocell:
+			params->autocell = 1;
+			break;
+
 		default:
 			printk(KERN_ERR "kAFS:"
 			       " Unknown or invalid mount option: '%s'\n", p);
@@ -252,10 +259,10 @@ static int afs_parse_device_name(struct afs_mount_params *params,
 
 	/* lookup the cell record */
 	if (cellname || !params->cell) {
-		cell = afs_cell_lookup(cellname, cellnamesz);
+		cell = afs_cell_lookup(cellname, cellnamesz, true);
 		if (IS_ERR(cell)) {
-			printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n",
-			       cellname ?: "");
+			printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+			       cellnamesz, cellnamesz, cellname ?: "");
 			return PTR_ERR(cell);
 		}
 		afs_put_cell(params->cell);
@@ -321,6 +328,9 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	if (IS_ERR(inode))
 		goto error_inode;
 
+	if (params->autocell)
+		set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
+
 	ret = -ENOMEM;
 	root = d_alloc_root(inode);
 	if (!root)
@@ -348,11 +358,8 @@ error:
 /*
  * get an AFS superblock
  */
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags,
-		      const char *dev_name,
-		      void *options,
-		      struct vfsmount *mnt)
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *options)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
@@ -416,12 +423,11 @@ static int afs_get_sb(struct file_system_type,
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
 	}
 
-	simple_set_mnt(mnt, sb);
 	afs_put_volume(params.volume);
 	afs_put_cell(params.cell);
 	kfree(new_opts);
 	_leave(" = 0 [%p]", sb);
-	return 0;
+	return dget(sb->s_root);
 
 error:
 	afs_put_volume(params.volume);
@@ -429,7 +435,7 @@ error:
 	key_put(params.key);
 	kfree(new_opts);
 	_leave(" = %d", ret);
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /*
@@ -441,12 +447,8 @@ static void afs_put_super(struct super_block *sb)
 
 	_enter("");
 
-	lock_kernel();
-
 	afs_put_volume(as->volume);
 
-	unlock_kernel();
-
 	_leave("");
 }
 
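
This file is part of the tree-wide ->get_sb to ->mount conversion: instead of filling a struct vfsmount through an out parameter and returning an int, ->mount returns the root dentry (or an ERR_PTR). A minimal hedged sketch for a simple single-instance filesystem, assuming a hypothetical myfs_fill_super() with the usual (sb, data, silent) signature:

static struct dentry *myfs_mount(struct file_system_type *fs_type,
				 int flags, const char *dev_name, void *data)
{
	/* mount_nodev() is the new-style counterpart of get_sb_nodev() */
	return mount_nodev(fs_type, flags, data, myfs_fill_super);
}

static struct file_system_type myfs_fs_type = {
	.owner   = THIS_MODULE,
	.name    = "myfs",
	.mount   = myfs_mount,
	.kill_sb = kill_anon_super,
};
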
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 722743b152d..15690bb1d3b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -438,7 +438,6 @@ no_more:
  */
 int afs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = page->mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	int ret;
 
@@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	wbc->nr_to_write -= ret;
-	if (wbc->nonblocking && bdi_write_congested(bdi))
-		wbc->encountered_congestion = 1;
 
 	_leave(" = 0");
 	return 0;
@@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping,
 				 struct writeback_control *wbc,
 				 pgoff_t index, pgoff_t end, pgoff_t *_next)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	struct page *page;
 	int ret, n;
@@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping,
 
 		wbc->nr_to_write -= ret;
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
-			wbc->encountered_congestion = 1;
-			break;
-		}
-
 		cond_resched();
 	} while (index < end && wbc->nr_to_write > 0);
 
@@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping,
 int afs_writepages(struct address_space *mapping,
 		   struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	pgoff_t start, end, next;
 	int ret;
 
 	_enter("");
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		_leave(" = 0 [congest]");
-		return 0;
-	}
-
 	if (wbc->range_cyclic) {
 		start = mapping->writeback_index;
 		end = -1;
 		ret = afs_writepages_region(mapping, wbc, start, end, &next);
-		if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
-		    !(wbc->nonblocking && wbc->encountered_congestion))
+		if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
 			ret = afs_writepages_region(mapping, wbc, 0, start,
 						    &next);
 		mapping->writeback_index = next;
diff --git a/fs/aio.c b/fs/aio.c
index 1ccf25cef1f..8c8f6c5b6d7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	 */
 	ret = retry(iocb);
 
-	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
+		/*
+		 * There's no easy way to restart the syscall since other AIO's
+		 * may be already running. Just fail this IO with EINTR.
+		 */
+		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+			     ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
+			ret = -EINTR;
 		aio_complete(iocb, ret, 0);
+	}
 out:
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -1277,7 +1285,7 @@ out:
 /* sys_io_destroy:
  *	Destroy the aio_context specified.  May cancel any outstanding
  *	AIOs and block on completion.  Will fail with -ENOSYS if not
- *	implemented.  May fail with -EFAULT if the context pointed to
+ *	implemented.  May fail with -EINVAL if the context pointed to
  *	is invalid.
  */
 SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
@@ -1535,7 +1543,19 @@ static void aio_batch_add(struct address_space *mapping,
 	}
 
 	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-	BUG_ON(!igrab(mapping->host));
+
+	/*
+	 * we should be using igrab here, but
+	 * we don't want to hammer on the global
+	 * inode spinlock just to take an extra
+	 * reference on a file that we must already
+	 * have a reference to.
+	 *
+	 * When we're called, we always have a reference
+	 * on the file, so we must always have a reference
+	 * on the inode, so ihold() is safe here.
+	 */
+	ihold(mapping->host);
 	abe->mapping = mapping;
 	hlist_add_head(&abe->list, &batch_hash[bucket]);
 	return;
@@ -1659,6 +1679,9 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 	if (unlikely(nr < 0))
 		return -EINVAL;
 
+	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
+		nr = LONG_MAX/sizeof(*iocbpp);
+
 	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
 		return -EFAULT;
 
@@ -1795,15 +1818,16 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 
 /* io_getevents:
  *	Attempts to read at least min_nr events and up to nr events from
- *	the completion queue for the aio_context specified by ctx_id.  May
- *	fail with -EINVAL if ctx_id is invalid, if min_nr is out of range,
- *	if nr is out of range, if when is out of range.  May fail with
- *	-EFAULT if any of the memory specified to is invalid.  May return
- *	0 or < min_nr if no events are available and the timeout specified
- *	by when has elapsed, where when == NULL specifies an infinite
- *	timeout.  Note that the timeout pointed to by when is relative and
- *	will be updated if not NULL and the operation blocks.  Will fail
- *	with -ENOSYS if not implemented.
+ *	the completion queue for the aio_context specified by ctx_id.  If
+ *	it succeeds, the number of read events is returned.  May fail with
+ *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
+ *	out of range, if timeout is out of range.  May fail with -EFAULT
+ *	if any of the memory specified is invalid.  May return 0 or
+ *	< min_nr if the timeout specified by timeout has elapsed
+ *	before sufficient events are available, where timeout == NULL
+ *	specifies an infinite timeout.  Note that the timeout pointed to by
+ *	timeout is relative and will be updated if not NULL and the
+ *	operation blocks.  Will fail with -ENOSYS if not implemented.
  */
 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		long, min_nr,
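
The clamp added in do_io_submit() keeps nr * sizeof(*iocbpp) from overflowing a long before it reaches access_ok() and the copy loop. The same arithmetic guard in isolation, as a hypothetical helper (not from the patch):

#include <linux/kernel.h>

/* Cap a user-supplied element count so count * size cannot overflow. */
static long clamp_elem_count(long nr, size_t elem_size)
{
	if (nr > (long)(LONG_MAX / elem_size))
		nr = LONG_MAX / elem_size;
	return nr;
}
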
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda8..57ce55b2564 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,10 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
 static const struct file_operations anon_inode_fops;
 
-static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
-			       const char *dev_name, void *data,
-			       struct vfsmount *mnt)
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
-			     mnt);
+	return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
 }
 
 /*
@@ -45,7 +43,7 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 
 static struct file_system_type anon_inode_fs_type = {
 	.name		= "anon_inodefs",
-	.get_sb		= anon_inodefs_get_sb,
+	.mount		= anon_inodefs_mount,
 	.kill_sb	= kill_anon_super,
 };
 static const struct dentry_operations anon_inodefs_dentry_operations = {
@@ -111,10 +109,9 @@ struct file *anon_inode_getfile(const char *name,
 	path.mnt = mntget(anon_inode_mnt);
 	/*
 	 * We know the anon_inode inode count is always greater than zero,
-	 * so we can avoid doing an igrab() and we can use an open-coded
-	 * atomic_inc().
+	 * so ihold() is safe.
 	 */
-	atomic_inc(&anon_inode_inode->i_count);
+	ihold(anon_inode_inode);
 
 	path.dentry->d_op = &anon_inodefs_dentry_operations;
 	d_instantiate(path.dentry, anon_inode_inode);
@@ -194,6 +191,7 @@ static struct inode *anon_inode_mkinode(void)
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	inode->i_ino = get_next_ino();
 	inode->i_fop = &anon_inode_fops;
 
 	inode->i_mapping->a_ops = &anon_aops;
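
This hunk and the aio.c one above make the same substitution: igrab() takes the global inode lock and can fail for an inode that is mid-eviction, while ihold() is a plain reference bump that is only legal when the caller already holds a reference pinning the inode. A hedged sketch of the two calls' contracts (pin_inode is a hypothetical wrapper):

static struct inode *pin_inode(struct inode *inode, bool already_held)
{
	if (already_held) {
		ihold(inode);		/* cheap; the caller's ref pins it */
		return inode;
	}
	return igrab(inode);		/* may return NULL if being evicted */
}
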
diff --git a/fs/attr.c b/fs/attr.c
index b4fa3b0aa59..7ca41811afa 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -14,35 +14,53 @@
 #include <linux/fcntl.h>
 #include <linux/security.h>
 
-/* Taken over from the old code... */
-
-/* POSIX UID/GID verification for setting inode attributes. */
+/**
+ * inode_change_ok - check if attribute changes to an inode are allowed
+ * @inode:	inode to check
+ * @attr:	attributes to change
+ *
+ * Check if we are allowed to change the attributes contained in @attr
+ * in the given inode.  This includes the normal unix access permission
+ * checks, as well as checks for rlimits and others.
+ *
+ * Should be called as the first thing in ->setattr implementations,
+ * possibly after taking additional locks.
+ */
 int inode_change_ok(const struct inode *inode, struct iattr *attr)
 {
-	int retval = -EPERM;
 	unsigned int ia_valid = attr->ia_valid;
 
+	/*
+	 * First check size constraints.  These can't be overriden using
+	 * ATTR_FORCE.
+	 */
+	if (ia_valid & ATTR_SIZE) {
+		int error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
 	/* If force is set do it anyway. */
 	if (ia_valid & ATTR_FORCE)
-		goto fine;
+		return 0;
 
 	/* Make sure a caller can chown. */
 	if ((ia_valid & ATTR_UID) &&
 	    (current_fsuid() != inode->i_uid ||
 	     attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
-		goto error;
+		return -EPERM;
 
 	/* Make sure caller can chgrp. */
 	if ((ia_valid & ATTR_GID) &&
 	    (current_fsuid() != inode->i_uid ||
 	    (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
 	    !capable(CAP_CHOWN))
-		goto error;
+		return -EPERM;
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
 		if (!is_owner_or_cap(inode))
-			goto error;
+			return -EPERM;
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
 				inode->i_gid) && !capable(CAP_FSETID))
@@ -52,12 +70,10 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 	/* Check for setting the inode time. */
 	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
 		if (!is_owner_or_cap(inode))
-			goto error;
+			return -EPERM;
 	}
-fine:
-	retval = 0;
-error:
-	return retval;
+
+	return 0;
 }
 EXPORT_SYMBOL(inode_change_ok);
 
@@ -105,21 +121,21 @@ out_big:
 EXPORT_SYMBOL(inode_newsize_ok);
 
 /**
- * generic_setattr - copy simple metadata updates into the generic inode
+ * setattr_copy - copy simple metadata updates into the generic inode
  * @inode:	the inode to be updated
  * @attr:	the new attributes
  *
- * generic_setattr must be called with i_mutex held.
+ * setattr_copy must be called with i_mutex held.
 *
- * generic_setattr updates the inode's metadata with that specified
+ * setattr_copy updates the inode's metadata with that specified
 * in attr. Noticably missing is inode size update, which is more complex
- * as it requires pagecache updates. See simple_setsize.
+ * as it requires pagecache updates.
 *
 * The inode is not marked as dirty after this operation. The rationale is
 * that for "simple" filesystems, the struct inode is the inode storage.
 * The caller is free to mark the inode dirty afterwards if needed.
 */
-void generic_setattr(struct inode *inode, const struct iattr *attr)
+void setattr_copy(struct inode *inode, const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -144,32 +160,7 @@ void generic_setattr(struct inode *inode, const struct iattr *attr)
 		inode->i_mode = mode;
 	}
 }
-EXPORT_SYMBOL(generic_setattr);
-
-/*
- * note this function is deprecated, the new truncate sequence should be
- * used instead -- see eg. simple_setsize, generic_setattr.
- */
-int inode_setattr(struct inode *inode, const struct iattr *attr)
-{
-	unsigned int ia_valid = attr->ia_valid;
-
-	if (ia_valid & ATTR_SIZE &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error;
-
-		error = vmtruncate(inode, attr->ia_size);
-		if (error)
-			return error;
-	}
-
-	generic_setattr(inode, attr);
-
-	mark_inode_dirty(inode);
-
-	return 0;
-}
-EXPORT_SYMBOL(inode_setattr);
+EXPORT_SYMBOL(setattr_copy);
 
 int notify_change(struct dentry * dentry, struct iattr * attr)
 {
@@ -237,13 +228,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	if (ia_valid & ATTR_SIZE)
 		down_write(&dentry->d_inode->i_alloc_sem);
 
-	if (inode->i_op && inode->i_op->setattr) {
+	if (inode->i_op->setattr)
 		error = inode->i_op->setattr(dentry, attr);
-	} else {
-		error = inode_change_ok(inode, attr);
-		if (!error)
-			error = inode_setattr(inode, attr);
-	}
+	else
+		error = simple_setattr(dentry, attr);
 
 	if (ia_valid & ATTR_SIZE)
 		up_write(&dentry->d_inode->i_alloc_sem);
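
With inode_setattr() removed, the kerneldoc added above spells out the replacement sequence for a ->setattr implementation: inode_change_ok() first (which now also validates ATTR_SIZE via inode_newsize_ok()), then the size change, then setattr_copy() plus an explicit dirty. A hedged sketch for a simple filesystem, roughly what simple_setattr() does (myfs_setattr is hypothetical; truncate_setsize() is assumed available in this kernel):

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);	/* permission + size checks */
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode))
		truncate_setsize(inode, attr->ia_size);

	setattr_copy(inode, attr);		/* copy uid/gid/mode/times */
	mark_inode_dirty(inode);
	return 0;
}
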
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
deleted file mode 100644
index 5f3bea90911..00000000000
--- a/fs/autofs/Kconfig
+++ /dev/null
@@ -1,21 +0,0 @@
-config AUTOFS_FS
-	tristate "Kernel automounter support"
-	help
-	  The automounter is a tool to automatically mount remote file systems
-	  on demand. This implementation is partially kernel-based to reduce
-	  overhead in the already-mounted case; this is unlike the BSD
-	  automounter (amd), which is a pure user space daemon.
-
-	  To use the automounter you need the user-space tools from the autofs
-	  package; you can find the location in <file:Documentation/Changes>.
-	  You also want to answer Y to "NFS file system support", below.
-
-	  If you want to use the newer version of the automounter with more
-	  features, say N here and say Y to "Kernel automounter v4 support",
-	  below.
-
-	  To compile this support as a module, choose M here: the module will be
-	  called autofs.
-
-	  If you are not a part of a fairly large, distributed network, you
-	  probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
deleted file mode 100644
index 453a60f46d0..00000000000
--- a/fs/autofs/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux autofs-filesystem routines.
-#
-
-obj-$(CONFIG_AUTOFS_FS) += autofs.o
-
-autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
deleted file mode 100644
index 901a3e67ec4..00000000000
--- a/fs/autofs/autofs_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- * linux/fs/autofs/autofs_i.h
- *
- * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/* Internal header file for autofs */
-
-#include <linux/auto_fs.h>
-
-/* This is the range of ioctl() numbers we claim as ours */
-#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
-#define AUTOFS_IOC_COUNT 32
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/wait.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/sched.h>
-
-#include <asm/current.h>
-#include <asm/uaccess.h>
-
-#ifdef DEBUG
-#define DPRINTK(D) (printk D)
-#else
-#define DPRINTK(D) ((void)0)
-#endif
-
-/*
- * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
- * kernel will keep the negative response cached for up to the time given
- * here, although the time can be shorter if the kernel throws the dcache
- * entry away.  This probably should be settable from user space.
- */
-#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ)	/* 1 minute */
-
-/* Structures associated with the root directory hash table */
-
-#define AUTOFS_HASH_SIZE 67
-
-struct autofs_dir_ent {
-	int hash;
-	char *name;
-	int len;
-	ino_t ino;
-	struct dentry *dentry;
-	/* Linked list of entries */
-	struct autofs_dir_ent *next;
-	struct autofs_dir_ent **back;
-	/* The following entries are for the expiry system */
-	unsigned long last_usage;
-	struct list_head exp;
-};
-
-struct autofs_dirhash {
-	struct autofs_dir_ent *h[AUTOFS_HASH_SIZE];
-	struct list_head expiry_head;
-};
-
-struct autofs_wait_queue {
-	wait_queue_head_t queue;
-	struct autofs_wait_queue *next;
-	autofs_wqt_t wait_queue_token;
-	/* We use the following to see what we are waiting for */
-	int hash;
-	int len;
-	char *name;
-	/* This is for status reporting upon return */
-	int status;
-	int wait_ctr;
-};
-
-struct autofs_symlink {
-	char *data;
-	int len;
-	time_t mtime;
-};
-
-#define AUTOFS_MAX_SYMLINKS 256
-
-#define AUTOFS_ROOT_INO 1
-#define AUTOFS_FIRST_SYMLINK 2
-#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS)
-
-#define AUTOFS_SYMLINK_BITMAP_LEN \
-	((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8))
-
-#define AUTOFS_SBI_MAGIC 0x6d4a556d
-
-struct autofs_sb_info {
-	u32 magic;
-	struct file *pipe;
-	struct pid *oz_pgrp;
-	int catatonic;
-	struct super_block *sb;
-	unsigned long exp_timeout;
-	ino_t next_dir_ino;
-	struct autofs_wait_queue *queues; /* Wait queue pointer */
-	struct autofs_dirhash dirhash; /* Root directory hash */
-	struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS];
-	unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN];
-};
-
-static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
-{
-	return (struct autofs_sb_info *)(sb->s_fs_info);
-}
-
-/* autofs_oz_mode(): do we see the man behind the curtain?  (The
-   processes which do manipulations for us in user space sees the raw
-   filesystem without "magic".) */
-
-static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
-	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
-}
-
-/* Hash operations */
-
-void autofs_initialize_hash(struct autofs_dirhash *);
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *);
-void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
-void autofs_hash_delete(struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
-void autofs_hash_dputall(struct autofs_dirhash *);
-void autofs_hash_nuke(struct autofs_sb_info *);
-
-/* Expiration-handling functions */
-
-void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt);
-
-/* Operations structures */
-
-extern const struct inode_operations autofs_root_inode_operations;
-extern const struct inode_operations autofs_symlink_inode_operations;
-extern const struct file_operations autofs_root_operations;
-
-/* Initializing function */
-
-int autofs_fill_super(struct super_block *, void *, int);
-void autofs_kill_sb(struct super_block *sb);
-struct inode *autofs_iget(struct super_block *, unsigned long);
-
-/* Queue management functions */
-
-int autofs_wait(struct autofs_sb_info *,struct qstr *);
-int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
-void autofs_catatonic_mode(struct autofs_sb_info *);
-
-#ifdef DEBUG
-void autofs_say(const char *name, int len);
-#else
-#define autofs_say(n,l) ((void)0)
-#endif
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
deleted file mode 100644
index e947915109e..00000000000
--- a/fs/autofs/dirhash.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/dirhash.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include "autofs_i.h"
-
-/* Functions for maintenance of expiry queue */
-
-static void autofs_init_usage(struct autofs_dirhash *dh,
-			      struct autofs_dir_ent *ent)
-{
-	list_add_tail(&ent->exp, &dh->expiry_head);
-	ent->last_usage = jiffies;
-}
-
-static void autofs_delete_usage(struct autofs_dir_ent *ent)
-{
-	list_del(&ent->exp);
-}
-
-void autofs_update_usage(struct autofs_dirhash *dh,
-			 struct autofs_dir_ent *ent)
-{
-	autofs_delete_usage(ent);   /* Unlink from current position */
-	autofs_init_usage(dh,ent);  /* Relink at queue tail */
-}
-
-struct autofs_dir_ent *autofs_expire(struct super_block *sb,
-				     struct autofs_sb_info *sbi,
-				     struct vfsmount *mnt)
-{
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	unsigned long timeout = sbi->exp_timeout;
-
-	while (1) {
-		struct path path;
-		int umount_ok;
-
-		if ( list_empty(&dh->expiry_head) || sbi->catatonic )
-			return NULL;	/* No entries */
-		/* We keep the list sorted by last_usage and want old stuff */
-		ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp);
-		if (jiffies - ent->last_usage < timeout)
-			break;
-		/* Move to end of list in case expiry isn't desirable */
-		autofs_update_usage(dh, ent);
-
-		/* Check to see that entry is expirable */
-		if ( ent->ino < AUTOFS_FIRST_DIR_INO )
-			return ent; /* Symlinks are always expirable */
-
-		/* Get the dentry for the autofs subdirectory */
-		path.dentry = ent->dentry;
-
-		if (!path.dentry) {
-			/* Should only happen in catatonic mode */
-			printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
-			autofs_delete_usage(ent);
-			continue;
-		}
-
-		if (!path.dentry->d_inode) {
-			dput(path.dentry);
-			printk("autofs: negative dentry on expiry queue: %s\n",
-			       ent->name);
-			autofs_delete_usage(ent);
-			continue;
-		}
-
-		/* Make sure entry is mounted and unused; note that dentry will
-		   point to the mounted-on-top root. */
-		if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
-		    !d_mountpoint(path.dentry)) {
-			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-			continue;
-		}
-		path.mnt = mnt;
-		path_get(&path);
-		if (!follow_down(&path)) {
-			path_put(&path);
-			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-			continue;
-		}
-		while (d_mountpoint(path.dentry) && follow_down(&path))
-			;
-		umount_ok = may_umount(path.mnt);
-		path_put(&path);
-
-		if (umount_ok) {
-			DPRINTK(("autofs: signaling expire on %s\n", ent->name));
-			return ent; /* Expirable! */
-		}
-		DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-	}
-	return NULL;		/* No expirable entries */
-}
-
-void autofs_initialize_hash(struct autofs_dirhash *dh) {
-	memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *));
-	INIT_LIST_HEAD(&dh->expiry_head);
-}
-
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name)
-{
-	struct autofs_dir_ent *dhn;
-
-	DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash));
-	autofs_say(name->name,name->len);
-
-	for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) {
-		if ( name->hash == dhn->hash &&
-		     name->len == dhn->len &&
-		     !memcmp(name->name, dhn->name, name->len) )
-			break;
-	}
-
-	return dhn;
-}
-
-void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent)
-{
-	struct autofs_dir_ent **dhnp;
-
-	DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash));
-	autofs_say(ent->name,ent->len);
-
-	autofs_init_usage(dh,ent);
-	if (ent->dentry)
-		dget(ent->dentry);
-
-	dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE];
-	ent->next = *dhnp;
-	ent->back = dhnp;
-	*dhnp = ent;
-	if ( ent->next )
-		ent->next->back = &(ent->next);
-}
-
-void autofs_hash_delete(struct autofs_dir_ent *ent)
-{
-	*(ent->back) = ent->next;
-	if ( ent->next )
-		ent->next->back = ent->back;
-
-	autofs_delete_usage(ent);
-
-	if ( ent->dentry )
-		dput(ent->dentry);
-	kfree(ent->name);
-	kfree(ent);
-}
-
-/*
- * Used by readdir().  We must validate "ptr", so we can't simply make it
- * a pointer.  Values below 0xffff are reserved; calling with any value
- * <= 0x10000 will return the first entry found.
- *
- * "last" can be NULL or the value returned by the last search *if* we
- * want the next sequential entry.
- */
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh,
-					off_t *ptr, struct autofs_dir_ent *last)
-{
-	int bucket, ecount, i;
-	struct autofs_dir_ent *ent;
-
-	bucket = (*ptr >> 16) - 1;
-	ecount = *ptr & 0xffff;
-
-	if ( bucket < 0 ) {
-		bucket = ecount = 0;
-	}
-
-	DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount));
-
-	ent = last ? last->next : NULL;
-
-	if ( ent ) {
-		ecount++;
-	} else {
-		while ( bucket < AUTOFS_HASH_SIZE ) {
-			ent = dh->h[bucket];
-			for ( i = ecount ; ent && i ; i-- )
-				ent = ent->next;
-
-			if (ent) {
-				ecount++; /* Point to *next* entry */
-				break;
-			}
-
-			bucket++; ecount = 0;
-		}
-	}
-
-#ifdef DEBUG
-	if ( !ent )
-		printk("autofs_hash_enum: nothing found\n");
-	else {
-		printk("autofs_hash_enum: found hash %08x, name", ent->hash);
-		autofs_say(ent->name,ent->len);
-	}
-#endif
-
-	*ptr = ((bucket+1) << 16) + ecount;
-	return ent;
-}
-
-/* Iterate over all the ents, and remove all dentry pointers.  Used on
-   entering catatonic mode, in order to make the filesystem unmountable. */
-void autofs_hash_dputall(struct autofs_dirhash *dh)
-{
-	int i;
-	struct autofs_dir_ent *ent;
-
-	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-		for ( ent = dh->h[i] ; ent ; ent = ent->next ) {
-			if ( ent->dentry ) {
-				dput(ent->dentry);
-				ent->dentry = NULL;
-			}
-		}
-	}
-}
-
-/* Delete everything.  This is used on filesystem destruction, so we
-   make no attempt to keep the pointers valid */
-void autofs_hash_nuke(struct autofs_sb_info *sbi)
-{
-	int i;
-	struct autofs_dir_ent *ent, *nent;
-
-	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-		for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
-			nent = ent->next;
-			if ( ent->dentry )
-				dput(ent->dentry);
-			kfree(ent->name);
-			kfree(ent);
-		}
-	}
-}
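
The comment above autofs_hash_enum() in the deleted code documents the readdir cursor layout: the hash bucket (biased by one) lives above bit 16 and the entry count within the bucket sits in the low 16 bits. A sketch of that encoding pulled out into hypothetical helpers, matching the deleted logic:

static off_t cursor_encode(int bucket, int entry)
{
	/* bucket is stored +1 so that values <= 0x10000 mean "start over" */
	return ((off_t)(bucket + 1) << 16) + (entry & 0xffff);
}

static void cursor_decode(off_t ptr, int *bucket, int *entry)
{
	*bucket = (int)(ptr >> 16) - 1;
	*entry  = ptr & 0xffff;
	if (*bucket < 0)
		*bucket = *entry = 0;
}
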
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
deleted file mode 100644
index cea5219b4f3..00000000000
--- a/fs/autofs/init.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include "autofs_i.h"
-
-static int autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
-}
-
-static struct file_system_type autofs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "autofs",
-	.get_sb		= autofs_get_sb,
-	.kill_sb	= autofs_kill_sb,
-};
-
-static int __init init_autofs_fs(void)
-{
-	return register_filesystem(&autofs_fs_type);
-}
-
-static void __exit exit_autofs_fs(void)
-{
-	unregister_filesystem(&autofs_fs_type);
-}
-
-module_init(init_autofs_fs);
-module_exit(exit_autofs_fs);
-
-#ifdef DEBUG
-void autofs_say(const char *name, int len)
-{
-	printk("(%d: ", len);
-	while ( len-- )
-		printk("%c", *name++);
-	printk(")\n");
-}
-#endif
-MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
deleted file mode 100644
index e1734f2d6e2..00000000000
--- a/fs/autofs/inode.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/parser.h>
-#include <linux/bitops.h>
-#include <linux/magic.h>
-#include "autofs_i.h"
-#include <linux/module.h>
-
-void autofs_kill_sb(struct super_block *sb)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(sb);
-	unsigned int n;
-
-	/*
-	 * In the event of a failure in get_sb_nodev the superblock
-	 * info is not present so nothing else has been setup, so
-	 * just call kill_anon_super when we are called from
-	 * deactivate_super.
-	 */
-	if (!sbi)
-		goto out_kill_sb;
-
-	if (!sbi->catatonic)
-		autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
-
-	put_pid(sbi->oz_pgrp);
-
-	autofs_hash_nuke(sbi);
-	for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
-		if (test_bit(n, sbi->symlink_bitmap))
-			kfree(sbi->symlink[n].data);
-	}
-
-	kfree(sb->s_fs_info);
-
-out_kill_sb:
-	DPRINTK(("autofs: shutting down\n"));
-	kill_anon_super(sb);
-}
-
-static const struct super_operations autofs_sops = {
-	.statfs		= simple_statfs,
-	.show_options	= generic_show_options,
-};
-
-enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
-
-static const match_table_t autofs_tokens = {
-	{Opt_fd, "fd=%u"},
-	{Opt_uid, "uid=%u"},
-	{Opt_gid, "gid=%u"},
-	{Opt_pgrp, "pgrp=%u"},
-	{Opt_minproto, "minproto=%u"},
-	{Opt_maxproto, "maxproto=%u"},
-	{Opt_err, NULL}
-};
-
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
-			 pid_t *pgrp, int *minproto, int *maxproto)
-{
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-
-	*uid = current_uid();
-	*gid = current_gid();
-	*pgrp = task_pgrp_nr(current);
-
-	*minproto = *maxproto = AUTOFS_PROTO_VERSION;
-
-	*pipefd = -1;
-
-	if (!options)
-		return 1;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
-
-		token = match_token(p, autofs_tokens, args);
-		switch (token) {
-		case Opt_fd:
-			if (match_int(&args[0], &option))
-				return 1;
-			*pipefd = option;
-			break;
-		case Opt_uid:
-			if (match_int(&args[0], &option))
-				return 1;
-			*uid = option;
-			break;
-		case Opt_gid:
-			if (match_int(&args[0], &option))
-				return 1;
-			*gid = option;
-			break;
-		case Opt_pgrp:
-			if (match_int(&args[0], &option))
-				return 1;
-			*pgrp = option;
-			break;
-		case Opt_minproto:
-			if (match_int(&args[0], &option))
-				return 1;
-			*minproto = option;
-			break;
-		case Opt_maxproto:
-			if (match_int(&args[0], &option))
-				return 1;
-			*maxproto = option;
-			break;
-		default:
-			return 1;
-		}
-	}
-	return (*pipefd < 0);
-}
-
-int autofs_fill_super(struct super_block *s, void *data, int silent)
-{
-	struct inode * root_inode;
-	struct dentry * root;
-	struct file * pipe;
-	int pipefd;
-	struct autofs_sb_info *sbi;
-	int minproto, maxproto;
-	pid_t pgid;
-
-	save_mount_options(s, data);
-
-	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
-		goto fail_unlock;
-	DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
-
-	s->s_fs_info = sbi;
-	sbi->magic = AUTOFS_SBI_MAGIC;
-	sbi->pipe = NULL;
-	sbi->catatonic = 1;
-	sbi->exp_timeout = 0;
-	autofs_initialize_hash(&sbi->dirhash);
-	sbi->queues = NULL;
-	memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
-	sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO;
-	s->s_blocksize = 1024;
-	s->s_blocksize_bits = 10;
-	s->s_magic = AUTOFS_SUPER_MAGIC;
-	s->s_op = &autofs_sops;
-	s->s_time_gran = 1;
-	sbi->sb = s;
-
-	root_inode = autofs_iget(s, AUTOFS_ROOT_INO);
-	if (IS_ERR(root_inode))
-		goto fail_free;
-	root = d_alloc_root(root_inode);
-	pipe = NULL;
-
-	if (!root)
-		goto fail_iput;
-
-	/* Can this call block? - WTF cares? s is locked. */
-	if (parse_options(data, &pipefd, &root_inode->i_uid,
-			  &root_inode->i_gid, &pgid, &minproto,
-			  &maxproto)) {
-		printk("autofs: called with bogus options\n");
-		goto fail_dput;
-	}
-
-	/* Couldn't this be tested earlier? */
-	if (minproto > AUTOFS_PROTO_VERSION ||
-	    maxproto < AUTOFS_PROTO_VERSION) {
-		printk("autofs: kernel does not match daemon version\n");
-		goto fail_dput;
-	}
-
-	DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
-	sbi->oz_pgrp = find_get_pid(pgid);
-
-	if (!sbi->oz_pgrp) {
-		printk("autofs: could not find process group %d\n", pgid);
-		goto fail_dput;
-	}
-
-	pipe = fget(pipefd);
-
-	if (!pipe) {
-		printk("autofs: could not open pipe file descriptor\n");
-		goto fail_put_pid;
-	}
-
-	if (!pipe->f_op || !pipe->f_op->write)
-		goto fail_fput;
-	sbi->pipe = pipe;
-	sbi->catatonic = 0;
-
-	/*
-	 * Success! Install the root dentry now to indicate completion.
-	 */
-	s->s_root = root;
-	return 0;
-
-fail_fput:
-	printk("autofs: pipe file descriptor does not contain proper ops\n");
-	fput(pipe);
-fail_put_pid:
-	put_pid(sbi->oz_pgrp);
-fail_dput:
-	dput(root);
-	goto fail_free;
-fail_iput:
-	printk("autofs: get root dentry failed\n");
-	iput(root_inode);
-fail_free:
-	kfree(sbi);
-	s->s_fs_info = NULL;
-fail_unlock:
-	return -EINVAL;
-}
-
-struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
-{
-	unsigned int n;
-	struct autofs_sb_info *sbi = autofs_sbi(sb);
-	struct inode *inode;
-
-	inode = iget_locked(sb, ino);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
-		return inode;
-
-	/* Initialize to the default case (stub directory) */
-
-	inode->i_op = &simple_dir_inode_operations;
-	inode->i_fop = &simple_dir_operations;
-	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
-	inode->i_nlink = 2;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-
-	if (ino == AUTOFS_ROOT_INO) {
-		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
-		inode->i_op = &autofs_root_inode_operations;
-		inode->i_fop = &autofs_root_operations;
-		goto done;
-	}
-
-	inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
-	inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
-
-	if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
-		/* Symlink inode - should be in symlink list */
-		struct autofs_symlink *sl;
-
-		n = ino - AUTOFS_FIRST_SYMLINK;
-		if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
-			printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
-			goto done;
-		}
-
-		inode->i_op = &autofs_symlink_inode_operations;
-		sl = &sbi->symlink[n];
-		inode->i_private = sl;
-		inode->i_mode = S_IFLNK | S_IRWXUGO;
-		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
-		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
-		inode->i_size = sl->len;
-		inode->i_nlink = 1;
-	}
-
-done:
-	unlock_new_inode(inode);
-	return inode;
-}
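
The deleted autofs_iget() above is a textbook use of iget_locked(): a cache hit returns a ready inode, a miss returns a locked I_NEW inode that the caller must initialise and then publish with unlock_new_inode(). A skeletal, hypothetical version of the same shape:

static struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* already in cache, fully set up */

	/* ... fill in i_mode, i_op, i_fop and timestamps here ... */

	unlock_new_inode(inode);	/* clears I_NEW and wakes waiters */
	return inode;
}
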
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
deleted file mode 100644
index 9a0520b5066..00000000000
--- a/fs/autofs/root.c
+++ /dev/null
@@ -1,582 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/root.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/capability.h>
14#include <linux/errno.h>
15#include <linux/stat.h>
16#include <linux/slab.h>
17#include <linux/param.h>
18#include <linux/time.h>
19#include <linux/smp_lock.h>
20#include "autofs_i.h"
21
22static int autofs_root_readdir(struct file *,void *,filldir_t);
23static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *);
24static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
25static int autofs_root_unlink(struct inode *,struct dentry *);
26static int autofs_root_rmdir(struct inode *,struct dentry *);
27static int autofs_root_mkdir(struct inode *,struct dentry *,int);
28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
29
30const struct file_operations autofs_root_operations = {
31 .llseek = generic_file_llseek,
32 .read = generic_read_dir,
33 .readdir = autofs_root_readdir,
34 .ioctl = autofs_root_ioctl,
35};
36
37const struct inode_operations autofs_root_inode_operations = {
38 .lookup = autofs_root_lookup,
39 .unlink = autofs_root_unlink,
40 .symlink = autofs_root_symlink,
41 .mkdir = autofs_root_mkdir,
42 .rmdir = autofs_root_rmdir,
43};
44
45static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
46{
47 struct autofs_dir_ent *ent = NULL;
48 struct autofs_dirhash *dirhash;
49 struct autofs_sb_info *sbi;
50 struct inode * inode = filp->f_path.dentry->d_inode;
51 off_t onr, nr;
52
53 lock_kernel();
54
55 sbi = autofs_sbi(inode->i_sb);
56 dirhash = &sbi->dirhash;
57 nr = filp->f_pos;
58
59 switch(nr)
60 {
61 case 0:
62 if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
63 goto out;
64 filp->f_pos = ++nr;
65 /* fall through */
66 case 1:
67 if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
68 goto out;
69 filp->f_pos = ++nr;
70 /* fall through */
71 default:
72 while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
73 if (!ent->dentry || d_mountpoint(ent->dentry)) {
74 if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
75 goto out;
76 filp->f_pos = nr;
77 }
78 }
79 break;
80 }
81
82out:
83 unlock_kernel();
84 return 0;
85}
86
87static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi)
88{
89 struct inode * inode;
90 struct autofs_dir_ent *ent;
91 int status = 0;
92
93 if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
94 do {
95 if (status && dentry->d_inode) {
96 if (status != -ENOENT)
97 printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
98 return 0; /* Try to get the kernel to invalidate this dentry */
99 }
100
101 /* Turn this into a real negative dentry? */
102 if (status == -ENOENT) {
103 dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT;
104 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
105 return 1;
106 } else if (status) {
107 /* Return a negative dentry, but leave it "pending" */
108 return 1;
109 }
110 status = autofs_wait(sbi, &dentry->d_name);
111 } while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
112 }
113
114 /* Abuse this field as a pointer to the directory entry, used to
115 find the expire list pointers */
116 dentry->d_time = (unsigned long) ent;
117
118 if (!dentry->d_inode) {
119 inode = autofs_iget(sb, ent->ino);
120 if (IS_ERR(inode)) {
121 /* Failed, but leave pending for next time */
122 return 1;
123 }
124 dentry->d_inode = inode;
125 }
126
127 /* If this is a directory that isn't a mount point, bitch at the
128 daemon and fix it in user space */
129 if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
130 return !autofs_wait(sbi, &dentry->d_name);
131 }
132
133	/* We don't update the usages for the autofs daemon itself; this
134	   is necessary for recursive autofs mounts */
135 if (!autofs_oz_mode(sbi)) {
136 autofs_update_usage(&sbi->dirhash,ent);
137 }
138
139 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
140 return 1;
141}
142
143
144/*
145 * Revalidate is called on every cache lookup. Some of those
146 * cache lookups may actually happen while the dentry is not
147 * yet completely filled in, and revalidate has to delay such
148 * lookups..
149 */
150static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
151{
152 struct inode * dir;
153 struct autofs_sb_info *sbi;
154 struct autofs_dir_ent *ent;
155 int res;
156
157 lock_kernel();
158 dir = dentry->d_parent->d_inode;
159 sbi = autofs_sbi(dir->i_sb);
160
161 /* Pending dentry */
162 if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
163 if (autofs_oz_mode(sbi))
164 res = 1;
165 else
166 res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
167 unlock_kernel();
168 return res;
169 }
170
171 /* Negative dentry.. invalidate if "old" */
172 if (!dentry->d_inode) {
173 unlock_kernel();
174 return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT);
175 }
176
177 /* Check for a non-mountpoint directory */
178 if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
179 if (autofs_oz_mode(sbi))
180 res = 1;
181 else
182 res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
183 unlock_kernel();
184 return res;
185 }
186
187 /* Update the usage list */
188 if (!autofs_oz_mode(sbi)) {
189 ent = (struct autofs_dir_ent *) dentry->d_time;
190 if (ent)
191 autofs_update_usage(&sbi->dirhash,ent);
192 }
193 unlock_kernel();
194 return 1;
195}
196
197static const struct dentry_operations autofs_dentry_operations = {
198 .d_revalidate = autofs_revalidate,
199};
200
201static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
202{
203 struct autofs_sb_info *sbi;
204 int oz_mode;
205
206 DPRINTK(("autofs_root_lookup: name = "));
207 lock_kernel();
208 autofs_say(dentry->d_name.name,dentry->d_name.len);
209
210 if (dentry->d_name.len > NAME_MAX) {
211 unlock_kernel();
212 return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */
213 }
214
215 sbi = autofs_sbi(dir->i_sb);
216
217 oz_mode = autofs_oz_mode(sbi);
218 DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
219 "oz_mode = %d\n", task_pid_nr(current),
220 task_pgrp_nr(current), sbi->catatonic,
221 oz_mode));
222
223 /*
224 * Mark the dentry incomplete, but add it. This is needed so
225 * that the VFS layer knows about the dentry, and we can count
226 * on catching any lookups through the revalidate.
227 *
228 * Let all the hard work be done by the revalidate function that
229 * needs to be able to do this anyway..
230 *
231 * We need to do this before we release the directory semaphore.
232 */
233 dentry->d_op = &autofs_dentry_operations;
234 dentry->d_flags |= DCACHE_AUTOFS_PENDING;
235 d_add(dentry, NULL);
236
237 mutex_unlock(&dir->i_mutex);
238 autofs_revalidate(dentry, nd);
239 mutex_lock(&dir->i_mutex);
240
241 /*
242 * If we are still pending, check if we had to handle
243 * a signal. If so we can force a restart..
244 */
245 if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
246 /* See if we were interrupted */
247 if (signal_pending(current)) {
248 sigset_t *sigset = &current->pending.signal;
249 if (sigismember (sigset, SIGKILL) ||
250 sigismember (sigset, SIGQUIT) ||
251 sigismember (sigset, SIGINT)) {
252 unlock_kernel();
253 return ERR_PTR(-ERESTARTNOINTR);
254 }
255 }
256 }
257 unlock_kernel();
258
259 /*
260 * If this dentry is unhashed, then we shouldn't honour this
261 * lookup even if the dentry is positive. Returning ENOENT here
262 * doesn't do the right thing for all system calls, but it should
263 * be OK for the operations we permit from an autofs.
264 */
265 if (dentry->d_inode && d_unhashed(dentry))
266 return ERR_PTR(-ENOENT);
267
268 return NULL;
269}
270
271static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
272{
273 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
274 struct autofs_dirhash *dh = &sbi->dirhash;
275 struct autofs_dir_ent *ent;
276 unsigned int n;
277 int slsize;
278 struct autofs_symlink *sl;
279 struct inode *inode;
280
281 DPRINTK(("autofs_root_symlink: %s <- ", symname));
282 autofs_say(dentry->d_name.name,dentry->d_name.len);
283
284 lock_kernel();
285 if (!autofs_oz_mode(sbi)) {
286 unlock_kernel();
287 return -EACCES;
288 }
289
290 if (autofs_hash_lookup(dh, &dentry->d_name)) {
291 unlock_kernel();
292 return -EEXIST;
293 }
294
295 n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
296 if (n >= AUTOFS_MAX_SYMLINKS) {
297 unlock_kernel();
298 return -ENOSPC;
299 }
300
301 set_bit(n,sbi->symlink_bitmap);
302 sl = &sbi->symlink[n];
303 sl->len = strlen(symname);
304 sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
305 if (!sl->data) {
306 clear_bit(n,sbi->symlink_bitmap);
307 unlock_kernel();
308 return -ENOSPC;
309 }
310
311 ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
312 if (!ent) {
313 kfree(sl->data);
314 clear_bit(n,sbi->symlink_bitmap);
315 unlock_kernel();
316 return -ENOSPC;
317 }
318
319 ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
320 if (!ent->name) {
321 kfree(sl->data);
322 kfree(ent);
323 clear_bit(n,sbi->symlink_bitmap);
324 unlock_kernel();
325 return -ENOSPC;
326 }
327
328 memcpy(sl->data,symname,slsize);
329 sl->mtime = get_seconds();
330
331 ent->ino = AUTOFS_FIRST_SYMLINK + n;
332 ent->hash = dentry->d_name.hash;
333 memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
334 ent->dentry = NULL; /* We don't keep the dentry for symlinks */
335
336 autofs_hash_insert(dh,ent);
337
338 inode = autofs_iget(dir->i_sb, ent->ino);
339 if (IS_ERR(inode))
340 return PTR_ERR(inode);
341
342 d_instantiate(dentry, inode);
343 unlock_kernel();
344 return 0;
345}
346
347/*
348 * NOTE!
349 *
350 * Normal filesystems would do a "d_delete()" to tell the VFS dcache
351 * that the file no longer exists. However, doing that means that the
352 * VFS layer can turn the dentry into a negative dentry, which we
353 * obviously do not want (we're dropping the entry not because it
354 * doesn't exist, but because it has timed out).
355 *
356 * Also see autofs_root_rmdir()..
357 */
358static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
359{
360 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
361 struct autofs_dirhash *dh = &sbi->dirhash;
362 struct autofs_dir_ent *ent;
363 unsigned int n;
364
365 /* This allows root to remove symlinks */
366 lock_kernel();
367 if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
368 unlock_kernel();
369 return -EACCES;
370 }
371
372 ent = autofs_hash_lookup(dh, &dentry->d_name);
373 if (!ent) {
374 unlock_kernel();
375 return -ENOENT;
376 }
377
378 n = ent->ino - AUTOFS_FIRST_SYMLINK;
379 if (n >= AUTOFS_MAX_SYMLINKS) {
380 unlock_kernel();
381 return -EISDIR; /* It's a directory, dummy */
382 }
383 if (!test_bit(n,sbi->symlink_bitmap)) {
384 unlock_kernel();
385 return -EINVAL; /* Nonexistent symlink? Shouldn't happen */
386 }
387
388 dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL;
389 autofs_hash_delete(ent);
390 clear_bit(n,sbi->symlink_bitmap);
391 kfree(sbi->symlink[n].data);
392 d_drop(dentry);
393
394 unlock_kernel();
395 return 0;
396}
397
398static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
399{
400 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
401 struct autofs_dirhash *dh = &sbi->dirhash;
402 struct autofs_dir_ent *ent;
403
404 lock_kernel();
405 if (!autofs_oz_mode(sbi)) {
406 unlock_kernel();
407 return -EACCES;
408 }
409
410 ent = autofs_hash_lookup(dh, &dentry->d_name);
411 if (!ent) {
412 unlock_kernel();
413 return -ENOENT;
414 }
415
416 if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
417 unlock_kernel();
418 return -ENOTDIR; /* Not a directory */
419 }
420
421 if (ent->dentry != dentry) {
422 printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
423 }
424
425 dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
426 autofs_hash_delete(ent);
427 drop_nlink(dir);
428 d_drop(dentry);
429 unlock_kernel();
430
431 return 0;
432}
433
434static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
435{
436 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
437 struct autofs_dirhash *dh = &sbi->dirhash;
438 struct autofs_dir_ent *ent;
439 struct inode *inode;
440 ino_t ino;
441
442 lock_kernel();
443 if (!autofs_oz_mode(sbi)) {
444 unlock_kernel();
445 return -EACCES;
446 }
447
448 ent = autofs_hash_lookup(dh, &dentry->d_name);
449 if (ent) {
450 unlock_kernel();
451 return -EEXIST;
452 }
453
454 if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
455 printk("autofs: Out of inode numbers -- what the heck did you do??\n");
456 unlock_kernel();
457 return -ENOSPC;
458 }
459 ino = sbi->next_dir_ino++;
460
461 ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
462 if (!ent) {
463 unlock_kernel();
464 return -ENOSPC;
465 }
466
467 ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
468 if (!ent->name) {
469 kfree(ent);
470 unlock_kernel();
471 return -ENOSPC;
472 }
473
474 ent->hash = dentry->d_name.hash;
475 memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
476 ent->ino = ino;
477 ent->dentry = dentry;
478 autofs_hash_insert(dh,ent);
479
480 inc_nlink(dir);
481
482 inode = autofs_iget(dir->i_sb, ino);
483 if (IS_ERR(inode)) {
484 drop_nlink(dir);
485 return PTR_ERR(inode);
486 }
487
488 d_instantiate(dentry, inode);
489 unlock_kernel();
490
491 return 0;
492}
493
494/* Get/set timeout ioctl() operation */
495static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
496 unsigned long __user *p)
497{
498 unsigned long ntimeout;
499
500 if (get_user(ntimeout, p) ||
501 put_user(sbi->exp_timeout / HZ, p))
502 return -EFAULT;
503
504 if (ntimeout > ULONG_MAX/HZ)
505 sbi->exp_timeout = 0;
506 else
507 sbi->exp_timeout = ntimeout * HZ;
508
509 return 0;
510}
511
512/* Return protocol version */
513static inline int autofs_get_protover(int __user *p)
514{
515 return put_user(AUTOFS_PROTO_VERSION, p);
516}
517
518/* Perform an expiry operation */
519static inline int autofs_expire_run(struct super_block *sb,
520 struct autofs_sb_info *sbi,
521 struct vfsmount *mnt,
522 struct autofs_packet_expire __user *pkt_p)
523{
524 struct autofs_dir_ent *ent;
525 struct autofs_packet_expire pkt;
526
527 memset(&pkt,0,sizeof pkt);
528
529 pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
530 pkt.hdr.type = autofs_ptype_expire;
531
532 if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt)))
533 return -EAGAIN;
534
535 pkt.len = ent->len;
536 memcpy(pkt.name, ent->name, pkt.len);
537 pkt.name[pkt.len] = '\0';
538
539 if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
540 return -EFAULT;
541
542 return 0;
543}
544
545/*
546 * ioctl()s on the root directory are the chief method for the daemon to
547 * generate kernel reactions
548 */
549static int autofs_root_ioctl(struct inode *inode, struct file *filp,
550 unsigned int cmd, unsigned long arg)
551{
552 struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
553 void __user *argp = (void __user *)arg;
554
555 DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,task_pgrp_nr(current)));
556
557 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
558 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
559 return -ENOTTY;
560
561 if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
562 return -EPERM;
563
564 switch(cmd) {
565 case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
566 return autofs_wait_release(sbi,(autofs_wqt_t)arg,0);
567 case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
568 return autofs_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
569 case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
570 autofs_catatonic_mode(sbi);
571 return 0;
572 case AUTOFS_IOC_PROTOVER: /* Get protocol version */
573 return autofs_get_protover(argp);
574 case AUTOFS_IOC_SETTIMEOUT:
575 return autofs_get_set_timeout(sbi, argp);
576 case AUTOFS_IOC_EXPIRE:
577 return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt,
578 argp);
579 default:
580 return -ENOSYS;
581 }
582}
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
deleted file mode 100644
index 7ce9cb2c9ce..00000000000
--- a/fs/autofs/symlink.c
+++ /dev/null
@@ -1,26 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/symlink.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include "autofs_i.h"
14
15/* Nothing to release.. */
16static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
17{
18 char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
19 nd_set_link(nd, s);
20 return NULL;
21}
22
23const struct inode_operations autofs_symlink_inode_operations = {
24 .readlink = generic_readlink,
25 .follow_link = autofs_follow_link
26};
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
deleted file mode 100644
index be46805972f..00000000000
--- a/fs/autofs/waitq.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/waitq.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/slab.h>
14#include <linux/time.h>
15#include <linux/signal.h>
16#include <linux/file.h>
17#include "autofs_i.h"
18
19/* We make this a static variable rather than a part of the superblock; it
20 is better if we don't reassign numbers easily even across filesystems */
21static autofs_wqt_t autofs_next_wait_queue = 1;
22
23/* These are the signals we allow interrupting a pending mount */
24#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
25
26void autofs_catatonic_mode(struct autofs_sb_info *sbi)
27{
28 struct autofs_wait_queue *wq, *nwq;
29
30 DPRINTK(("autofs: entering catatonic mode\n"));
31
32 sbi->catatonic = 1;
33 wq = sbi->queues;
34 sbi->queues = NULL; /* Erase all wait queues */
35 while ( wq ) {
36 nwq = wq->next;
37 wq->status = -ENOENT; /* Magic is gone - report failure */
38 kfree(wq->name);
39 wq->name = NULL;
40 wake_up(&wq->queue);
41 wq = nwq;
42 }
43 fput(sbi->pipe); /* Close the pipe */
44 sbi->pipe = NULL;
45 autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */
46}
47
48static int autofs_write(struct file *file, const void *addr, int bytes)
49{
50 unsigned long sigpipe, flags;
51 mm_segment_t fs;
52 const char *data = (const char *)addr;
53 ssize_t wr = 0;
54
55 /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
56
57 sigpipe = sigismember(&current->pending.signal, SIGPIPE);
58
59 /* Save pointer to user space and point back to kernel space */
60 fs = get_fs();
61 set_fs(KERNEL_DS);
62
63 while (bytes &&
64 (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
65 data += wr;
66 bytes -= wr;
67 }
68
69 set_fs(fs);
70
71 /* Keep the currently executing process from receiving a
72 SIGPIPE unless it was already supposed to get one */
73 if (wr == -EPIPE && !sigpipe) {
74 spin_lock_irqsave(&current->sighand->siglock, flags);
75 sigdelset(&current->pending.signal, SIGPIPE);
76 recalc_sigpending();
77 spin_unlock_irqrestore(&current->sighand->siglock, flags);
78 }
79
80 return (bytes > 0);
81}
82
83static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq)
84{
85 struct autofs_packet_missing pkt;
86
87 DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token));
88 autofs_say(wq->name,wq->len);
89
90 memset(&pkt,0,sizeof pkt); /* For security reasons */
91
92 pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
93 pkt.hdr.type = autofs_ptype_missing;
94 pkt.wait_queue_token = wq->wait_queue_token;
95 pkt.len = wq->len;
96 memcpy(pkt.name, wq->name, pkt.len);
97 pkt.name[pkt.len] = '\0';
98
99 if ( autofs_write(sbi->pipe,&pkt,sizeof(struct autofs_packet_missing)) )
100 autofs_catatonic_mode(sbi);
101}
102
103int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name)
104{
105 struct autofs_wait_queue *wq;
106 int status;
107
108	/* In catatonic mode, we don't wait for anybody */
109 if ( sbi->catatonic )
110 return -ENOENT;
111
112 /* We shouldn't be able to get here, but just in case */
113 if ( name->len > NAME_MAX )
114 return -ENOENT;
115
116 for ( wq = sbi->queues ; wq ; wq = wq->next ) {
117 if ( wq->hash == name->hash &&
118 wq->len == name->len &&
119 wq->name && !memcmp(wq->name,name->name,name->len) )
120 break;
121 }
122
123 if ( !wq ) {
124 /* Create a new wait queue */
125 wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
126 if ( !wq )
127 return -ENOMEM;
128
129 wq->name = kmalloc(name->len,GFP_KERNEL);
130 if ( !wq->name ) {
131 kfree(wq);
132 return -ENOMEM;
133 }
134 wq->wait_queue_token = autofs_next_wait_queue++;
135 init_waitqueue_head(&wq->queue);
136 wq->hash = name->hash;
137 wq->len = name->len;
138 wq->status = -EINTR; /* Status return if interrupted */
139 memcpy(wq->name, name->name, name->len);
140 wq->next = sbi->queues;
141 sbi->queues = wq;
142
143 /* autofs_notify_daemon() may block */
144 wq->wait_ctr = 2;
145 autofs_notify_daemon(sbi,wq);
146 } else
147 wq->wait_ctr++;
148
149 /* wq->name is NULL if and only if the lock is already released */
150
151 if ( sbi->catatonic ) {
152 /* We might have slept, so check again for catatonic mode */
153 wq->status = -ENOENT;
154 kfree(wq->name);
155 wq->name = NULL;
156 }
157
158 if ( wq->name ) {
159 /* Block all but "shutdown" signals while waiting */
160 sigset_t sigmask;
161
162 siginitsetinv(&sigmask, SHUTDOWN_SIGS);
163 sigprocmask(SIG_BLOCK, &sigmask, &sigmask);
164
165 interruptible_sleep_on(&wq->queue);
166
167 sigprocmask(SIG_SETMASK, &sigmask, NULL);
168 } else {
169 DPRINTK(("autofs_wait: skipped sleeping\n"));
170 }
171
172 status = wq->status;
173
174 if ( ! --wq->wait_ctr ) /* Are we the last process to need status? */
175 kfree(wq);
176
177 return status;
178}
179
180
181int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
182{
183 struct autofs_wait_queue *wq, **wql;
184
185 for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
186 if ( wq->wait_queue_token == wait_queue_token )
187 break;
188 }
189 if ( !wq )
190 return -EINVAL;
191
192 *wql = wq->next; /* Unlink from chain */
193 kfree(wq->name);
194 wq->name = NULL; /* Do not wait on this queue */
195
196 wq->status = status;
197
198 if ( ! --wq->wait_ctr ) /* Is anyone still waiting for this guy? */
199 kfree(wq);
200 else
201 wake_up(&wq->queue);
202
203 return 0;
204}
205
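The wait-queue code above implements one half of a simple token protocol: autofs_wait() parks the triggering process and autofs_notify_daemon() writes an autofs_packet_missing to the daemon's pipe, while autofs_wait_release() completes the wait when the daemon answers with AUTOFS_IOC_READY or AUTOFS_IOC_FAIL on the root directory. A hedged sketch of the user-space side (hypothetical daemon loop, not the real automount source; do_mount_for() is an assumed helper):

	struct autofs_packet_missing pkt;

	/* read missing-name packets from the kernel pipe and answer by token */
	while (read(pipefd, &pkt, sizeof(pkt)) == sizeof(pkt)) {
		int ok = do_mount_for(pkt.name);	/* assumed helper: perform the mount */
		ioctl(rootfd, ok ? AUTOFS_IOC_READY : AUTOFS_IOC_FAIL,
		      pkt.wait_queue_token);		/* wakes the matching autofs_wait() */
	}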
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ba4a38b9c22..eff9a419469 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -724,6 +724,7 @@ static const struct file_operations _dev_ioctl_fops = {
 	.unlocked_ioctl	 = autofs_dev_ioctl,
 	.compat_ioctl = autofs_dev_ioctl_compat,
 	.owner	 = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 9722e4bd895..c038727b405 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,16 +14,16 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static int autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *autofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, autofs4_fill_super);
 }
 
 static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
-	.get_sb		= autofs_get_sb,
+	.mount		= autofs_mount,
 	.kill_sb	= autofs4_kill_sb,
 };
 
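The .get_sb to .mount conversion shown here recurs throughout this series: the new method returns the root dentry (or an ERR_PTR) directly instead of filling in a struct vfsmount. A minimal sketch for a hypothetical nodev filesystem "foo" (foo_fill_super assumed):

	static struct dentry *foo_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
	{
		return mount_nodev(fs_type, flags, data, foo_fill_super);
	}

	static struct file_system_type foo_fs_type = {
		.owner   = THIS_MODULE,
		.name    = "foo",
		.mount   = foo_mount,
		.kill_sb = kill_anon_super,
	};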
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955da..ac87e49fa70 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 		inode->i_gid = sb->s_root->d_inode->i_gid;
 	}
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_ino = get_next_ino();
 
 	if (S_ISDIR(inf->mode)) {
 		inode->i_nlink = 2;
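Setting i_ino explicitly is needed because new_inode() no longer hands out inode numbers itself; pseudo filesystems now call get_next_ino(), which batches a global counter per CPU. The idiom, roughly:

	struct inode *inode = new_inode(sb);

	if (inode) {
		inode->i_ino = get_next_ino();	/* unique number for a pseudo-fs inode */
		/* ...initialize mode, times, ops... */
	}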
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index db4117ed780..d5c1401f003 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,7 +18,9 @@
 #include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
-#include <linux/smp_lock.h>
+#include <linux/compat.h>
+#include <linux/mutex.h>
+
 #include "autofs_i.h"
 
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
@@ -26,6 +28,9 @@ static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
 static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+#ifdef CONFIG_COMPAT
+static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+#endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -40,6 +45,9 @@ const struct file_operations autofs4_root_operations = {
 	.readdir	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 	.unlocked_ioctl	= autofs4_root_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= autofs4_root_compat_ioctl,
+#endif
 };
 
 const struct file_operations autofs4_dir_operations = {
@@ -198,8 +206,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 	}
 
 	/* Initialize expiry counter after successful mount */
-	if (ino)
-		ino->last_used = jiffies;
+	ino->last_used = jiffies;
 
 	spin_lock(&sbi->fs_lock);
 	ino->flags &= ~AUTOFS_INF_PENDING;
@@ -840,6 +847,26 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 }
 
 /* Get/set timeout ioctl() operation */
+#ifdef CONFIG_COMPAT
+static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
+					 compat_ulong_t __user *p)
+{
+	int rv;
+	unsigned long ntimeout;
+
+	if ((rv = get_user(ntimeout, p)) ||
+	    (rv = put_user(sbi->exp_timeout/HZ, p)))
+		return rv;
+
+	if (ntimeout > UINT_MAX/HZ)
+		sbi->exp_timeout = 0;
+	else
+		sbi->exp_timeout = ntimeout * HZ;
+
+	return 0;
+}
+#endif
+
 static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
 					 unsigned long __user *p)
 {
@@ -933,6 +960,10 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 		return autofs4_get_protosubver(sbi, p);
 	case AUTOFS_IOC_SETTIMEOUT:
 		return autofs4_get_set_timeout(sbi, p);
+#ifdef CONFIG_COMPAT
+	case AUTOFS_IOC_SETTIMEOUT32:
+		return autofs4_compat_get_set_timeout(sbi, p);
+#endif
 
 	case AUTOFS_IOC_ASKUMOUNT:
 		return autofs4_ask_umount(filp->f_path.mnt, p);
@@ -949,15 +980,36 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	}
 }
 
+static DEFINE_MUTEX(autofs4_ioctl_mutex);
+
 static long autofs4_root_ioctl(struct file *filp,
 			       unsigned int cmd, unsigned long arg)
 {
 	long ret;
 	struct inode *inode = filp->f_dentry->d_inode;
 
-	lock_kernel();
+	mutex_lock(&autofs4_ioctl_mutex);
 	ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-	unlock_kernel();
+	mutex_unlock(&autofs4_ioctl_mutex);
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long autofs4_root_compat_ioctl(struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret;
+
+	mutex_lock(&autofs4_ioctl_mutex);
+	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
+		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+	else
+		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
+			(unsigned long)compat_ptr(arg));
+	mutex_unlock(&autofs4_ioctl_mutex);
 
 	return ret;
 }
+#endif
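Two BKL-removal idioms appear in the hunk above: the lock_kernel() pair becomes a driver-local mutex, and the new compat entry point translates 32-bit pointer arguments with compat_ptr() while letting plain token values through untouched. The mutex half of the pattern, as a generic sketch (hypothetical foo_* names):

	static DEFINE_MUTEX(foo_ioctl_mutex);	/* serializes only this driver's ioctls */

	static long foo_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
	{
		long ret;

		mutex_lock(&foo_ioctl_mutex);	/* was: lock_kernel() */
		ret = foo_ioctl_unlocked(filp, cmd, arg);
		mutex_unlock(&foo_ioctl_mutex);	/* was: unlock_kernel() */
		return ret;
	}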
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 52e59bf4aa5..f024d8aadde 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -55,12 +55,6 @@ static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
 	return POLLERR;
 }
 
-static int bad_file_ioctl (struct inode *inode, struct file *filp,
-			unsigned int cmd, unsigned long arg)
-{
-	return -EIO;
-}
-
 static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
 			unsigned long arg)
 {
@@ -159,7 +153,6 @@ static const struct file_operations bad_file_ops =
 	.aio_write	= bad_file_aio_write,
 	.readdir	= bad_file_readdir,
 	.poll		= bad_file_poll,
-	.ioctl		= bad_file_ioctl,
 	.unlocked_ioctl	= bad_file_unlocked_ioctl,
 	.compat_ioctl	= bad_file_compat_ioctl,
 	.mmap		= bad_file_mmap,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 34ddda888e6..aa4e7c7ae3c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -436,7 +436,7 @@ befs_init_inodecache(void)
 					init_once);
 	if (befs_inode_cachep == NULL) {
 		printk(KERN_ERR "befs_init_inodecache: "
-		       "Couldn't initalize inode slabcache\n");
+		       "Couldn't initialize inode slabcache\n");
 		return -ENOMEM;
 	}
 
@@ -913,18 +913,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int
-befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
-	    void *data, struct vfsmount *mnt)
+static struct dentry *
+befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
+	    void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
 }
 
 static struct file_system_type befs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "befs",
-	.get_sb		= befs_get_sb,
+	.mount		= befs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 7109e451abf..f7f87e233dd 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -17,7 +17,6 @@ struct bfs_sb_info {
 	unsigned long si_lf_eblk;
 	unsigned long si_lasti;
 	unsigned long *si_imap;
-	struct buffer_head *si_sbh;	/* buffer header w/superblock */
 	struct mutex bfs_lock;
 };
 
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b77..685ecff3ab3 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
 	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(new, inode);
 	mutex_unlock(&info->bfs_lock);
 	return 0;
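ihold() is now the sanctioned way to take an extra reference on an inode that is already pinned, replacing open-coded atomic_inc() on i_count. A trimmed-down sketch of the usual link() sequence (hypothetical foo_link, following the hunk above):

	static int foo_link(struct dentry *old_dentry, struct inode *dir,
			    struct dentry *dentry)
	{
		struct inode *inode = old_dentry->d_inode;

		inc_nlink(inode);
		ihold(inode);			/* was: atomic_inc(&inode->i_count) */
		d_instantiate(dentry, inode);	/* the new dentry owns that reference */
		return 0;
	}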
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 88b9a3ff44e..eb67edd0f8e 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -70,7 +70,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 	struct super_block *sb = inode->i_sb;
 	struct bfs_sb_info *info = BFS_SB(sb);
 	struct bfs_inode_info *bi = BFS_I(inode);
-	struct buffer_head *sbh = info->si_sbh;
 
 	phys = bi->i_sblock + block;
 	if (!create) {
@@ -112,7 +111,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 		info->si_freeb -= phys - bi->i_eblock;
 		info->si_lf_eblk = bi->i_eblock = phys;
 		mark_inode_dirty(inode);
-		mark_buffer_dirty(sbh);
 		err = 0;
 		goto out;
 	}
@@ -147,7 +145,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 	 */
 	info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks;
 	mark_inode_dirty(inode);
-	mark_buffer_dirty(sbh);
 	map_bh(bh_result, sb, phys);
 out:
 	mutex_unlock(&info->bfs_lock);
@@ -168,9 +165,17 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags,
-				pagep, fsdata, bfs_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				bfs_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
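With the *_newtrunc variants gone, block_write_begin() callers are expected to clean up after a failed write themselves, trimming any blocks instantiated beyond the old end of file. A generic sketch of the pattern (foo_get_block assumed):

	static int foo_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		int ret;

		ret = block_write_begin(mapping, pos, len, flags, pagep,
					foo_get_block);
		if (unlikely(ret)) {
			loff_t isize = mapping->host->i_size;

			/* drop blocks the failed write allocated past old EOF */
			if (pos + len > isize)
				vmtruncate(mapping->host, isize);
		}
		return ret;
	}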
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index f22a7d3dc36..76db6d7d49b 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -12,7 +12,6 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
@@ -31,7 +30,6 @@ MODULE_LICENSE("GPL");
 #define dprintf(x...)
 #endif
 
-static void bfs_write_super(struct super_block *s);
 void dump_imap(const char *prefix, struct super_block *s);
 
 struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -99,6 +97,24 @@ error:
 	return ERR_PTR(-EIO);
 }
 
+static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p)
+{
+	if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) {
+		printf("Bad inode number %s:%08x\n", sb->s_id, ino);
+		return ERR_PTR(-EIO);
+	}
+
+	ino -= BFS_ROOT_INO;
+
+	*p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK);
+	if (!*p) {
+		printf("Unable to read inode %s:%08x\n", sb->s_id, ino);
+		return ERR_PTR(-EIO);
+	}
+
+	return (struct bfs_inode *)(*p)->b_data + ino % BFS_INODES_PER_BLOCK;
+}
+
 static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
@@ -106,28 +122,15 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	unsigned long i_sblock;
 	struct bfs_inode *di;
 	struct buffer_head *bh;
-	int block, off;
 	int err = 0;
 
 	dprintf("ino=%08x\n", ino);
 
-	if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
-		printf("Bad inode number %s:%08x\n", inode->i_sb->s_id, ino);
-		return -EIO;
-	}
+	di = find_inode(inode->i_sb, ino, &bh);
+	if (IS_ERR(di))
+		return PTR_ERR(di);
 
 	mutex_lock(&info->bfs_lock);
-	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
-	bh = sb_bread(inode->i_sb, block);
-	if (!bh) {
-		printf("Unable to read inode %s:%08x\n",
-					inode->i_sb->s_id, ino);
-		mutex_unlock(&info->bfs_lock);
-		return -EIO;
-	}
-
-	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
-	di = (struct bfs_inode *)bh->b_data + off;
 
 	if (ino == BFS_ROOT_INO)
 		di->i_vtype = cpu_to_le32(BFS_VDIR);
@@ -158,12 +161,11 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return err;
 }
 
-static void bfs_delete_inode(struct inode *inode)
+static void bfs_evict_inode(struct inode *inode)
 {
 	unsigned long ino = inode->i_ino;
 	struct bfs_inode *di;
 	struct buffer_head *bh;
-	int block, off;
 	struct super_block *s = inode->i_sb;
 	struct bfs_sb_info *info = BFS_SB(s);
 	struct bfs_inode_info *bi = BFS_I(inode);
@@ -171,28 +173,19 @@ static void bfs_delete_inode(struct inode *inode)
 	dprintf("ino=%08lx\n", ino);
 
 	truncate_inode_pages(&inode->i_data, 0);
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 
-	if ((ino < BFS_ROOT_INO) || (ino > info->si_lasti)) {
-		printf("invalid ino=%08lx\n", ino);
+	if (inode->i_nlink)
 		return;
-	}
-
-	inode->i_size = 0;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-	mutex_lock(&info->bfs_lock);
-	mark_inode_dirty(inode);
 
-	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
-	bh = sb_bread(s, block);
-	if (!bh) {
-		printf("Unable to read inode %s:%08lx\n",
-					inode->i_sb->s_id, ino);
-		mutex_unlock(&info->bfs_lock);
+	di = find_inode(s, inode->i_ino, &bh);
+	if (IS_ERR(di))
 		return;
-	}
-	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
-	di = (struct bfs_inode *)bh->b_data + off;
-	memset((void *)di, 0, sizeof(struct bfs_inode));
+
+	mutex_lock(&info->bfs_lock);
+	/* clear on-disk inode */
+	memset(di, 0, sizeof(struct bfs_inode));
 	mark_buffer_dirty(bh);
 	brelse(bh);
 
@@ -209,32 +202,9 @@ static void bfs_delete_inode(struct inode *inode)
 	 * "last block of the last file" even if there is no
 	 * real file there, saves us 1 gap.
 	 */
-	if (info->si_lf_eblk == bi->i_eblock) {
+	if (info->si_lf_eblk == bi->i_eblock)
 		info->si_lf_eblk = bi->i_sblock - 1;
-		mark_buffer_dirty(info->si_sbh);
-	}
-	mutex_unlock(&info->bfs_lock);
-	clear_inode(inode);
-}
-
-static int bfs_sync_fs(struct super_block *sb, int wait)
-{
-	struct bfs_sb_info *info = BFS_SB(sb);
-
-	mutex_lock(&info->bfs_lock);
-	mark_buffer_dirty(info->si_sbh);
-	sb->s_dirt = 0;
 	mutex_unlock(&info->bfs_lock);
-
-	return 0;
-}
-
-static void bfs_write_super(struct super_block *sb)
-{
-	if (!(sb->s_flags & MS_RDONLY))
-		bfs_sync_fs(sb, 1);
-	else
-		sb->s_dirt = 0;
 }
 
 static void bfs_put_super(struct super_block *s)
@@ -244,18 +214,10 @@ static void bfs_put_super(struct super_block *s)
 	if (!info)
 		return;
 
-	lock_kernel();
-
-	if (s->s_dirt)
-		bfs_write_super(s);
-
-	brelse(info->si_sbh);
 	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
 	kfree(info);
 	s->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -319,10 +281,8 @@ static const struct super_operations bfs_sops = {
 	.alloc_inode	= bfs_alloc_inode,
 	.destroy_inode	= bfs_destroy_inode,
 	.write_inode	= bfs_write_inode,
-	.delete_inode	= bfs_delete_inode,
+	.evict_inode	= bfs_evict_inode,
 	.put_super	= bfs_put_super,
-	.write_super	= bfs_write_super,
-	.sync_fs	= bfs_sync_fs,
 	.statfs		= bfs_statfs,
 };
 
@@ -349,7 +309,7 @@ void dump_imap(const char *prefix, struct super_block *s)
 
 static int bfs_fill_super(struct super_block *s, void *data, int silent)
 {
-	struct buffer_head *bh;
+	struct buffer_head *bh, *sbh;
 	struct bfs_super_block *bfs_sb;
 	struct inode *inode;
 	unsigned i, imap_len;
@@ -365,10 +325,10 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 	sb_set_blocksize(s, BFS_BSIZE);
 
-	info->si_sbh = sb_bread(s, 0);
-	if (!info->si_sbh)
+	sbh = sb_bread(s, 0);
+	if (!sbh)
 		goto out;
-	bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data;
+	bfs_sb = (struct bfs_super_block *)sbh->b_data;
 	if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
 		if (!silent)
 			printf("No BFS filesystem on %s (magic=%08x)\n",
@@ -472,10 +432,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 		info->si_lf_eblk = eblock;
 	}
 	brelse(bh);
-	if (!(s->s_flags & MS_RDONLY)) {
-		mark_buffer_dirty(info->si_sbh);
-		s->s_dirt = 1;
-	}
+	brelse(sbh);
 	dump_imap("read_super", s);
 	return 0;
 
@@ -485,7 +442,7 @@ out3:
 out2:
 	kfree(info->si_imap);
 out1:
-	brelse(info->si_sbh);
+	brelse(sbh);
 out:
 	mutex_destroy(&info->bfs_lock);
 	kfree(info);
@@ -493,16 +450,16 @@ out:
 	return ret;
 }
 
-static int bfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *bfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
 }
 
 static struct file_system_type bfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bfs",
-	.get_sb		= bfs_get_sb,
+	.mount		= bfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
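The old ->delete_inode/->clear_inode pair collapses into a single ->evict_inode method that runs for every inode on its way out, whether or not it is still linked, and the method owns the teardown ordering. Schematically (the on-disk free step is filesystem specific):

	static void foo_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);	/* drop page cache */
		invalidate_inode_buffers(inode);
		end_writeback(inode);				/* marks the inode clear */

		if (inode->i_nlink)
			return;		/* still linked: nothing to free on disk */

		/* ...free the on-disk inode and its blocks here... */
	}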
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f96eff04e11..a6395bdb26a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
 		if (!dump_write(file, dump_start, dump_size))
 			goto end_coredump;
 	}
-/* Finally dump the task struct.  Not be used by gdb, but could be useful */
-	set_fs(KERNEL_DS);
-	if (!dump_write(file, current, sizeof(*current)))
-		goto end_coredump;
 end_coredump:
 	set_fs(fs);
 	return has_dumped;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 535e763ab1a..6884e198e0c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -800,7 +800,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * default mmap base, as well as whatever program they
 			 * might try to exec.  This is because the brk will
 			 * follow the loader, and is not movable.  */
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86) || defined(CONFIG_ARM)
 			load_bias = 0;
 #else
 			load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index c4e83537ead..1befe2ec818 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -108,7 +108,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	Node *fmt;
 	struct file * interp_file = NULL;
 	char iname[BINPRM_BUF_SIZE];
-	char *iname_addr = iname;
+	const char *iname_addr = iname;
 	int retval;
 	int fd_binary = -1;
 
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	struct inode * inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime =
 			current_fs_time(inode->i_sb);
@@ -502,8 +503,9 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	return inode;
 }
 
-static void bm_clear_inode(struct inode *inode)
+static void bm_evict_inode(struct inode *inode)
 {
+	end_writeback(inode);
 	kfree(inode->i_private);
 }
 
@@ -575,6 +577,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 static const struct file_operations bm_entry_operations = {
 	.read		= bm_entry_read,
 	.write		= bm_entry_write,
+	.llseek		= default_llseek,
 };
 
 /* /register */
@@ -642,6 +645,7 @@ out:
 
 static const struct file_operations bm_register_operations = {
 	.write		= bm_register_write,
+	.llseek		= noop_llseek,
 };
 
 /* /status */
@@ -679,13 +683,14 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 static const struct file_operations bm_status_operations = {
 	.read		= bm_status_read,
 	.write		= bm_status_write,
+	.llseek		= default_llseek,
 };
 
 /* Superblock handling */
 
 static const struct super_operations s_ops = {
 	.statfs		= simple_statfs,
-	.clear_inode	= bm_clear_inode,
+	.evict_inode	= bm_evict_inode,
 };
 
 static int bm_fill_super(struct super_block * sb, void * data, int silent)
@@ -701,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 	return err;
 }
 
-static int bm_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *bm_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
+	return mount_single(fs_type, flags, data, bm_fill_super);
 }
 
 static struct linux_binfmt misc_format = {
@@ -715,7 +720,7 @@ static struct linux_binfmt misc_format = {
 static struct file_system_type bm_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "binfmt_misc",
-	.get_sb		= bm_get_sb,
+	.mount		= bm_mount,
 	.kill_sb	= kill_litter_super,
 };
 
@@ -723,7 +728,7 @@ static int __init init_misc_binfmt(void)
 {
 	int err = register_filesystem(&bm_fs_type);
 	if (!err) {
-		err = register_binfmt(&misc_format);
+		err = insert_binfmt(&misc_format);
 		if (err)
 			unregister_filesystem(&bm_fs_type);
 	}
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index aca9d55afb2..396a9884591 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -16,7 +16,8 @@
 
 static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 {
-	char *cp, *i_name, *i_arg;
+	const char *i_arg, *i_name;
+	char *cp;
 	struct file *file;
 	char interp[BINPRM_BUF_SIZE];
 	int retval;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 612a5c38d3c..4d0ff5ee27b 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -413,10 +413,10 @@ int bio_integrity_prep(struct bio *bio)
 
 	/* Allocate kernel buffer for protection data */
 	len = sectors * blk_integrity_tuple_size(bi);
-	buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+	buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
 	if (unlikely(buf == NULL)) {
 		printk(KERN_ERR "could not allocate integrity buffer\n");
-		return -EIO;
+		return -ENOMEM;
 	}
 
 	end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
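This hunk also drops __GFP_NOFAIL, which is being retired: callers should allocate normally, handle the failure, and report it with the accurate errno. Reduced to its core, the shape of the fix is:

	buf = kmalloc(len, GFP_NOIO);	/* may fail now: no __GFP_NOFAIL crutch */
	if (!buf)
		return -ENOMEM;		/* allocation failure is -ENOMEM, not -EIO */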
diff --git a/fs/bio.c b/fs/bio.c
index e7bf6ca64dc..8abb2dfb2e7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -843,7 +843,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	if (!bio)
 		goto out_bmd;
 
-	bio->bi_rw |= (!write_to_vm << BIO_RW);
+	if (!write_to_vm)
+		bio->bi_rw |= REQ_WRITE;
 
 	ret = 0;
 
@@ -1024,7 +1025,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	 * set data direction, and check if mapped pages need bouncing
 	 */
 	if (!write_to_vm)
-		bio->bi_rw |= (1 << BIO_RW);
+		bio->bi_rw |= REQ_WRITE;
 
 	bio->bi_bdev = bdev;
 	bio->bi_flags |= (1 << BIO_USER_MAPPED);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99d6af81174..06e8ff12b97 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
 
 EXPORT_SYMBOL(I_BDEV);
 
+/*
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
+ */
+static void bdev_inode_switch_bdi(struct inode *inode,
+			struct backing_dev_info *dst)
+{
+	spin_lock(&inode_lock);
+	inode->i_data.backing_dev_info = dst;
+	if (inode->i_state & I_DIRTY)
+		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+	spin_unlock(&inode_lock);
+}
+
 static sector_t max_block(struct block_device *bdev)
 {
 	sector_t retval = ~((sector_t)0);
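A hedged usage sketch for the helper added above (the call site is an assumption, modeled on the device open path): once a block device is opened, writeback for its inode should follow the disk queue's backing_dev_info rather than the default one.

	/* assumed call site, e.g. in __blkdev_get() */
	struct backing_dev_info *bdi = blk_get_backing_dev_info(bdev);

	if (bdi == NULL)
		bdi = &default_backing_dev_info;
	bdev_inode_switch_bdi(bdev->bd_inode, bdi);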
@@ -172,9 +187,8 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
-	return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
-				I_BDEV(inode), iov, offset, nr_segs,
-				blkdev_get_blocks, NULL);
+	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
+				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
@@ -309,9 +323,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin_newtrunc(file, mapping, pos, len, flags,
-				pagep, fsdata, blkdev_get_block);
+	return block_write_begin(mapping, pos, len, flags, pagep,
+				 blkdev_get_block);
 }
 
 static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -372,7 +385,7 @@ int blkdev_fsync(struct file *filp, int datasync)
 	 */
 	mutex_unlock(&bd_inode->i_mutex);
 
-	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
@@ -428,10 +441,13 @@ static inline void __bd_forget(struct inode *inode)
428 inode->i_mapping = &inode->i_data; 441 inode->i_mapping = &inode->i_data;
429} 442}
430 443
431static void bdev_clear_inode(struct inode *inode) 444static void bdev_evict_inode(struct inode *inode)
432{ 445{
433 struct block_device *bdev = &BDEV_I(inode)->bdev; 446 struct block_device *bdev = &BDEV_I(inode)->bdev;
434 struct list_head *p; 447 struct list_head *p;
448 truncate_inode_pages(&inode->i_data, 0);
449 invalidate_inode_buffers(inode); /* is it needed here? */
450 end_writeback(inode);
435 spin_lock(&bdev_lock); 451 spin_lock(&bdev_lock);
436 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { 452 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
437 __bd_forget(list_entry(p, struct inode, i_devices)); 453 __bd_forget(list_entry(p, struct inode, i_devices));
@@ -445,18 +461,18 @@ static const struct super_operations bdev_sops = {
445 .alloc_inode = bdev_alloc_inode, 461 .alloc_inode = bdev_alloc_inode,
446 .destroy_inode = bdev_destroy_inode, 462 .destroy_inode = bdev_destroy_inode,
447 .drop_inode = generic_delete_inode, 463 .drop_inode = generic_delete_inode,
448 .clear_inode = bdev_clear_inode, 464 .evict_inode = bdev_evict_inode,
449}; 465};
450 466
451static int bd_get_sb(struct file_system_type *fs_type, 467static struct dentry *bd_mount(struct file_system_type *fs_type,
452 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 468 int flags, const char *dev_name, void *data)
453{ 469{
454 return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); 470 return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
455} 471}
456 472
457static struct file_system_type bd_type = { 473static struct file_system_type bd_type = {
458 .name = "bdev", 474 .name = "bdev",
459 .get_sb = bd_get_sb, 475 .mount = bd_mount,
460 .kill_sb = kill_anon_super, 476 .kill_sb = kill_anon_super,
461}; 477};
462 478
@@ -549,7 +565,7 @@ EXPORT_SYMBOL(bdget);
549 */ 565 */
550struct block_device *bdgrab(struct block_device *bdev) 566struct block_device *bdgrab(struct block_device *bdev)
551{ 567{
552 atomic_inc(&bdev->bd_inode->i_count); 568 ihold(bdev->bd_inode);
553 return bdev; 569 return bdev;
554} 570}
555 571
@@ -579,7 +595,7 @@ static struct block_device *bd_acquire(struct inode *inode)
579 spin_lock(&bdev_lock); 595 spin_lock(&bdev_lock);
580 bdev = inode->i_bdev; 596 bdev = inode->i_bdev;
581 if (bdev) { 597 if (bdev) {
582 atomic_inc(&bdev->bd_inode->i_count); 598 ihold(bdev->bd_inode);
583 spin_unlock(&bdev_lock); 599 spin_unlock(&bdev_lock);
584 return bdev; 600 return bdev;
585 } 601 }
@@ -590,12 +606,12 @@ static struct block_device *bd_acquire(struct inode *inode)
590 spin_lock(&bdev_lock); 606 spin_lock(&bdev_lock);
591 if (!inode->i_bdev) { 607 if (!inode->i_bdev) {
592 /* 608 /*
593 * We take an additional bd_inode->i_count for inode, 609 * We take an additional reference to bd_inode,
594 * and it's released in clear_inode() of inode. 610 * and it's released in clear_inode() of inode.
595 * So, we can access it via ->i_mapping always 611 * So, we can access it via ->i_mapping always
596 * without igrab(). 612 * without igrab().
597 */ 613 */
598 atomic_inc(&bdev->bd_inode->i_count); 614 ihold(bdev->bd_inode);
599 inode->i_bdev = bdev; 615 inode->i_bdev = bdev;
600 inode->i_mapping = bdev->bd_inode->i_mapping; 616 inode->i_mapping = bdev->bd_inode->i_mapping;
601 list_add(&inode->i_devices, &bdev->bd_inodes); 617 list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -681,8 +697,8 @@ retry:
681 if (!bd_may_claim(bdev, whole, holder)) 697 if (!bd_may_claim(bdev, whole, holder))
682 return -EBUSY; 698 return -EBUSY;
683 699
684 /* if someone else is claiming, wait for it to finish */ 700 /* if claiming is already in progress, wait for it to finish */
685 if (whole->bd_claiming && whole->bd_claiming != holder) { 701 if (whole->bd_claiming) {
686 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); 702 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
687 DEFINE_WAIT(wait); 703 DEFINE_WAIT(wait);
688 704
@@ -1339,19 +1355,20 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1339 /* 1355 /*
1340 * hooks: /n/, see "layering violations". 1356 * hooks: /n/, see "layering violations".
1341 */ 1357 */
1342 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1358 if (!for_part) {
1343 if (ret != 0) { 1359 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1344 bdput(bdev); 1360 if (ret != 0) {
1345 return ret; 1361 bdput(bdev);
1362 return ret;
1363 }
1346 } 1364 }
1347 1365
1348 lock_kernel();
1349 restart: 1366 restart:
1350 1367
1351 ret = -ENXIO; 1368 ret = -ENXIO;
1352 disk = get_gendisk(bdev->bd_dev, &partno); 1369 disk = get_gendisk(bdev->bd_dev, &partno);
1353 if (!disk) 1370 if (!disk)
1354 goto out_unlock_kernel; 1371 goto out;
1355 1372
1356 mutex_lock_nested(&bdev->bd_mutex, for_part); 1373 mutex_lock_nested(&bdev->bd_mutex, for_part);
1357 if (!bdev->bd_openers) { 1374 if (!bdev->bd_openers) {
@@ -1388,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1388 bdi = blk_get_backing_dev_info(bdev); 1405 bdi = blk_get_backing_dev_info(bdev);
1389 if (bdi == NULL) 1406 if (bdi == NULL)
1390 bdi = &default_backing_dev_info; 1407 bdi = &default_backing_dev_info;
1391 bdev->bd_inode->i_data.backing_dev_info = bdi; 1408 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1392 } 1409 }
1393 if (bdev->bd_invalidated) 1410 if (bdev->bd_invalidated)
1394 rescan_partitions(disk, bdev); 1411 rescan_partitions(disk, bdev);
@@ -1403,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1403 if (ret) 1420 if (ret)
1404 goto out_clear; 1421 goto out_clear;
1405 bdev->bd_contains = whole; 1422 bdev->bd_contains = whole;
1406 bdev->bd_inode->i_data.backing_dev_info = 1423 bdev_inode_switch_bdi(bdev->bd_inode,
1407 whole->bd_inode->i_data.backing_dev_info; 1424 whole->bd_inode->i_data.backing_dev_info);
1408 bdev->bd_part = disk_get_part(disk, partno); 1425 bdev->bd_part = disk_get_part(disk, partno);
1409 if (!(disk->flags & GENHD_FL_UP) || 1426 if (!(disk->flags & GENHD_FL_UP) ||
1410 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1427 !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1431,22 +1448,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1431 if (for_part) 1448 if (for_part)
1432 bdev->bd_part_count++; 1449 bdev->bd_part_count++;
1433 mutex_unlock(&bdev->bd_mutex); 1450 mutex_unlock(&bdev->bd_mutex);
1434 unlock_kernel();
1435 return 0; 1451 return 0;
1436 1452
1437 out_clear: 1453 out_clear:
1438 disk_put_part(bdev->bd_part); 1454 disk_put_part(bdev->bd_part);
1439 bdev->bd_disk = NULL; 1455 bdev->bd_disk = NULL;
1440 bdev->bd_part = NULL; 1456 bdev->bd_part = NULL;
1441 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1457 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1442 if (bdev != bdev->bd_contains) 1458 if (bdev != bdev->bd_contains)
1443 __blkdev_put(bdev->bd_contains, mode, 1); 1459 __blkdev_put(bdev->bd_contains, mode, 1);
1444 bdev->bd_contains = NULL; 1460 bdev->bd_contains = NULL;
1445 out_unlock_bdev: 1461 out_unlock_bdev:
1446 mutex_unlock(&bdev->bd_mutex); 1462 mutex_unlock(&bdev->bd_mutex);
1447 out_unlock_kernel: 1463 out:
1448 unlock_kernel();
1449
1450 if (disk) 1464 if (disk)
1451 module_put(disk->fops->owner); 1465 module_put(disk->fops->owner);
1452 put_disk(disk); 1466 put_disk(disk);
@@ -1515,7 +1529,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1515 struct block_device *victim = NULL; 1529 struct block_device *victim = NULL;
1516 1530
1517 mutex_lock_nested(&bdev->bd_mutex, for_part); 1531 mutex_lock_nested(&bdev->bd_mutex, for_part);
1518 lock_kernel();
1519 if (for_part) 1532 if (for_part)
1520 bdev->bd_part_count--; 1533 bdev->bd_part_count--;
1521 1534
@@ -1535,12 +1548,12 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1535 disk_put_part(bdev->bd_part); 1548 disk_put_part(bdev->bd_part);
1536 bdev->bd_part = NULL; 1549 bdev->bd_part = NULL;
1537 bdev->bd_disk = NULL; 1550 bdev->bd_disk = NULL;
1538 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1551 bdev_inode_switch_bdi(bdev->bd_inode,
1552 &default_backing_dev_info);
1539 if (bdev != bdev->bd_contains) 1553 if (bdev != bdev->bd_contains)
1540 victim = bdev->bd_contains; 1554 victim = bdev->bd_contains;
1541 bdev->bd_contains = NULL; 1555 bdev->bd_contains = NULL;
1542 } 1556 }
1543 unlock_kernel();
1544 mutex_unlock(&bdev->bd_mutex); 1557 mutex_unlock(&bdev->bd_mutex);
1545 bdput(bdev); 1558 bdput(bdev);
1546 if (victim) 1559 if (victim)
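
The new bdev_inode_switch_bdi() exists because simply reassigning ->backing_dev_info can strand a dirty inode on the old bdi's writeback list, where the new bdi's flusher will never find it; the helper requeues the inode under inode_lock. A rough user-space model of the fixed behaviour, with all names invented for the sketch:

#include <stdio.h>

/* Toy model: a bdi is just a count of dirty inodes queued on it. */
struct bdi_m { const char *name; int nr_dirty; };
struct inode_m { int dirty; struct bdi_m *bdi; };

/* What bdev_inode_switch_bdi() does: repoint the inode AND requeue it
 * onto the destination's dirty list (list_move() under inode_lock in
 * the patch).  Leaving out the requeue is the bug the helper fixes. */
static void switch_bdi(struct inode_m *in, struct bdi_m *dst)
{
	if (in->dirty) {
		in->bdi->nr_dirty--;
		dst->nr_dirty++;
	}
	in->bdi = dst;
}

int main(void)
{
	struct bdi_m def = { "default", 1 }, disk = { "disk", 0 };
	struct inode_m in = { 1, &def };

	switch_bdi(&in, &disk);
	printf("%s=%d %s=%d\n", def.name, def.nr_dirty,
	       disk.name, disk.nr_dirty);	/* default=0 disk=1 */
	return 0;
}
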
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 396039b3a8a..7845d1f7d1d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -163,7 +163,6 @@ fail:
  */
 static void end_compressed_bio_read(struct bio *bio, int err)
 {
-	struct extent_io_tree *tree;
 	struct compressed_bio *cb = bio->bi_private;
 	struct inode *inode;
 	struct page *page;
@@ -187,7 +186,6 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* ok, we're the last bio for this extent, let's start
 	 * the decompression.
 	 */
-	tree = &BTRFS_I(inode)->io_tree;
 	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
 					cb->start,
 					cb->orig_bio->bi_io_vec,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3df14ce2cc..9ac17159925 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -200,7 +200,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
 {
 	struct extent_buffer *cow;
-	u32 nritems;
 	int ret = 0;
 	int level;
 	struct btrfs_disk_key disk_key;
@@ -210,7 +209,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
-	nritems = btrfs_header_nritems(buf);
 	if (level == 0)
 		btrfs_item_key(buf, &disk_key, 0);
 	else
@@ -1008,7 +1006,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	int wret;
 	int pslot;
 	int orig_slot = path->slots[level];
-	int err_on_enospc = 0;
 	u64 orig_ptr;
 
 	if (level == 0)
@@ -1071,8 +1068,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	if (btrfs_header_nritems(mid) < 2)
-		err_on_enospc = 1;
+	btrfs_header_nritems(mid);
 
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
@@ -1103,8 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		wret = push_node_left(trans, root, left, mid, 1);
 		if (wret < 0)
 			ret = wret;
-		if (btrfs_header_nritems(mid) < 2)
-			err_on_enospc = 1;
+		btrfs_header_nritems(mid);
 	}
 
 	/*
@@ -1224,14 +1219,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	int wret;
 	int pslot;
 	int orig_slot = path->slots[level];
-	u64 orig_ptr;
 
 	if (level == 0)
 		return 1;
 
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
-	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
 	if (level < BTRFS_MAX_LEVEL - 1)
 		parent = path->nodes[level + 1];
@@ -1577,13 +1570,33 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	blocksize = btrfs_level_size(root, level - 1);
 
 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-	if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-		/*
-		 * we found an up to date block without sleeping, return
-		 * right away
-		 */
-		*eb_ret = tmp;
-		return 0;
+	if (tmp) {
+		if (btrfs_buffer_uptodate(tmp, 0)) {
+			if (btrfs_buffer_uptodate(tmp, gen)) {
+				/*
+				 * we found an up to date block without
+				 * sleeping, return right away
+				 */
+				*eb_ret = tmp;
+				return 0;
+			}
+			/* the pages were up to date, but we failed
+			 * the generation number check.  Do a full
+			 * read for the generation number that is correct.
+			 * We must do this without dropping locks so
+			 * we can trust our generation number
+			 */
+			free_extent_buffer(tmp);
+			tmp = read_tree_block(root, blocknr, blocksize, gen);
+			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+				*eb_ret = tmp;
+				return 0;
+			}
+			free_extent_buffer(tmp);
+			btrfs_release_path(NULL, p);
+			return -EIO;
+		}
 	}
 
 	/*
@@ -1596,8 +1609,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	btrfs_unlock_up_safe(p, level + 1);
 	btrfs_set_path_blocking(p);
 
-	if (tmp)
-		free_extent_buffer(tmp);
+	free_extent_buffer(tmp);
 	if (p->reada)
 		reada_for_search(root, p, level, slot, key->objectid);
 
@@ -2548,7 +2560,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
-	int slot;
 	int i;
 	int push_space = 0;
 	int push_items = 0;
@@ -2560,8 +2571,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	u32 this_item_size;
 	u32 old_left_item_size;
 
-	slot = path->slots[1];
-
 	if (empty)
 		nr = min(right_nritems, max_slot);
 	else
@@ -3330,7 +3339,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	int slot;
-	int slot_orig;
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
 	u32 nritems;
@@ -3340,7 +3348,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	unsigned int size_diff;
 	int i;
 
-	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 
@@ -3445,7 +3452,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	int slot;
-	int slot_orig;
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
 	u32 nritems;
@@ -3454,7 +3460,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 	unsigned int old_size;
 	int i;
 
-	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
 
 	nritems = btrfs_header_nritems(leaf);
@@ -3787,7 +3792,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 			    struct btrfs_key *cpu_key, u32 *data_size,
 			    int nr)
 {
-	struct extent_buffer *leaf;
 	int ret = 0;
 	int slot;
 	int i;
@@ -3804,7 +3808,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		goto out;
 
-	leaf = path->nodes[0];
 	slot = path->slots[0];
 	BUG_ON(slot < 0);
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 29c20092847..8db9234f6b4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -99,6 +99,9 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
 
+/* For storing free space cache */
+#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
 
@@ -265,6 +268,22 @@ struct btrfs_chunk {
 	/* additional stripes go here */
 } __attribute__ ((__packed__));
 
+#define BTRFS_FREE_SPACE_EXTENT	1
+#define BTRFS_FREE_SPACE_BITMAP	2
+
+struct btrfs_free_space_entry {
+	__le64 offset;
+	__le64 bytes;
+	u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_free_space_header {
+	struct btrfs_disk_key location;
+	__le64 generation;
+	__le64 num_entries;
+	__le64 num_bitmaps;
+} __attribute__ ((__packed__));
+
 static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 {
 	BUG_ON(num_stripes == 0);
@@ -365,8 +384,10 @@ struct btrfs_super_block {
 
 	char label[BTRFS_LABEL_SIZE];
 
+	__le64 cache_generation;
+
 	/* future expansion */
-	__le64 reserved[32];
+	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
@@ -375,13 +396,15 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
-#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
+#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
 
 #define BTRFS_FEATURE_COMPAT_SUPP	0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP	0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
-	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -675,7 +698,8 @@ struct btrfs_block_group_item {
 struct btrfs_space_info {
 	u64 flags;
 
-	u64 total_bytes;	/* total bytes in the space */
+	u64 total_bytes;	/* total bytes in the space,
+				   this doesn't take mirrors into account */
 	u64 bytes_used;		/* total bytes used,
 				   this doesn't take mirrors into account */
 	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
@@ -687,6 +711,8 @@ struct btrfs_space_info {
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc/allocations */
 	u64 disk_used;		/* total bytes used on disk */
+	u64 disk_total;		/* total bytes on disk, takes mirrors into
+				   account */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
@@ -750,6 +776,14 @@ enum btrfs_caching_type {
 	BTRFS_CACHE_FINISHED	= 2,
 };
 
+enum btrfs_disk_cache_state {
+	BTRFS_DC_WRITTEN	= 0,
+	BTRFS_DC_ERROR		= 1,
+	BTRFS_DC_CLEAR		= 2,
+	BTRFS_DC_SETUP		= 3,
+	BTRFS_DC_NEED_WRITE	= 4,
+};
+
 struct btrfs_caching_control {
 	struct list_head list;
 	struct mutex mutex;
@@ -763,6 +797,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	struct btrfs_fs_info *fs_info;
+	struct inode *inode;
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
@@ -773,8 +808,11 @@ struct btrfs_block_group_cache {
 	int extents_thresh;
 	int free_extents;
 	int total_bitmaps;
-	int ro;
-	int dirty;
+	int ro:1;
+	int dirty:1;
+	int iref:1;
+
+	int disk_cache_state;
 
 	/* cache tracking stuff */
 	int cached;
@@ -863,6 +901,7 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
+	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
 	struct btrfs_super_block super_copy;
@@ -949,6 +988,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_meta_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
+	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -1192,6 +1232,9 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOSSD		(1 << 9)
 #define BTRFS_MOUNT_DISCARD		(1 << 10)
 #define BTRFS_MOUNT_FORCE_COMPRESS	(1 << 11)
+#define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
+#define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
+#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1665,6 +1708,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
 	write_eb_member(eb, item, struct btrfs_dir_item, location, key);
 }
 
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+		   num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+		   num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+		   generation, 64);
+
+static inline void btrfs_free_space_key(struct extent_buffer *eb,
+					struct btrfs_free_space_header *h,
+					struct btrfs_disk_key *key)
+{
+	read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+					    struct btrfs_free_space_header *h,
+					    struct btrfs_disk_key *key)
+{
+	write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
 /* struct btrfs_disk_key */
 BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
 			 objectid, 64);
@@ -1876,6 +1940,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
 			 csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+			 cache_generation, 64);
 
 static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
 {
@@ -1988,6 +2054,12 @@ static inline struct dentry *fdentry(struct file *file)
 	return file->f_path.dentry;
 }
 
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+	return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
+
 /* extent-tree.c */
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2079,7 +2151,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-				 int num_items, int *retries);
+				 int num_items);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2100,7 +2172,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes, int *retries);
+			u64 num_bytes);
 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
@@ -2115,6 +2187,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2373,7 +2446,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+				   int sync);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
@@ -2389,13 +2463,13 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 			     pgoff_t offset, pgoff_t last_index);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
-void btrfs_delete_inode(struct inode *inode);
+void btrfs_evict_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
-void btrfs_drop_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
@@ -2426,6 +2500,10 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
 			      u64 start, u64 num_bytes, u64 min_size,
 			      loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+				    struct btrfs_trans_handle *trans, int mode,
+				    u64 start, u64 num_bytes, u64 min_size,
+				    loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
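
The on-disk layout the ctree.h hunks introduce is simple: a btrfs_free_space_header followed by num_entries fixed-size records, each tagged extent or bitmap. A hedged stand-alone sketch of walking such records (host-endian and no checksums here; the real cache also interleaves CRCs and page-aligns the bitmap data):

#include <stdint.h>
#include <stdio.h>

#define FREE_SPACE_EXTENT 1	/* mirrors BTRFS_FREE_SPACE_EXTENT */
#define FREE_SPACE_BITMAP 2	/* mirrors BTRFS_FREE_SPACE_BITMAP */

struct free_space_entry {	/* mirrors btrfs_free_space_entry */
	uint64_t offset;
	uint64_t bytes;
	uint8_t  type;
} __attribute__((packed));

/* Walk an entry array and sum the free space it describes.  For
 * extent entries 'bytes' is the run length; for bitmap entries it is
 * taken here as the free space the bitmap accounts for, with the bit
 * data itself living in separate pages. */
static uint64_t total_free(const struct free_space_entry *e, uint64_t n)
{
	uint64_t i, sum = 0;

	for (i = 0; i < n; i++)
		if (e[i].type == FREE_SPACE_EXTENT ||
		    e[i].type == FREE_SPACE_BITMAP)
			sum += e[i].bytes;
	return sum;
}

int main(void)
{
	struct free_space_entry demo[] = {
		{ 1048576,  65536, FREE_SPACE_EXTENT },
		{ 2097152, 131072, FREE_SPACE_EXTENT },
	};
	printf("%llu bytes free\n",
	       (unsigned long long)total_free(demo, 2));	/* 196608 */
	return 0;
}
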
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e9103b3baa4..f0cad5ae5be 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -427,5 +427,5 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 		ret = btrfs_truncate_item(trans, root, path,
 					  item_len - sub_item_len, 1);
 	}
-	return 0;
+	return ret;
 }
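
The dir-item.c change is a one-liner but a classic bug class: the truncate's status was computed and then discarded by an unconditional return 0. In miniature, with truncate_item() as an invented failing stand-in:

#include <stdio.h>

static int truncate_item(void) { return -5; /* -EIO */ }

static int delete_before(void)
{
	int ret = truncate_item();
	(void)ret;		/* computed, then thrown away */
	return 0;		/* caller believes the delete succeeded */
}

static int delete_after(void)
{
	return truncate_item();	/* the error now reaches the caller */
}

int main(void)
{
	printf("before=%d after=%d\n", delete_before(), delete_after());
	return 0;
}
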
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 34f7c375567..fb827d0d718 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -338,7 +338,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	struct extent_io_tree *tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 found_start;
-	int found_level;
 	unsigned long len;
 	struct extent_buffer *eb;
 	int ret;
@@ -369,8 +368,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 		goto err;
 	}
-	found_level = btrfs_header_level(eb);
-
 	csum_tree_block(root, eb, 0);
 err:
 	free_extent_buffer(eb);
@@ -480,10 +477,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->work.func = end_workqueue_fn;
 	end_io_wq->work.flags = 0;
 
-	if (bio->bi_rw & (1 << BIO_RW)) {
-		if (end_io_wq->metadata)
+	if (bio->bi_rw & REQ_WRITE) {
+		if (end_io_wq->metadata == 1)
 			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 					   &end_io_wq->work);
+		else if (end_io_wq->metadata == 2)
+			btrfs_queue_worker(&fs_info->endio_freespace_worker,
+					   &end_io_wq->work);
 		else
 			btrfs_queue_worker(&fs_info->endio_write_workers,
 					   &end_io_wq->work);
@@ -497,6 +497,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	}
 }
 
+/*
+ * For the metadata arg you want
+ *
+ * 0 - if data
+ * 1 - if normal metadata
+ * 2 - if writing to the free space cache area
+ */
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata)
 {
@@ -533,11 +540,9 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
 
 static void run_one_async_start(struct btrfs_work *work)
 {
-	struct btrfs_fs_info *fs_info;
 	struct async_submit_bio *async;
 
 	async = container_of(work, struct async_submit_bio, work);
-	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	async->submit_bio_start(async->inode, async->rw, async->bio,
 				async->mirror_num, async->bio_flags,
 				async->bio_offset);
@@ -604,7 +609,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	atomic_inc(&fs_info->nr_async_submits);
 
-	if (rw & (1 << BIO_RW_SYNCIO))
+	if (rw & REQ_SYNC)
 		btrfs_set_work_high_prio(&async->work);
 
 	btrfs_queue_worker(&fs_info->workers, &async->work);
@@ -668,7 +673,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 					  bio, 1);
 	BUG_ON(ret);
 
-	if (!(rw & (1 << BIO_RW))) {
+	if (!(rw & REQ_WRITE)) {
 		/*
 		 * called for a read, do the setup so that checksum validation
 		 * can happen in the async kernel threads
@@ -850,12 +855,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 			  u32 blocksize, u64 parent_transid)
 {
 	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_io_tree *io_tree;
 	int ret;
 
-	io_tree = &BTRFS_I(btree_inode)->io_tree;
-
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return NULL;
@@ -1377,7 +1378,6 @@ static int bio_ready_for_csum(struct bio *bio)
 	u64 start = 0;
 	struct page *page;
 	struct extent_io_tree *io_tree = NULL;
-	struct btrfs_fs_info *info = NULL;
 	struct bio_vec *bvec;
 	int i;
 	int ret;
@@ -1396,7 +1396,6 @@ static int bio_ready_for_csum(struct bio *bio)
 		buf_len = page->private >> 2;
 		start = page_offset(page) + bvec->bv_offset;
 		io_tree = &BTRFS_I(page->mapping->host)->io_tree;
-		info = BTRFS_I(page->mapping->host)->root->fs_info;
 	}
 	/* are we fully contained in this bio? */
 	if (buf_len <= length)
@@ -1427,7 +1426,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	 * ram and up to date before trying to verify things.  For
 	 * blocksize <= pagesize, it is basically a noop
 	 */
-	if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
+	if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
 	    !bio_ready_for_csum(bio)) {
 		btrfs_queue_worker(&fs_info->endio_meta_workers,
 				   &end_io_wq->work);
@@ -1680,12 +1679,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
+	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
-
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 	if (!bh)
 		goto fail_iput;
@@ -1775,6 +1774,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
+			   1, &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1795,6 +1796,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
+	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1993,6 +1995,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_read(&fs_info->cleanup_work_sem);
 		btrfs_orphan_cleanup(fs_info->fs_root);
+		btrfs_orphan_cleanup(fs_info->tree_root);
 		up_read(&fs_info->cleanup_work_sem);
 	}
 
@@ -2035,6 +2038,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
+	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -2063,7 +2067,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+		if (printk_ratelimit()) {
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
 				       bdevname(bh->b_bdev, b));
@@ -2200,21 +2204,10 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
 
-		if (i == last_barrier && do_barriers && device->barriers) {
-			ret = submit_bh(WRITE_BARRIER, bh);
-			if (ret == -EOPNOTSUPP) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       device->name);
-				set_buffer_uptodate(bh);
-				device->barriers = 0;
-				/* one reference for submit_bh */
-				get_bh(bh);
-				lock_buffer(bh);
-				ret = submit_bh(WRITE_SYNC, bh);
-			}
-		} else {
+		if (i == last_barrier && do_barriers)
+			ret = submit_bh(WRITE_FLUSH_FUA, bh);
+		else
 			ret = submit_bh(WRITE_SYNC, bh);
-		}
 
 		if (ret)
 			errors++;
@@ -2421,6 +2414,7 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
+	btrfs_put_block_group_cache(fs_info);
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
@@ -2467,6 +2461,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
+	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
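
end_workqueue_bio() now keys off an extended metadata argument: 0 for data, 1 for ordinary metadata, 2 for free-space-cache writes, the last getting its own single-threaded worker so cache writeout cannot be starved by, or deadlock behind, regular endio work. A compact model of the write-side dispatch; the queue name strings are illustrative only:

#include <stdio.h>

enum endio_kind { ENDIO_DATA = 0, ENDIO_META = 1, ENDIO_FREESPACE = 2 };

/* Mirrors the routing added to end_workqueue_bio() for writes:
 * each kind of completion lands on its own queue. */
static const char *route_write(enum endio_kind metadata)
{
	switch (metadata) {
	case ENDIO_META:
		return "endio-meta-write";
	case ENDIO_FREESPACE:
		return "freespace-write";	/* the new 1-thread pool */
	default:
		return "endio-write";
	}
}

int main(void)
{
	printf("%s\n", route_write(ENDIO_FREESPACE));
	return 0;
}
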
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a5..0c097f3aec4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -242,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
 		return NULL;
 	}
 
+	/* We're loading it the fast way, so we don't have a caching_ctl. */
+	if (!cache->caching_ctl) {
+		spin_unlock(&cache->lock);
+		return NULL;
+	}
+
 	ctl = cache->caching_ctl;
 	atomic_inc(&ctl->count);
 	spin_unlock(&cache->lock);
@@ -421,7 +427,9 @@ err:
 	return 0;
 }
 
-static int cache_block_group(struct btrfs_block_group_cache *cache)
+static int cache_block_group(struct btrfs_block_group_cache *cache,
+			     struct btrfs_trans_handle *trans,
+			     int load_cache_only)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_caching_control *caching_ctl;
@@ -432,6 +440,36 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
 	if (cache->cached != BTRFS_CACHE_NO)
 		return 0;
 
+	/*
+	 * We can't do the read from on-disk cache during a commit since we need
+	 * to have the normal tree locking.
+	 */
+	if (!trans->transaction->in_commit) {
+		spin_lock(&cache->lock);
+		if (cache->cached != BTRFS_CACHE_NO) {
+			spin_unlock(&cache->lock);
+			return 0;
+		}
+		cache->cached = BTRFS_CACHE_STARTED;
+		spin_unlock(&cache->lock);
+
+		ret = load_free_space_cache(fs_info, cache);
+
+		spin_lock(&cache->lock);
+		if (ret == 1) {
+			cache->cached = BTRFS_CACHE_FINISHED;
+			cache->last_byte_to_unpin = (u64)-1;
+		} else {
+			cache->cached = BTRFS_CACHE_NO;
+		}
+		spin_unlock(&cache->lock);
+		if (ret == 1)
+			return 0;
+	}
+
+	if (load_cache_only)
+		return 0;
+
 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
 	BUG_ON(!caching_ctl);
 
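
cache_block_group() now tries the on-disk copy first and only falls back to the scanning thread when that copy is unusable; note the state is reset to BTRFS_CACHE_NO on failure so the slow path still runs. A small state sketch, assuming from the hunk that load_free_space_cache() returns 1 when the cache loaded cleanly:

#include <stdio.h>

enum cache_state { CACHE_NO, CACHE_STARTED, CACHE_FINISHED };

/* load() stands in for load_free_space_cache(). */
static enum cache_state try_fast_load(int in_commit, int (*load)(void))
{
	if (in_commit)
		return CACHE_NO;	/* no tree locking inside a commit */
	/* the real code flips to CACHE_STARTED under cache->lock here */
	if (load() == 1)
		return CACHE_FINISHED;	/* skip the scanning kthread */
	return CACHE_NO;		/* fall back to the slow path */
}

static int good(void) { return 1; }

int main(void)
{
	printf("%d\n", try_fast_load(0, good) == CACHE_FINISHED);	/* 1 */
	return 0;
}
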
@@ -509,7 +547,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
-		if (found->flags == flags) {
+		if (found->flags & flags) {
 			rcu_read_unlock();
 			return found;
 		}
@@ -542,6 +580,15 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
+static u64 div_factor_fine(u64 num, int factor)
+{
+	if (factor == 100)
+		return num;
+	num *= factor;
+	do_div(num, 100);
+	return num;
+}
+
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner)
 {
@@ -1695,8 +1742,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			     BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -2688,6 +2734,109 @@ next_block_group(struct btrfs_root *root,
 	return cache;
 }
 
+static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_path *path)
+{
+	struct btrfs_root *root = block_group->fs_info->tree_root;
+	struct inode *inode = NULL;
+	u64 alloc_hint = 0;
+	int num_pages = 0;
+	int retries = 0;
+	int ret = 0;
+
+	/*
+	 * If this block group is smaller than 100 megs don't bother caching the
+	 * block group.
+	 */
+	if (block_group->key.offset < (100 * 1024 * 1024)) {
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+
+again:
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+		ret = PTR_ERR(inode);
+		btrfs_release_path(root, path);
+		goto out;
+	}
+
+	if (IS_ERR(inode)) {
+		BUG_ON(retries);
+		retries++;
+
+		if (block_group->ro)
+			goto out_free;
+
+		ret = create_free_space_inode(root, trans, block_group, path);
+		if (ret)
+			goto out_free;
+		goto again;
+	}
+
+	/*
+	 * We want to set the generation to 0, that way if anything goes wrong
+	 * from here on out we know not to trust this cache when we load up next
+	 * time.
+	 */
+	BTRFS_I(inode)->generation = 0;
+	ret = btrfs_update_inode(trans, root, inode);
+	WARN_ON(ret);
+
+	if (i_size_read(inode) > 0) {
+		ret = btrfs_truncate_free_space_cache(root, trans, path,
+						      inode);
+		if (ret)
+			goto out_put;
+	}
+
+	spin_lock(&block_group->lock);
+	if (block_group->cached != BTRFS_CACHE_FINISHED) {
+		spin_unlock(&block_group->lock);
+		goto out_put;
+	}
+	spin_unlock(&block_group->lock);
+
+	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+	if (!num_pages)
+		num_pages = 1;
+
+	/*
+	 * Just to make absolutely sure we have enough space, we're going to
+	 * preallocate 12 pages worth of space for each block group.  In
+	 * practice we ought to use at most 8, but we need extra space so we can
+	 * add our header and have a terminator between the extents and the
+	 * bitmaps.
+	 */
+	num_pages *= 16;
+	num_pages *= PAGE_CACHE_SIZE;
+
+	ret = btrfs_check_data_free_space(inode, num_pages);
+	if (ret)
+		goto out_put;
+
+	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+					      num_pages, num_pages,
+					      &alloc_hint);
+	btrfs_free_reserved_data_space(inode, num_pages);
+out_put:
+	iput(inode);
+out_free:
+	btrfs_release_path(root, path);
+out:
+	spin_lock(&block_group->lock);
+	if (ret)
+		block_group->disk_cache_state = BTRFS_DC_ERROR;
+	else
+		block_group->disk_cache_state = BTRFS_DC_SETUP;
+	spin_unlock(&block_group->lock);
+
+	return ret;
+}
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
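
The sizing in cache_save_setup() is deliberately generous: one page per gigabyte of block group, rounded up to at least one, then multiplied by 16 for headroom. Worked numbers, assuming 4 KiB pages for the sketch (PAGE_CACHE_SIZE in the patch):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL	/* assumed page size for the example */

/* Same arithmetic as cache_save_setup(): pages = max(1, GiB) * 16 */
static uint64_t cache_prealloc_bytes(uint64_t block_group_bytes)
{
	uint64_t num_pages = block_group_bytes / (1024ULL * 1024 * 1024);

	if (!num_pages)
		num_pages = 1;
	num_pages *= 16;
	return num_pages * PAGE_SIZE;
}

int main(void)
{
	/* an 800 MiB group preallocates 16 pages = 65536 bytes */
	printf("%llu\n", (unsigned long long)
	       cache_prealloc_bytes(800ULL << 20));
	/* a 10 GiB group preallocates 160 pages = 655360 bytes */
	printf("%llu\n", (unsigned long long)
	       cache_prealloc_bytes(10ULL << 30));
	return 0;
}
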
@@ -2700,6 +2849,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+again:
+	while (1) {
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
+		while (cache) {
+			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+				break;
+			cache = next_block_group(root, cache);
+		}
+		if (!cache) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+		err = cache_save_setup(cache, trans, path);
+		last = cache->key.objectid + cache->key.offset;
+		btrfs_put_block_group(cache);
+	}
+
 	while (1) {
 		if (last == 0) {
 			err = btrfs_run_delayed_refs(trans, root,
@@ -2709,6 +2877,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
 		while (cache) {
+			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
+				btrfs_put_block_group(cache);
+				goto again;
+			}
+
 			if (cache->dirty)
 				break;
 			cache = next_block_group(root, cache);
@@ -2720,6 +2893,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
+		if (cache->disk_cache_state == BTRFS_DC_SETUP)
+			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
 		cache->dirty = 0;
 		last = cache->key.objectid + cache->key.offset;
 
@@ -2728,6 +2903,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		btrfs_put_block_group(cache);
 	}
 
+	while (1) {
+		/*
+		 * I don't think this is needed since we're just marking our
+		 * preallocated extent as written, but just in case it can't
+		 * hurt.
+		 */
+		if (last == 0) {
+			err = btrfs_run_delayed_refs(trans, root,
+						     (unsigned long)-1);
+			BUG_ON(err);
+		}
+
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
+		while (cache) {
+			/*
+			 * Really this shouldn't happen, but it could if we
+			 * couldn't write the entire preallocated extent and
+			 * splitting the extent resulted in a new block.
+			 */
+			if (cache->dirty) {
+				btrfs_put_block_group(cache);
+				goto again;
+			}
+			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+				break;
+			cache = next_block_group(root, cache);
+		}
+		if (!cache) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+
+		btrfs_write_out_cache(root, trans, cache, path);
+
+		/*
+		 * If we didn't have an error then the cache state is still
+		 * NEED_WRITE, so we can set it to WRITTEN.
+		 */
+		if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+			cache->disk_cache_state = BTRFS_DC_WRITTEN;
+		last = cache->key.objectid + cache->key.offset;
+		btrfs_put_block_group(cache);
+	}
+
 	btrfs_free_path(path);
 	return 0;
 }
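
btrfs_write_dirty_block_groups() now makes three passes: set up a cache inode for every group still in DC_CLEAR, write the dirty block group items (bouncing back to pass one if a new DC_CLEAR group appears), then stream out each DC_NEED_WRITE cache and mark it DC_WRITTEN. A schematic of one group's trip through that state machine, with the btrfs plumbing stubbed out:

#include <stdio.h>

enum dc { DC_WRITTEN, DC_ERROR, DC_CLEAR, DC_SETUP, DC_NEED_WRITE };

/* pass 1: cache_save_setup() turns CLEAR into SETUP (or ERROR) */
static enum dc pass1(enum dc s) { return s == DC_CLEAR ? DC_SETUP : s; }

/* pass 2: writing the dirty item promotes SETUP to NEED_WRITE */
static enum dc pass2(enum dc s, int dirty)
{
	return (s == DC_SETUP && dirty) ? DC_NEED_WRITE : s;
}

/* pass 3: btrfs_write_out_cache() and the final WRITTEN mark */
static enum dc pass3(enum dc s) { return s == DC_NEED_WRITE ? DC_WRITTEN : s; }

int main(void)
{
	enum dc s = DC_CLEAR;

	s = pass3(pass2(pass1(s), 1));
	printf("%d\n", s == DC_WRITTEN);	/* 1 */
	return 0;
}
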
@@ -2763,6 +2984,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (found) {
 		spin_lock(&found->lock);
 		found->total_bytes += total_bytes;
+		found->disk_total += total_bytes * factor;
 		found->bytes_used += bytes_used;
 		found->disk_used += bytes_used * factor;
 		found->full = 0;
@@ -2782,6 +3004,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 				BTRFS_BLOCK_GROUP_SYSTEM |
 				BTRFS_BLOCK_GROUP_METADATA);
 	found->total_bytes = total_bytes;
+	found->disk_total = total_bytes * factor;
 	found->bytes_used = bytes_used;
 	found->disk_used = bytes_used * factor;
 	found->bytes_pinned = 0;
@@ -2883,11 +3106,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
2883 struct btrfs_space_info *data_sinfo; 3106 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root; 3107 struct btrfs_root *root = BTRFS_I(inode)->root;
2885 u64 used; 3108 u64 used;
2886 int ret = 0, committed = 0; 3109 int ret = 0, committed = 0, alloc_chunk = 1;
2887 3110
2888 /* make sure bytes are sectorsize aligned */ 3111 /* make sure bytes are sectorsize aligned */
2889 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3112 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2890 3113
3114 if (root == root->fs_info->tree_root) {
3115 alloc_chunk = 0;
3116 committed = 1;
3117 }
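	/*
	 * Presumably (not stated in the patch): the tree root holds the
	 * free space cache inodes, whose data is written back during
	 * commit, when we can neither allocate a new chunk nor commit
	 * again to reclaim space; hence both fallbacks are disabled up
	 * front for tree_root reservations.
	 */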
3118
2891 data_sinfo = BTRFS_I(inode)->space_info; 3119 data_sinfo = BTRFS_I(inode)->space_info;
2892 if (!data_sinfo) 3120 if (!data_sinfo)
2893 goto alloc; 3121 goto alloc;
@@ -2906,7 +3134,7 @@ again:
2906 * if we don't have enough free bytes in this space then we need 3134 * if we don't have enough free bytes in this space then we need
2907 * to alloc a new chunk. 3135 * to alloc a new chunk.
2908 */ 3136 */
2909 if (!data_sinfo->full) { 3137 if (!data_sinfo->full && alloc_chunk) {
2910 u64 alloc_target; 3138 u64 alloc_target;
2911 3139
2912 data_sinfo->force_alloc = 1; 3140 data_sinfo->force_alloc = 1;
@@ -2998,10 +3226,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
2998 rcu_read_unlock(); 3226 rcu_read_unlock();
2999} 3227}
3000 3228
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo, 3229static int should_alloc_chunk(struct btrfs_root *root,
3002 u64 alloc_bytes) 3230 struct btrfs_space_info *sinfo, u64 alloc_bytes)
3003{ 3231{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3232 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3233 u64 thresh;
3005 3234
3006 if (sinfo->bytes_used + sinfo->bytes_reserved + 3235 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3236 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
@@ -3011,6 +3240,12 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3011 alloc_bytes < div_factor(num_bytes, 8)) 3240 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0; 3241 return 0;
3013 3242
3243 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3244 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3245
3246 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3247 return 0;
3248
3014 return 1; 3249 return 1;
3015} 3250}
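
For scale: on a filesystem whose super block reports 100GiB, the new threshold is max(256MiB, 5% of 100GiB) = 5GiB, so a space_info already larger than 5GiB that is less than 30% used will not grow further. A sketch restating just the added check (in btrfs, div_factor(x, n) is x*n/10 and div_factor_fine(x, n) is x*n/100):

	/* sketch: mirrors the new checks, not the kernel function itself */
	static int chunk_alloc_allowed(u64 total_fs_bytes, u64 num_bytes,
				       u64 bytes_used)
	{
		u64 thresh = max_t(u64, 256 * 1024 * 1024,
				   div_factor_fine(total_fs_bytes, 5)); /* 5% */

		/* space_info already large but mostly idle: don't grow it */
		if (num_bytes > thresh && bytes_used < div_factor(num_bytes, 3))
			return 0;
		return 1;
	}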
3016 3251
@@ -3042,13 +3277,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3042 goto out; 3277 goto out;
3043 } 3278 }
3044 3279
3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { 3280 if (!force && !should_alloc_chunk(extent_root, space_info,
3281 alloc_bytes)) {
3046 spin_unlock(&space_info->lock); 3282 spin_unlock(&space_info->lock);
3047 goto out; 3283 goto out;
3048 } 3284 }
3049 spin_unlock(&space_info->lock); 3285 spin_unlock(&space_info->lock);
3050 3286
3051 /* 3287 /*
3288 * If we have mixed data/metadata chunks we want to make sure we keep
3289 * allocating mixed chunks instead of individual chunks.
3290 */
3291 if (btrfs_mixed_space_info(space_info))
3292 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3293
3294 /*
3052 * if we're doing a data chunk, go ahead and make sure that 3295 * if we're doing a data chunk, go ahead and make sure that
3053 * we keep a reasonable number of metadata chunks allocated in the 3296 * we keep a reasonable number of metadata chunks allocated in the
3054 * FS as well. 3297 * FS as well.
@@ -3073,55 +3316,25 @@ out:
3073 return ret; 3316 return ret;
3074} 3317}
3075 3318
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/* 3319/*
3109 * shrink metadata reservation for delalloc 3320 * shrink metadata reservation for delalloc
3110 */ 3321 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans, 3322static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim) 3323 struct btrfs_root *root, u64 to_reclaim, int sync)
3113{ 3324{
3114 struct btrfs_block_rsv *block_rsv; 3325 struct btrfs_block_rsv *block_rsv;
3326 struct btrfs_space_info *space_info;
3115 u64 reserved; 3327 u64 reserved;
3116 u64 max_reclaim; 3328 u64 max_reclaim;
3117 u64 reclaimed = 0; 3329 u64 reclaimed = 0;
3118 int pause = 1; 3330 int pause = 1;
3119 int ret; 3331 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3120 3332
3121 block_rsv = &root->fs_info->delalloc_block_rsv; 3333 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock); 3334 space_info = block_rsv->space_info;
3123 reserved = block_rsv->reserved; 3335
3124 spin_unlock(&block_rsv->lock); 3336 smp_mb();
3337 reserved = space_info->bytes_reserved;
3125 3338
3126 if (reserved == 0) 3339 if (reserved == 0)
3127 return 0; 3340 return 0;
@@ -3129,104 +3342,169 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3129 max_reclaim = min(reserved, to_reclaim); 3342 max_reclaim = min(reserved, to_reclaim);
3130 3343
3131 while (1) { 3344 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); 3345 /* have the flusher threads jump in and do some IO */
3133 if (!ret) { 3346 smp_mb();
3134 __set_current_state(TASK_INTERRUPTIBLE); 3347 nr_pages = min_t(unsigned long, nr_pages,
3135 schedule_timeout(pause); 3348 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3136 pause <<= 1; 3349 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142 3350
3143 spin_lock(&block_rsv->lock); 3351 spin_lock(&space_info->lock);
3144 if (reserved > block_rsv->reserved) 3352 if (reserved > space_info->bytes_reserved)
3145 reclaimed = reserved - block_rsv->reserved; 3353 reclaimed += reserved - space_info->bytes_reserved;
3146 reserved = block_rsv->reserved; 3354 reserved = space_info->bytes_reserved;
3147 spin_unlock(&block_rsv->lock); 3355 spin_unlock(&space_info->lock);
3148 3356
3149 if (reserved == 0 || reclaimed >= max_reclaim) 3357 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break; 3358 break;
3151 3359
3152 if (trans && trans->transaction->blocked) 3360 if (trans && trans->transaction->blocked)
3153 return -EAGAIN; 3361 return -EAGAIN;
3362
3363 __set_current_state(TASK_INTERRUPTIBLE);
3364 schedule_timeout(pause);
3365 pause <<= 1;
3366 if (pause > HZ / 10)
3367 pause = HZ / 10;
3368
3154 } 3369 }
3155 return reclaimed >= to_reclaim; 3370 return reclaimed >= to_reclaim;
3156} 3371}
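
The reworked shrink_delalloc() above no longer writes back one inode at a time; it asks the flusher threads to do the IO and then polls space_info->bytes_reserved, sleeping with an exponential backoff capped at HZ/10 (100ms). The polling skeleton, with done()/kick_flushers() as hypothetical stand-ins:

	int pause = 1;				/* jiffies */

	while (!done()) {			/* done: reclaimed >= to_reclaim */
		kick_flushers();		/* writeback_inodes_sb_nr_if_idle() */
		/* ... re-sample space_info->bytes_reserved here ... */
		__set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(pause);
		pause <<= 1;			/* exponential backoff ... */
		if (pause > HZ / 10)
			pause = HZ / 10;	/* ... capped at HZ/10 */
	}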
3157 3372
3158static int should_retry_reserve(struct btrfs_trans_handle *trans, 3373/*
3159 struct btrfs_root *root, 3374 * Retries tells us how many times we've called reserve_metadata_bytes. The
3160 struct btrfs_block_rsv *block_rsv, 3375 * idea is if this is the first call (retries == 0) then we will add to our
3161 u64 num_bytes, int *retries) 3376 * reserved count if we can't make the allocation in order to hold our place
3377 * while we go and try and free up space. That way for retries > 1 we don't try
3378 * and add space, we just check to see if the amount of unused space is >= the
3379 * total space, meaning that our reservation is valid.
3380 *
3381 * However if we don't intend to retry this reservation, pass -1 as retries so
3382 * that it short circuits this logic.
3383 */
3384static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3385 struct btrfs_root *root,
3386 struct btrfs_block_rsv *block_rsv,
3387 u64 orig_bytes, int flush)
3162{ 3388{
3163 struct btrfs_space_info *space_info = block_rsv->space_info; 3389 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret; 3390 u64 unused;
3391 u64 num_bytes = orig_bytes;
3392 int retries = 0;
3393 int ret = 0;
3394 bool reserved = false;
3395 bool committed = false;
3165 3396
3166 if ((*retries) > 2) 3397again:
3167 return -ENOSPC; 3398 ret = -ENOSPC;
3399 if (reserved)
3400 num_bytes = 0;
3168 3401
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); 3402 spin_lock(&space_info->lock);
3170 if (ret) 3403 unused = space_info->bytes_used + space_info->bytes_reserved +
3171 return 1; 3404 space_info->bytes_pinned + space_info->bytes_readonly +
3405 space_info->bytes_may_use;
3172 3406
3173 if (trans && trans->transaction->in_commit) 3407 /*
 3174 return -ENOSPC; 3408 * The idea here is that if we've not already
 3409 * over-reserved the block group then we can go ahead and save our
 3410 * reservation first and then start flushing if we need to. Otherwise,
 3411 * if we've already overcommitted, let's start flushing stuff first and
 3412 * then come back and try to make our reservation.
3413 */
3414 if (unused <= space_info->total_bytes) {
 3415 unused = space_info->total_bytes - unused;
3416 if (unused >= num_bytes) {
3417 if (!reserved)
3418 space_info->bytes_reserved += orig_bytes;
3419 ret = 0;
3420 } else {
3421 /*
3422 * Ok set num_bytes to orig_bytes since we aren't
 3423 * overcommitted; this way we only try to reclaim what
3424 * we need.
3425 */
3426 num_bytes = orig_bytes;
3427 }
3428 } else {
3429 /*
 3430 * Ok, we're overcommitted; set num_bytes to the overcommitted
3431 * amount plus the amount of bytes that we need for this
3432 * reservation.
3433 */
3434 num_bytes = unused - space_info->total_bytes +
3435 (orig_bytes * (retries + 1));
3436 }
3175 3437
3176 ret = shrink_delalloc(trans, root, num_bytes); 3438 /*
3177 if (ret) 3439 * Couldn't make our reservation, save our place so while we're trying
3178 return ret; 3440 * to reclaim space we can actually use it instead of somebody else
3441 * stealing it from us.
3442 */
3443 if (ret && !reserved) {
3444 space_info->bytes_reserved += orig_bytes;
3445 reserved = true;
3446 }
3179 3447
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock); 3448 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188 3449
3189 if (trans) 3450 if (!ret)
3190 return -EAGAIN; 3451 return 0;
3191 3452
3192 trans = btrfs_join_transaction(root, 1); 3453 if (!flush)
3193 BUG_ON(IS_ERR(trans)); 3454 goto out;
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196 3455
3197 return 1; 3456 /*
3198} 3457 * We do synchronous shrinking since we don't actually unreserve
3458 * metadata until after the IO is completed.
3459 */
3460 ret = shrink_delalloc(trans, root, num_bytes, 1);
3461 if (ret > 0)
3462 return 0;
3463 else if (ret < 0)
3464 goto out;
3199 3465
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, 3466 /*
3201 u64 num_bytes) 3467 * So if we were overcommitted it's possible that somebody else flushed
3202{ 3468 * out enough space and we simply didn't have enough space to reclaim,
3203 struct btrfs_space_info *space_info = block_rsv->space_info; 3469 * so go back around and try again.
3204 u64 unused; 3470 */
3205 int ret = -ENOSPC; 3471 if (retries < 2) {
3472 retries++;
3473 goto again;
3474 }
3206 3475
3207 spin_lock(&space_info->lock); 3476 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved + 3477 /*
3209 space_info->bytes_pinned + space_info->bytes_readonly; 3478 * Not enough space to be reclaimed, don't bother committing the
3479 * transaction.
3480 */
3481 if (space_info->bytes_pinned < orig_bytes)
3482 ret = -ENOSPC;
3483 spin_unlock(&space_info->lock);
3484 if (ret)
3485 goto out;
3210 3486
3211 if (unused < space_info->total_bytes) 3487 ret = -EAGAIN;
3212 unused = space_info->total_bytes - unused; 3488 if (trans || committed)
3213 else 3489 goto out;
3214 unused = 0;
3215 3490
3216 if (unused >= num_bytes) { 3491 ret = -ENOSPC;
3217 if (block_rsv->priority >= 10) { 3492 trans = btrfs_join_transaction(root, 1);
3218 space_info->bytes_reserved += num_bytes; 3493 if (IS_ERR(trans))
3219 ret = 0; 3494 goto out;
3220 } else { 3495 ret = btrfs_commit_transaction(trans, root);
3221 if ((unused + block_rsv->reserved) * 3496 if (!ret) {
3222 block_rsv->priority >= 3497 trans = NULL;
3223 (num_bytes + block_rsv->reserved) * 10) { 3498 committed = true;
3224 space_info->bytes_reserved += num_bytes; 3499 goto again;
3225 ret = 0; 3500 }
3226 } 3501
3227 } 3502out:
3503 if (reserved) {
3504 spin_lock(&space_info->lock);
3505 space_info->bytes_reserved -= orig_bytes;
3506 spin_unlock(&space_info->lock);
3228 } 3507 }
3229 spin_unlock(&space_info->lock);
3230 3508
3231 return ret; 3509 return ret;
3232} 3510}
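
A worked example of the reclaim-target arithmetic above: say total_bytes is 10GiB, the accounted sum (used + reserved + pinned + readonly + may_use) is 11GiB, and orig_bytes is 4MiB. We are overcommitted by 1GiB, so the first attempt tries to reclaim 1GiB + 4MiB; a failed retry widens that to 1GiB + 8MiB, and so on. As a sketch (a restatement of the branch above, not the kernel function):

	static u64 reclaim_target(u64 total, u64 accounted, u64 orig_bytes,
				  int retries)
	{
		if (accounted <= total)
			return orig_bytes; /* not overcommitted: just our need */
		/* overcommit amount plus a growing cushion per retry */
		return accounted - total + orig_bytes * (retries + 1);
	}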
@@ -3328,18 +3606,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{ 3606{
3329 struct btrfs_block_rsv *block_rsv; 3607 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info; 3608 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332 3609
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 3610 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv) 3611 if (!block_rsv)
3335 return NULL; 3612 return NULL;
3336 3613
3337 btrfs_init_block_rsv(block_rsv); 3614 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info, 3615 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA); 3616 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv; 3617 return block_rsv;
3344} 3618}
3345 3619
@@ -3370,23 +3644,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3644int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root, 3645 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv, 3646 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries) 3647 u64 num_bytes)
3374{ 3648{
3375 int ret; 3649 int ret;
3376 3650
3377 if (num_bytes == 0) 3651 if (num_bytes == 0)
3378 return 0; 3652 return 0;
3379again: 3653
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes); 3654 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
3381 if (!ret) { 3655 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3656 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0; 3657 return 0;
3384 } 3658 }
3385 3659
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret; 3660 return ret;
3391} 3661}
3392 3662
@@ -3421,7 +3691,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3421 return 0; 3691 return 0;
3422 3692
3423 if (block_rsv->refill_used) { 3693 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes); 3694 ret = reserve_metadata_bytes(trans, root, block_rsv,
3695 num_bytes, 0);
3425 if (!ret) { 3696 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0); 3697 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0; 3698 return 0;
@@ -3500,6 +3771,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3500 3771
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3772 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock); 3773 spin_lock(&sinfo->lock);
3774 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3775 data_used = 0;
3503 meta_used = sinfo->bytes_used; 3776 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock); 3777 spin_unlock(&sinfo->lock);
3505 3778
@@ -3527,7 +3800,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3527 block_rsv->size = num_bytes; 3800 block_rsv->size = num_bytes;
3528 3801
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 3802 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly; 3803 sinfo->bytes_reserved + sinfo->bytes_readonly +
3804 sinfo->bytes_may_use;
3531 3805
3532 if (sinfo->total_bytes > num_bytes) { 3806 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes; 3807 num_bytes = sinfo->total_bytes - num_bytes;
@@ -3598,7 +3872,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3598 3872
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 3873int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root, 3874 struct btrfs_root *root,
3601 int num_items, int *retries) 3875 int num_items)
3602{ 3876{
3603 u64 num_bytes; 3877 u64 num_bytes;
3604 int ret; 3878 int ret;
@@ -3608,7 +3882,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3608 3882
3609 num_bytes = calc_trans_metadata_size(root, num_items); 3883 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, 3884 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries); 3885 num_bytes);
3612 if (!ret) { 3886 if (!ret) {
3613 trans->bytes_reserved += num_bytes; 3887 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv; 3888 trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3682,14 +3956,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 3956 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve; 3957 u64 to_reserve;
3684 int nr_extents; 3958 int nr_extents;
3685 int retries = 0;
3686 int ret; 3959 int ret;
3687 3960
3688 if (btrfs_transaction_in_commit(root->fs_info)) 3961 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1); 3962 schedule_timeout(1);
3690 3963
3691 num_bytes = ALIGN(num_bytes, root->sectorsize); 3964 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again: 3965
3693 spin_lock(&BTRFS_I(inode)->accounting_lock); 3966 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 3967 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) { 3968 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
@@ -3699,18 +3972,14 @@ again:
3699 nr_extents = 0; 3972 nr_extents = 0;
3700 to_reserve = 0; 3973 to_reserve = 0;
3701 } 3974 }
3975 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3702 3976
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes); 3977 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve); 3978 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3705 if (ret) { 3979 if (ret)
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret; 3980 return ret;
3712 }
3713 3981
3982 spin_lock(&BTRFS_I(inode)->accounting_lock);
3714 BTRFS_I(inode)->reserved_extents += nr_extents; 3983 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 3984 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock); 3985 spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -3718,7 +3987,7 @@ again:
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1); 3987 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719 3988
3720 if (block_rsv->size > 512 * 1024 * 1024) 3989 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve); 3990 shrink_delalloc(NULL, root, to_reserve, 0);
3722 3991
3723 return 0; 3992 return 0;
3724} 3993}
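
Note the locking reshuffle above: accounting_lock is now dropped before reserve_metadata_bytes() and re-taken afterwards to publish the new extent counts. That ordering looks necessary because the reservation path can now sleep; with flush == 1 it may run synchronous shrink_delalloc() and even commit a transaction, neither of which is legal under a spinlock. The resulting shape:

	spin_lock(&BTRFS_I(inode)->accounting_lock);
	/* ... compute nr_extents / to_reserve ... */
	spin_unlock(&BTRFS_I(inode)->accounting_lock);

	/* may flush delalloc, sleep, or commit a transaction */
	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
	if (ret)
		return ret;

	spin_lock(&BTRFS_I(inode)->accounting_lock);
	/* ... only now publish the new extent counts ... */
	spin_unlock(&BTRFS_I(inode)->accounting_lock);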
@@ -3777,12 +4046,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3777 struct btrfs_root *root, 4046 struct btrfs_root *root,
3778 u64 bytenr, u64 num_bytes, int alloc) 4047 u64 bytenr, u64 num_bytes, int alloc)
3779{ 4048{
3780 struct btrfs_block_group_cache *cache; 4049 struct btrfs_block_group_cache *cache = NULL;
3781 struct btrfs_fs_info *info = root->fs_info; 4050 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3783 u64 total = num_bytes; 4051 u64 total = num_bytes;
3784 u64 old_val; 4052 u64 old_val;
3785 u64 byte_in_group; 4053 u64 byte_in_group;
4054 int factor;
3786 4055
3787 /* block accounting for super block */ 4056 /* block accounting for super block */
3788 spin_lock(&info->delalloc_lock); 4057 spin_lock(&info->delalloc_lock);
@@ -3804,11 +4073,25 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3804 factor = 2; 4073 factor = 2;
3805 else 4074 else
3806 factor = 1; 4075 factor = 1;
4076 /*
4077 * If this block group has free space cache written out, we
4078 * need to make sure to load it if we are removing space. This
4079 * is because we need the unpinning stage to actually add the
4080 * space back to the block group, otherwise we will leak space.
4081 */
4082 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4083 cache_block_group(cache, trans, 1);
4084
3807 byte_in_group = bytenr - cache->key.objectid; 4085 byte_in_group = bytenr - cache->key.objectid;
3808 WARN_ON(byte_in_group > cache->key.offset); 4086 WARN_ON(byte_in_group > cache->key.offset);
3809 4087
3810 spin_lock(&cache->space_info->lock); 4088 spin_lock(&cache->space_info->lock);
3811 spin_lock(&cache->lock); 4089 spin_lock(&cache->lock);
4090
4091 if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
4092 cache->disk_cache_state < BTRFS_DC_CLEAR)
4093 cache->disk_cache_state = BTRFS_DC_CLEAR;
4094
3812 cache->dirty = 1; 4095 cache->dirty = 1;
3813 old_val = btrfs_block_group_used(&cache->item); 4096 old_val = btrfs_block_group_used(&cache->item);
3814 num_bytes = min(total, cache->key.offset - byte_in_group); 4097 num_bytes = min(total, cache->key.offset - byte_in_group);
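
The disk_cache_state values used throughout this patch appear to form a small per-block-group state machine: CLEAR (on-disk cache is stale, rewrite from scratch), SETUP (cache file being prepared during commit), NEED_WRITE (dirty, must be written this commit) and WRITTEN (on-disk copy valid). The comparison `< BTRFS_DC_CLEAR` above suggests an enum roughly like the following sketch (names from the patch; BTRFS_DC_ERROR and the exact ordering beyond what the comparisons imply are assumptions):

	enum btrfs_disk_cache_state {
		BTRFS_DC_WRITTEN,	/* on-disk free space cache is valid */
		BTRFS_DC_ERROR,		/* writeout failed (assumed member) */
		BTRFS_DC_CLEAR,		/* on-disk cache stale, must be cleared */
		BTRFS_DC_SETUP,		/* cache file set up this commit */
		BTRFS_DC_NEED_WRITE,	/* in-memory state must be written out */
	};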
@@ -4555,6 +4838,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4555 bool found_uncached_bg = false; 4838 bool found_uncached_bg = false;
4556 bool failed_cluster_refill = false; 4839 bool failed_cluster_refill = false;
4557 bool failed_alloc = false; 4840 bool failed_alloc = false;
4841 bool use_cluster = true;
4558 u64 ideal_cache_percent = 0; 4842 u64 ideal_cache_percent = 0;
4559 u64 ideal_cache_offset = 0; 4843 u64 ideal_cache_offset = 0;
4560 4844
@@ -4569,16 +4853,24 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4569 return -ENOSPC; 4853 return -ENOSPC;
4570 } 4854 }
4571 4855
4856 /*
4857 * If the space info is for both data and metadata it means we have a
4858 * small filesystem and we can't use the clustering stuff.
4859 */
4860 if (btrfs_mixed_space_info(space_info))
4861 use_cluster = false;
4862
4572 if (orig_root->ref_cows || empty_size) 4863 if (orig_root->ref_cows || empty_size)
4573 allowed_chunk_alloc = 1; 4864 allowed_chunk_alloc = 1;
4574 4865
4575 if (data & BTRFS_BLOCK_GROUP_METADATA) { 4866 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
4576 last_ptr = &root->fs_info->meta_alloc_cluster; 4867 last_ptr = &root->fs_info->meta_alloc_cluster;
4577 if (!btrfs_test_opt(root, SSD)) 4868 if (!btrfs_test_opt(root, SSD))
4578 empty_cluster = 64 * 1024; 4869 empty_cluster = 64 * 1024;
4579 } 4870 }
4580 4871
4581 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { 4872 if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
4873 btrfs_test_opt(root, SSD)) {
4582 last_ptr = &root->fs_info->data_alloc_cluster; 4874 last_ptr = &root->fs_info->data_alloc_cluster;
4583 } 4875 }
4584 4876
@@ -4642,6 +4934,10 @@ have_block_group:
4642 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4934 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4643 u64 free_percent; 4935 u64 free_percent;
4644 4936
4937 ret = cache_block_group(block_group, trans, 1);
4938 if (block_group->cached == BTRFS_CACHE_FINISHED)
4939 goto have_block_group;
4940
4645 free_percent = btrfs_block_group_used(&block_group->item); 4941 free_percent = btrfs_block_group_used(&block_group->item);
4646 free_percent *= 100; 4942 free_percent *= 100;
4647 free_percent = div64_u64(free_percent, 4943 free_percent = div64_u64(free_percent,
@@ -4662,7 +4958,7 @@ have_block_group:
4662 if (loop > LOOP_CACHING_NOWAIT || 4958 if (loop > LOOP_CACHING_NOWAIT ||
4663 (loop > LOOP_FIND_IDEAL && 4959 (loop > LOOP_FIND_IDEAL &&
4664 atomic_read(&space_info->caching_threads) < 2)) { 4960 atomic_read(&space_info->caching_threads) < 2)) {
4665 ret = cache_block_group(block_group); 4961 ret = cache_block_group(block_group, trans, 0);
4666 BUG_ON(ret); 4962 BUG_ON(ret);
4667 } 4963 }
4668 found_uncached_bg = true; 4964 found_uncached_bg = true;
@@ -5219,7 +5515,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5219 u64 num_bytes = ins->offset; 5515 u64 num_bytes = ins->offset;
5220 5516
5221 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5517 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5222 cache_block_group(block_group); 5518 cache_block_group(block_group, trans, 0);
5223 caching_ctl = get_caching_control(block_group); 5519 caching_ctl = get_caching_control(block_group);
5224 5520
5225 if (!caching_ctl) { 5521 if (!caching_ctl) {
@@ -5309,7 +5605,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5309 block_rsv = get_block_rsv(trans, root); 5605 block_rsv = get_block_rsv(trans, root);
5310 5606
5311 if (block_rsv->size == 0) { 5607 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize); 5608 ret = reserve_metadata_bytes(trans, root, block_rsv,
5609 blocksize, 0);
5313 if (ret) 5610 if (ret)
5314 return ERR_PTR(ret); 5611 return ERR_PTR(ret);
5315 return block_rsv; 5612 return block_rsv;
@@ -5319,11 +5616,6 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5319 if (!ret) 5616 if (!ret)
5320 return block_rsv; 5617 return block_rsv;
5321 5618
5322 WARN_ON(1);
5323 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5324 block_rsv->size, block_rsv->reserved,
5325 block_rsv->freed[0], block_rsv->freed[1]);
5326
5327 return ERR_PTR(-ENOSPC); 5619 return ERR_PTR(-ENOSPC);
5328} 5620}
5329 5621
@@ -5422,7 +5714,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5422 u64 generation; 5714 u64 generation;
5423 u64 refs; 5715 u64 refs;
5424 u64 flags; 5716 u64 flags;
5425 u64 last = 0;
5426 u32 nritems; 5717 u32 nritems;
5427 u32 blocksize; 5718 u32 blocksize;
5428 struct btrfs_key key; 5719 struct btrfs_key key;
@@ -5490,7 +5781,6 @@ reada:
5490 generation); 5781 generation);
5491 if (ret) 5782 if (ret)
5492 break; 5783 break;
5493 last = bytenr + blocksize;
5494 nread++; 5784 nread++;
5495 } 5785 }
5496 wc->reada_slot = slot; 5786 wc->reada_slot = slot;
@@ -7814,6 +8104,40 @@ out:
7814 return ret; 8104 return ret;
7815} 8105}
7816 8106
8107void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8108{
8109 struct btrfs_block_group_cache *block_group;
8110 u64 last = 0;
8111
8112 while (1) {
8113 struct inode *inode;
8114
8115 block_group = btrfs_lookup_first_block_group(info, last);
8116 while (block_group) {
8117 spin_lock(&block_group->lock);
8118 if (block_group->iref)
8119 break;
8120 spin_unlock(&block_group->lock);
8121 block_group = next_block_group(info->tree_root,
8122 block_group);
8123 }
8124 if (!block_group) {
8125 if (last == 0)
8126 break;
8127 last = 0;
8128 continue;
8129 }
8130
8131 inode = block_group->inode;
8132 block_group->iref = 0;
8133 block_group->inode = NULL;
8134 spin_unlock(&block_group->lock);
8135 iput(inode);
8136 last = block_group->key.objectid + block_group->key.offset;
8137 btrfs_put_block_group(block_group);
8138 }
8139}
8140
7817int btrfs_free_block_groups(struct btrfs_fs_info *info) 8141int btrfs_free_block_groups(struct btrfs_fs_info *info)
7818{ 8142{
7819 struct btrfs_block_group_cache *block_group; 8143 struct btrfs_block_group_cache *block_group;
@@ -7897,6 +8221,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7897 struct btrfs_key key; 8221 struct btrfs_key key;
7898 struct btrfs_key found_key; 8222 struct btrfs_key found_key;
7899 struct extent_buffer *leaf; 8223 struct extent_buffer *leaf;
8224 int need_clear = 0;
8225 u64 cache_gen;
7900 8226
7901 root = info->extent_root; 8227 root = info->extent_root;
7902 key.objectid = 0; 8228 key.objectid = 0;
@@ -7906,6 +8232,15 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7906 if (!path) 8232 if (!path)
7907 return -ENOMEM; 8233 return -ENOMEM;
7908 8234
8235 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
8236 if (cache_gen != 0 &&
8237 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
8238 need_clear = 1;
8239 if (btrfs_test_opt(root, CLEAR_CACHE))
8240 need_clear = 1;
8241 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
8242 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
8243
7909 while (1) { 8244 while (1) {
7910 ret = find_first_block_group(root, path, &key); 8245 ret = find_first_block_group(root, path, &key);
7911 if (ret > 0) 8246 if (ret > 0)
@@ -7928,6 +8263,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7928 INIT_LIST_HEAD(&cache->list); 8263 INIT_LIST_HEAD(&cache->list);
7929 INIT_LIST_HEAD(&cache->cluster_list); 8264 INIT_LIST_HEAD(&cache->cluster_list);
7930 8265
8266 if (need_clear)
8267 cache->disk_cache_state = BTRFS_DC_CLEAR;
8268
7931 /* 8269 /*
7932 * we only want to have 32k of ram per block group for keeping 8270 * we only want to have 32k of ram per block group for keeping
7933 * track of free space, and if we pass 1/2 of that we want to 8271 * track of free space, and if we pass 1/2 of that we want to
@@ -8032,6 +8370,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8032 cache->key.offset = size; 8370 cache->key.offset = size;
8033 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8371 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8034 cache->sectorsize = root->sectorsize; 8372 cache->sectorsize = root->sectorsize;
8373 cache->fs_info = root->fs_info;
8035 8374
8036 /* 8375 /*
8037 * we only want to have 32k of ram per block group for keeping track 8376 * we only want to have 32k of ram per block group for keeping track
@@ -8088,8 +8427,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8088 struct btrfs_path *path; 8427 struct btrfs_path *path;
8089 struct btrfs_block_group_cache *block_group; 8428 struct btrfs_block_group_cache *block_group;
8090 struct btrfs_free_cluster *cluster; 8429 struct btrfs_free_cluster *cluster;
8430 struct btrfs_root *tree_root = root->fs_info->tree_root;
8091 struct btrfs_key key; 8431 struct btrfs_key key;
8432 struct inode *inode;
8092 int ret; 8433 int ret;
8434 int factor;
8093 8435
8094 root = root->fs_info->extent_root; 8436 root = root->fs_info->extent_root;
8095 8437
@@ -8098,6 +8440,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8098 BUG_ON(!block_group->ro); 8440 BUG_ON(!block_group->ro);
8099 8441
8100 memcpy(&key, &block_group->key, sizeof(key)); 8442 memcpy(&key, &block_group->key, sizeof(key));
8443 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8444 BTRFS_BLOCK_GROUP_RAID1 |
8445 BTRFS_BLOCK_GROUP_RAID10))
8446 factor = 2;
8447 else
8448 factor = 1;
8101 8449
8102 /* make sure this block group isn't part of an allocation cluster */ 8450 /* make sure this block group isn't part of an allocation cluster */
8103 cluster = &root->fs_info->data_alloc_cluster; 8451 cluster = &root->fs_info->data_alloc_cluster;
@@ -8117,6 +8465,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8117 path = btrfs_alloc_path(); 8465 path = btrfs_alloc_path();
8118 BUG_ON(!path); 8466 BUG_ON(!path);
8119 8467
8468 inode = lookup_free_space_inode(root, block_group, path);
8469 if (!IS_ERR(inode)) {
8470 btrfs_orphan_add(trans, inode);
8471 clear_nlink(inode);
8472 /* One for the block groups ref */
8473 spin_lock(&block_group->lock);
8474 if (block_group->iref) {
8475 block_group->iref = 0;
8476 block_group->inode = NULL;
8477 spin_unlock(&block_group->lock);
8478 iput(inode);
8479 } else {
8480 spin_unlock(&block_group->lock);
8481 }
8482 /* One for our lookup ref */
8483 iput(inode);
8484 }
8485
8486 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8487 key.offset = block_group->key.objectid;
8488 key.type = 0;
8489
8490 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8491 if (ret < 0)
8492 goto out;
8493 if (ret > 0)
8494 btrfs_release_path(tree_root, path);
8495 if (ret == 0) {
8496 ret = btrfs_del_item(trans, tree_root, path);
8497 if (ret)
8498 goto out;
8499 btrfs_release_path(tree_root, path);
8500 }
8501
8120 spin_lock(&root->fs_info->block_group_cache_lock); 8502 spin_lock(&root->fs_info->block_group_cache_lock);
8121 rb_erase(&block_group->cache_node, 8503 rb_erase(&block_group->cache_node,
8122 &root->fs_info->block_group_cache_tree); 8504 &root->fs_info->block_group_cache_tree);
@@ -8138,8 +8520,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8138 spin_lock(&block_group->space_info->lock); 8520 spin_lock(&block_group->space_info->lock);
8139 block_group->space_info->total_bytes -= block_group->key.offset; 8521 block_group->space_info->total_bytes -= block_group->key.offset;
8140 block_group->space_info->bytes_readonly -= block_group->key.offset; 8522 block_group->space_info->bytes_readonly -= block_group->key.offset;
8523 block_group->space_info->disk_total -= block_group->key.offset * factor;
8141 spin_unlock(&block_group->space_info->lock); 8524 spin_unlock(&block_group->space_info->lock);
8142 8525
8526 memcpy(&key, &block_group->key, sizeof(key));
8527
8143 btrfs_clear_space_info_full(root->fs_info); 8528 btrfs_clear_space_info_full(root->fs_info);
8144 8529
8145 btrfs_put_block_group(block_group); 8530 btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d74e6af9b53..eac10e3260a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,7 +104,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
104 struct address_space *mapping, gfp_t mask) 104 struct address_space *mapping, gfp_t mask)
105{ 105{
106 tree->state = RB_ROOT; 106 tree->state = RB_ROOT;
107 tree->buffer = RB_ROOT; 107 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
108 tree->ops = NULL; 108 tree->ops = NULL;
109 tree->dirty_bytes = 0; 109 tree->dirty_bytes = 0;
110 spin_lock_init(&tree->lock); 110 spin_lock_init(&tree->lock);
@@ -235,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
235 return ret; 235 return ret;
236} 236}
237 237
238static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
239 u64 offset, struct rb_node *node)
240{
241 struct rb_root *root = &tree->buffer;
242 struct rb_node **p = &root->rb_node;
243 struct rb_node *parent = NULL;
244 struct extent_buffer *eb;
245
246 while (*p) {
247 parent = *p;
248 eb = rb_entry(parent, struct extent_buffer, rb_node);
249
250 if (offset < eb->start)
251 p = &(*p)->rb_left;
252 else if (offset > eb->start)
253 p = &(*p)->rb_right;
254 else
255 return eb;
256 }
257
258 rb_link_node(node, parent, p);
259 rb_insert_color(node, root);
260 return NULL;
261}
262
263static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
264 u64 offset)
265{
266 struct rb_root *root = &tree->buffer;
267 struct rb_node *n = root->rb_node;
268 struct extent_buffer *eb;
269
270 while (n) {
271 eb = rb_entry(n, struct extent_buffer, rb_node);
272 if (offset < eb->start)
273 n = n->rb_left;
274 else if (offset > eb->start)
275 n = n->rb_right;
276 else
277 return eb;
278 }
279 return NULL;
280}
281
282static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 238static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
283 struct extent_state *other) 239 struct extent_state *other)
284{ 240{
@@ -1901,10 +1857,8 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1901 struct page *page = bvec->bv_page; 1857 struct page *page = bvec->bv_page;
1902 struct extent_io_tree *tree = bio->bi_private; 1858 struct extent_io_tree *tree = bio->bi_private;
1903 u64 start; 1859 u64 start;
1904 u64 end;
1905 1860
1906 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1861 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1907 end = start + bvec->bv_len - 1;
1908 1862
1909 bio->bi_private = NULL; 1863 bio->bi_private = NULL;
1910 1864
@@ -2204,7 +2158,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2204 u64 last_byte = i_size_read(inode); 2158 u64 last_byte = i_size_read(inode);
2205 u64 block_start; 2159 u64 block_start;
2206 u64 iosize; 2160 u64 iosize;
2207 u64 unlock_start;
2208 sector_t sector; 2161 sector_t sector;
2209 struct extent_state *cached_state = NULL; 2162 struct extent_state *cached_state = NULL;
2210 struct extent_map *em; 2163 struct extent_map *em;
@@ -2329,7 +2282,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2329 if (tree->ops && tree->ops->writepage_end_io_hook) 2282 if (tree->ops && tree->ops->writepage_end_io_hook)
2330 tree->ops->writepage_end_io_hook(page, start, 2283 tree->ops->writepage_end_io_hook(page, start,
2331 page_end, NULL, 1); 2284 page_end, NULL, 1);
2332 unlock_start = page_end + 1;
2333 goto done; 2285 goto done;
2334 } 2286 }
2335 2287
@@ -2340,7 +2292,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2340 if (tree->ops && tree->ops->writepage_end_io_hook) 2292 if (tree->ops && tree->ops->writepage_end_io_hook)
2341 tree->ops->writepage_end_io_hook(page, cur, 2293 tree->ops->writepage_end_io_hook(page, cur,
2342 page_end, NULL, 1); 2294 page_end, NULL, 1);
2343 unlock_start = page_end + 1;
2344 break; 2295 break;
2345 } 2296 }
2346 em = epd->get_extent(inode, page, pg_offset, cur, 2297 em = epd->get_extent(inode, page, pg_offset, cur,
@@ -2387,7 +2338,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2387 2338
2388 cur += iosize; 2339 cur += iosize;
2389 pg_offset += iosize; 2340 pg_offset += iosize;
2390 unlock_start = cur;
2391 continue; 2341 continue;
2392 } 2342 }
2393 /* leave this out until we have a page_mkwrite call */ 2343 /* leave this out until we have a page_mkwrite call */
@@ -2473,7 +2423,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2473 pgoff_t index; 2423 pgoff_t index;
2474 pgoff_t end; /* Inclusive */ 2424 pgoff_t end; /* Inclusive */
2475 int scanned = 0; 2425 int scanned = 0;
2476 int range_whole = 0;
2477 2426
2478 pagevec_init(&pvec, 0); 2427 pagevec_init(&pvec, 0);
2479 if (wbc->range_cyclic) { 2428 if (wbc->range_cyclic) {
@@ -2482,8 +2431,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2482 } else { 2431 } else {
2483 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2432 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2484 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2433 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2485 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2486 range_whole = 1;
2487 scanned = 1; 2434 scanned = 1;
2488 } 2435 }
2489retry: 2436retry:
@@ -2823,6 +2770,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
2823 NULL, 1, 2770 NULL, 1,
2824 end_bio_extent_preparewrite, 0, 2771 end_bio_extent_preparewrite, 0,
2825 0, 0); 2772 0, 0);
2773 if (ret && !err)
2774 err = ret;
2826 iocount++; 2775 iocount++;
2827 block_start = block_start + iosize; 2776 block_start = block_start + iosize;
2828 } else { 2777 } else {
@@ -3104,6 +3053,39 @@ static void __free_extent_buffer(struct extent_buffer *eb)
3104 kmem_cache_free(extent_buffer_cache, eb); 3053 kmem_cache_free(extent_buffer_cache, eb);
3105} 3054}
3106 3055
3056/*
3057 * Helper for releasing extent buffer page.
3058 */
3059static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3060 unsigned long start_idx)
3061{
3062 unsigned long index;
3063 struct page *page;
3064
3065 if (!eb->first_page)
3066 return;
3067
3068 index = num_extent_pages(eb->start, eb->len);
3069 if (start_idx >= index)
3070 return;
3071
3072 do {
3073 index--;
3074 page = extent_buffer_page(eb, index);
3075 if (page)
3076 page_cache_release(page);
3077 } while (index != start_idx);
3078}
3079
3080/*
3081 * Helper for releasing the extent buffer.
3082 */
3083static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3084{
3085 btrfs_release_extent_buffer_page(eb, 0);
3086 __free_extent_buffer(eb);
3087}
3088
3107struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3089struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3108 u64 start, unsigned long len, 3090 u64 start, unsigned long len,
3109 struct page *page0, 3091 struct page *page0,
@@ -3117,16 +3099,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3117 struct page *p; 3099 struct page *p;
3118 struct address_space *mapping = tree->mapping; 3100 struct address_space *mapping = tree->mapping;
3119 int uptodate = 1; 3101 int uptodate = 1;
3102 int ret;
3120 3103
3121 spin_lock(&tree->buffer_lock); 3104 rcu_read_lock();
3122 eb = buffer_search(tree, start); 3105 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3123 if (eb) { 3106 if (eb && atomic_inc_not_zero(&eb->refs)) {
3124 atomic_inc(&eb->refs); 3107 rcu_read_unlock();
3125 spin_unlock(&tree->buffer_lock);
3126 mark_page_accessed(eb->first_page); 3108 mark_page_accessed(eb->first_page);
3127 return eb; 3109 return eb;
3128 } 3110 }
3129 spin_unlock(&tree->buffer_lock); 3111 rcu_read_unlock();
3130 3112
3131 eb = __alloc_extent_buffer(tree, start, len, mask); 3113 eb = __alloc_extent_buffer(tree, start, len, mask);
3132 if (!eb) 3114 if (!eb)
@@ -3165,26 +3147,31 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3165 if (uptodate) 3147 if (uptodate)
3166 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3148 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3167 3149
3150 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3151 if (ret)
3152 goto free_eb;
3153
3168 spin_lock(&tree->buffer_lock); 3154 spin_lock(&tree->buffer_lock);
3169 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3155 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3170 if (exists) { 3156 if (ret == -EEXIST) {
3157 exists = radix_tree_lookup(&tree->buffer,
3158 start >> PAGE_CACHE_SHIFT);
3171 /* add one reference for the caller */ 3159 /* add one reference for the caller */
3172 atomic_inc(&exists->refs); 3160 atomic_inc(&exists->refs);
3173 spin_unlock(&tree->buffer_lock); 3161 spin_unlock(&tree->buffer_lock);
3162 radix_tree_preload_end();
3174 goto free_eb; 3163 goto free_eb;
3175 } 3164 }
3176 /* add one reference for the tree */ 3165 /* add one reference for the tree */
3177 atomic_inc(&eb->refs); 3166 atomic_inc(&eb->refs);
3178 spin_unlock(&tree->buffer_lock); 3167 spin_unlock(&tree->buffer_lock);
3168 radix_tree_preload_end();
3179 return eb; 3169 return eb;
3180 3170
3181free_eb: 3171free_eb:
3182 if (!atomic_dec_and_test(&eb->refs)) 3172 if (!atomic_dec_and_test(&eb->refs))
3183 return exists; 3173 return exists;
3184 for (index = 1; index < i; index++) 3174 btrfs_release_extent_buffer(eb);
3185 page_cache_release(extent_buffer_page(eb, index));
3186 page_cache_release(extent_buffer_page(eb, 0));
3187 __free_extent_buffer(eb);
3188 return exists; 3175 return exists;
3189} 3176}
3190 3177
@@ -3194,16 +3181,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3194{ 3181{
3195 struct extent_buffer *eb; 3182 struct extent_buffer *eb;
3196 3183
3197 spin_lock(&tree->buffer_lock); 3184 rcu_read_lock();
3198 eb = buffer_search(tree, start); 3185 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3199 if (eb) 3186 if (eb && atomic_inc_not_zero(&eb->refs)) {
3200 atomic_inc(&eb->refs); 3187 rcu_read_unlock();
3201 spin_unlock(&tree->buffer_lock);
3202
3203 if (eb)
3204 mark_page_accessed(eb->first_page); 3188 mark_page_accessed(eb->first_page);
3189 return eb;
3190 }
3191 rcu_read_unlock();
3205 3192
3206 return eb; 3193 return NULL;
3207} 3194}
3208 3195
3209void free_extent_buffer(struct extent_buffer *eb) 3196void free_extent_buffer(struct extent_buffer *eb)
@@ -3833,34 +3820,45 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3833 } 3820 }
3834} 3821}
3835 3822
3823static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3824{
3825 struct extent_buffer *eb =
3826 container_of(head, struct extent_buffer, rcu_head);
3827
3828 btrfs_release_extent_buffer(eb);
3829}
3830
3836int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3831int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3837{ 3832{
3838 u64 start = page_offset(page); 3833 u64 start = page_offset(page);
3839 struct extent_buffer *eb; 3834 struct extent_buffer *eb;
3840 int ret = 1; 3835 int ret = 1;
3841 unsigned long i;
3842 unsigned long num_pages;
3843 3836
3844 spin_lock(&tree->buffer_lock); 3837 spin_lock(&tree->buffer_lock);
3845 eb = buffer_search(tree, start); 3838 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3846 if (!eb) 3839 if (!eb)
3847 goto out; 3840 goto out;
3848 3841
3849 if (atomic_read(&eb->refs) > 1) { 3842 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3850 ret = 0; 3843 ret = 0;
3851 goto out; 3844 goto out;
3852 } 3845 }
3853 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3846
3847 /*
3848 * set @eb->refs to 0 if it is already 1, and then release the @eb.
3849 * Or go back.
3850 */
3851 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
3854 ret = 0; 3852 ret = 0;
3855 goto out; 3853 goto out;
3856 } 3854 }
3857 /* at this point we can safely release the extent buffer */ 3855
3858 num_pages = num_extent_pages(eb->start, eb->len); 3856 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3859 for (i = 0; i < num_pages; i++)
3860 page_cache_release(extent_buffer_page(eb, i));
3861 rb_erase(&eb->rb_node, &tree->buffer);
3862 __free_extent_buffer(eb);
3863out: 3857out:
3864 spin_unlock(&tree->buffer_lock); 3858 spin_unlock(&tree->buffer_lock);
3859
3860 /* at this point we can safely release the extent buffer */
3861 if (atomic_read(&eb->refs) == 0)
3862 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3865 return ret; 3863 return ret;
3866} 3864}
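
The release path above pairs with that lookup: the last reference is claimed with atomic_cmpxchg(&refs, 1, 0), which fails if a concurrent reader already bumped refs via atomic_inc_not_zero(); only the winner deletes the radix tree slot and defers the actual free to an RCU grace period. Reduced to its core:

	spin_lock(&tree->buffer_lock);
	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
		/* someone else still holds (or just grabbed) a reference */
		spin_unlock(&tree->buffer_lock);
		return 0;
	}
	radix_tree_delete(&tree->buffer, index);
	spin_unlock(&tree->buffer_lock);

	/* readers inside rcu_read_lock() may still see eb:
	 * free only after a grace period */
	call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);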
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5691c7b590d..1c6d4f342ef 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -85,7 +85,7 @@ struct extent_io_ops {
85 85
86struct extent_io_tree { 86struct extent_io_tree {
87 struct rb_root state; 87 struct rb_root state;
88 struct rb_root buffer; 88 struct radix_tree_root buffer;
89 struct address_space *mapping; 89 struct address_space *mapping;
90 u64 dirty_bytes; 90 u64 dirty_bytes;
91 spinlock_t lock; 91 spinlock_t lock;
@@ -123,7 +123,7 @@ struct extent_buffer {
123 unsigned long bflags; 123 unsigned long bflags;
124 atomic_t refs; 124 atomic_t refs;
125 struct list_head leak_list; 125 struct list_head leak_list;
126 struct rb_node rb_node; 126 struct rcu_head rcu_head;
127 127
128 /* the spinlock is used to protect most operations */ 128 /* the spinlock is used to protect most operations */
129 spinlock_t lock; 129 spinlock_t lock;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 454ca52d645..23cb8da3ff6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
335 goto out; 335 goto out;
336 } 336 }
337 if (IS_ERR(rb_node)) { 337 if (IS_ERR(rb_node)) {
338 em = ERR_PTR(PTR_ERR(rb_node)); 338 em = ERR_CAST(rb_node);
339 goto out; 339 goto out;
340 } 340 }
341 em = rb_entry(rb_node, struct extent_map, rb_node); 341 em = rb_entry(rb_node, struct extent_map, rb_node);
@@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
384 goto out; 384 goto out;
385 } 385 }
386 if (IS_ERR(rb_node)) { 386 if (IS_ERR(rb_node)) {
387 em = ERR_PTR(PTR_ERR(rb_node)); 387 em = ERR_CAST(rb_node);
388 goto out; 388 goto out;
389 } 389 }
390 em = rb_entry(rb_node, struct extent_map, rb_node); 390 em = rb_entry(rb_node, struct extent_map, rb_node);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f488fac04d9..22ee0dc2e6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,10 +23,761 @@
23#include "ctree.h" 23#include "ctree.h"
24#include "free-space-cache.h" 24#include "free-space-cache.h"
25#include "transaction.h" 25#include "transaction.h"
26#include "disk-io.h"
26 27
27#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 28#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
28#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 29#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
29 30
31static void recalculate_thresholds(struct btrfs_block_group_cache
32 *block_group);
33static int link_free_space(struct btrfs_block_group_cache *block_group,
34 struct btrfs_free_space *info);
35
36struct inode *lookup_free_space_inode(struct btrfs_root *root,
37 struct btrfs_block_group_cache
38 *block_group, struct btrfs_path *path)
39{
40 struct btrfs_key key;
41 struct btrfs_key location;
42 struct btrfs_disk_key disk_key;
43 struct btrfs_free_space_header *header;
44 struct extent_buffer *leaf;
45 struct inode *inode = NULL;
46 int ret;
47
48 spin_lock(&block_group->lock);
49 if (block_group->inode)
50 inode = igrab(block_group->inode);
51 spin_unlock(&block_group->lock);
52 if (inode)
53 return inode;
54
55 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
56 key.offset = block_group->key.objectid;
57 key.type = 0;
58
59 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
60 if (ret < 0)
61 return ERR_PTR(ret);
62 if (ret > 0) {
63 btrfs_release_path(root, path);
64 return ERR_PTR(-ENOENT);
65 }
66
67 leaf = path->nodes[0];
68 header = btrfs_item_ptr(leaf, path->slots[0],
69 struct btrfs_free_space_header);
70 btrfs_free_space_key(leaf, header, &disk_key);
71 btrfs_disk_key_to_cpu(&location, &disk_key);
72 btrfs_release_path(root, path);
73
74 inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
75 if (!inode)
76 return ERR_PTR(-ENOENT);
77 if (IS_ERR(inode))
78 return inode;
79 if (is_bad_inode(inode)) {
80 iput(inode);
81 return ERR_PTR(-ENOENT);
82 }
83
84 spin_lock(&block_group->lock);
85 if (!root->fs_info->closing) {
86 block_group->inode = igrab(inode);
87 block_group->iref = 1;
88 }
89 spin_unlock(&block_group->lock);
90
91 return inode;
92}
93
94int create_free_space_inode(struct btrfs_root *root,
95 struct btrfs_trans_handle *trans,
96 struct btrfs_block_group_cache *block_group,
97 struct btrfs_path *path)
98{
99 struct btrfs_key key;
100 struct btrfs_disk_key disk_key;
101 struct btrfs_free_space_header *header;
102 struct btrfs_inode_item *inode_item;
103 struct extent_buffer *leaf;
104 u64 objectid;
105 int ret;
106
107 ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
108 if (ret < 0)
109 return ret;
110
111 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
112 if (ret)
113 return ret;
114
115 leaf = path->nodes[0];
116 inode_item = btrfs_item_ptr(leaf, path->slots[0],
117 struct btrfs_inode_item);
118 btrfs_item_key(leaf, &disk_key, path->slots[0]);
119 memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
120 sizeof(*inode_item));
121 btrfs_set_inode_generation(leaf, inode_item, trans->transid);
122 btrfs_set_inode_size(leaf, inode_item, 0);
123 btrfs_set_inode_nbytes(leaf, inode_item, 0);
124 btrfs_set_inode_uid(leaf, inode_item, 0);
125 btrfs_set_inode_gid(leaf, inode_item, 0);
126 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
127 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
128 BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
129 btrfs_set_inode_nlink(leaf, inode_item, 1);
130 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
131 btrfs_set_inode_block_group(leaf, inode_item,
132 block_group->key.objectid);
133 btrfs_mark_buffer_dirty(leaf);
134 btrfs_release_path(root, path);
135
136 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
137 key.offset = block_group->key.objectid;
138 key.type = 0;
139
140 ret = btrfs_insert_empty_item(trans, root, path, &key,
141 sizeof(struct btrfs_free_space_header));
142 if (ret < 0) {
143 btrfs_release_path(root, path);
144 return ret;
145 }
146 leaf = path->nodes[0];
147 header = btrfs_item_ptr(leaf, path->slots[0],
148 struct btrfs_free_space_header);
149 memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
150 btrfs_set_free_space_key(leaf, header, &disk_key);
151 btrfs_mark_buffer_dirty(leaf);
152 btrfs_release_path(root, path);
153
154 return 0;
155}
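
Worth noting in the inode item above: the cache file is created with BTRFS_INODE_NODATASUM | BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC and mode S_IFREG | 0600. Data checksums are disabled because the cache carries its own per-page crc32c table (see load_free_space_cache below), and preallocation keeps the file from fragmenting as it is rewritten each commit. A hypothetical caller would pair the two helpers like this:

	/* sketch: create the cache inode once, look it up on later mounts */
	inode = lookup_free_space_inode(root, block_group, path);
	if (IS_ERR(inode) && PTR_ERR(inode) == -ENOENT) {
		ret = create_free_space_inode(root, trans, block_group, path);
		if (!ret)
			inode = lookup_free_space_inode(root, block_group,
							path);
	}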
156
157int btrfs_truncate_free_space_cache(struct btrfs_root *root,
158 struct btrfs_trans_handle *trans,
159 struct btrfs_path *path,
160 struct inode *inode)
161{
162 loff_t oldsize;
163 int ret = 0;
164
165 trans->block_rsv = root->orphan_block_rsv;
166 ret = btrfs_block_rsv_check(trans, root,
167 root->orphan_block_rsv,
168 0, 5);
169 if (ret)
170 return ret;
171
172 oldsize = i_size_read(inode);
173 btrfs_i_size_write(inode, 0);
174 truncate_pagecache(inode, oldsize, 0);
175
176 /*
177 * We don't need an orphan item because truncating the free space cache
178 * will never be split across transactions.
179 */
180 ret = btrfs_truncate_inode_items(trans, root, inode,
181 0, BTRFS_EXTENT_DATA_KEY);
182 if (ret) {
183 WARN_ON(1);
184 return ret;
185 }
186
187 return btrfs_update_inode(trans, root, inode);
188}
189
190static int readahead_cache(struct inode *inode)
191{
192 struct file_ra_state *ra;
193 unsigned long last_index;
194
195 ra = kzalloc(sizeof(*ra), GFP_NOFS);
196 if (!ra)
197 return -ENOMEM;
198
199 file_ra_state_init(ra, inode->i_mapping);
200 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
201
202 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
203
204 kfree(ra);
205
206 return 0;
207}
208
209int load_free_space_cache(struct btrfs_fs_info *fs_info,
210 struct btrfs_block_group_cache *block_group)
211{
212 struct btrfs_root *root = fs_info->tree_root;
213 struct inode *inode;
214 struct btrfs_free_space_header *header;
215 struct extent_buffer *leaf;
216 struct page *page;
217 struct btrfs_path *path;
218 u32 *checksums = NULL, *crc;
219 char *disk_crcs = NULL;
220 struct btrfs_key key;
221 struct list_head bitmaps;
222 u64 num_entries;
223 u64 num_bitmaps;
224 u64 generation;
225 u32 cur_crc = ~(u32)0;
226 pgoff_t index = 0;
227 unsigned long first_page_offset;
228 int num_checksums;
229 int ret = 0;
230
231 /*
232 * If we're unmounting then just return, since this does a search on the
233 * normal root and not the commit root and we could deadlock.
234 */
235 smp_mb();
236 if (fs_info->closing)
237 return 0;
238
239 /*
240 * If this block group has been marked to be cleared for one reason or
241 * another then we can't trust the on disk cache, so just return.
242 */
243 spin_lock(&block_group->lock);
244 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
245 spin_unlock(&block_group->lock);
246 return 0;
247 }
248 spin_unlock(&block_group->lock);
249
250 INIT_LIST_HEAD(&bitmaps);
251
252 path = btrfs_alloc_path();
253 if (!path)
254 return 0;
255
256 inode = lookup_free_space_inode(root, block_group, path);
257 if (IS_ERR(inode)) {
258 btrfs_free_path(path);
259 return 0;
260 }
261
262 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) {
264 btrfs_free_path(path);
265 goto out;
266 }
267
268 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
269 key.offset = block_group->key.objectid;
270 key.type = 0;
271
272 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
273 if (ret) {
274 btrfs_free_path(path);
275 goto out;
276 }
277
278 leaf = path->nodes[0];
279 header = btrfs_item_ptr(leaf, path->slots[0],
280 struct btrfs_free_space_header);
281 num_entries = btrfs_free_space_entries(leaf, header);
282 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
283 generation = btrfs_free_space_generation(leaf, header);
284 btrfs_free_path(path);
285
286 if (BTRFS_I(inode)->generation != generation) {
287 printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
288 " not match free space cache generation (%llu) for "
289 "block group %llu\n",
290 (unsigned long long)BTRFS_I(inode)->generation,
291 (unsigned long long)generation,
292 (unsigned long long)block_group->key.objectid);
293 goto out;
294 }
295
296 if (!num_entries)
297 goto out;
298
299 /* Setup everything for doing checksumming */
300 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
301 checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
302 if (!checksums)
303 goto out;
304 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
305 disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
306 if (!disk_crcs)
307 goto out;
308
309 ret = readahead_cache(inode);
310 if (ret) {
311 ret = 0;
312 goto out;
313 }
314
315 while (1) {
316 struct btrfs_free_space_entry *entry;
317 struct btrfs_free_space *e;
318 void *addr;
319 unsigned long offset = 0;
320 unsigned long start_offset = 0;
321 int need_loop = 0;
322
323 if (!num_entries && !num_bitmaps)
324 break;
325
326 if (index == 0) {
327 start_offset = first_page_offset;
328 offset = start_offset;
329 }
330
331 page = grab_cache_page(inode->i_mapping, index);
332 if (!page) {
333 ret = 0;
334 goto free_cache;
335 }
336
337 if (!PageUptodate(page)) {
338 btrfs_readpage(NULL, page);
339 lock_page(page);
340 if (!PageUptodate(page)) {
341 unlock_page(page);
342 page_cache_release(page);
343 printk(KERN_ERR "btrfs: error reading free "
344 "space cache: %llu\n",
345 (unsigned long long)
346 block_group->key.objectid);
347 goto free_cache;
348 }
349 }
350 addr = kmap(page);
351
352 if (index == 0) {
353 u64 *gen;
354
355 memcpy(disk_crcs, addr, first_page_offset);
356 gen = addr + (sizeof(u32) * num_checksums);
357 if (*gen != BTRFS_I(inode)->generation) {
358 printk(KERN_ERR "btrfs: space cache generation"
359 " (%llu) does not match inode (%llu) "
360 "for block group %llu\n",
361 (unsigned long long)*gen,
362 (unsigned long long)
363 BTRFS_I(inode)->generation,
364 (unsigned long long)
365 block_group->key.objectid);
366 kunmap(page);
367 unlock_page(page);
368 page_cache_release(page);
369 goto free_cache;
370 }
371 crc = (u32 *)disk_crcs;
372 }
373 entry = addr + start_offset;
374
375 /* First let's check our crc before we do anything fun */
376 cur_crc = ~(u32)0;
377 cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
378 PAGE_CACHE_SIZE - start_offset);
379 btrfs_csum_final(cur_crc, (char *)&cur_crc);
380 if (cur_crc != *crc) {
381 printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
382 "block group %llu\n", index,
383 (unsigned long long)block_group->key.objectid);
384 kunmap(page);
385 unlock_page(page);
386 page_cache_release(page);
387 goto free_cache;
388 }
389 crc++;
390
391 while (1) {
392 if (!num_entries)
393 break;
394
395 need_loop = 1;
396 e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
397 if (!e) {
398 kunmap(page);
399 unlock_page(page);
400 page_cache_release(page);
401 goto free_cache;
402 }
403
404 e->offset = le64_to_cpu(entry->offset);
405 e->bytes = le64_to_cpu(entry->bytes);
406 if (!e->bytes) {
407 kunmap(page);
408 kfree(e);
409 unlock_page(page);
410 page_cache_release(page);
411 goto free_cache;
412 }
413
414 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
415 spin_lock(&block_group->tree_lock);
416 ret = link_free_space(block_group, e);
417 spin_unlock(&block_group->tree_lock);
418 BUG_ON(ret);
419 } else {
420 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
421 if (!e->bitmap) {
422 kunmap(page);
423 kfree(e);
424 unlock_page(page);
425 page_cache_release(page);
426 goto free_cache;
427 }
428 spin_lock(&block_group->tree_lock);
429 ret = link_free_space(block_group, e);
430 block_group->total_bitmaps++;
431 recalculate_thresholds(block_group);
432 spin_unlock(&block_group->tree_lock);
433 list_add_tail(&e->list, &bitmaps);
434 }
435
436 num_entries--;
437 offset += sizeof(struct btrfs_free_space_entry);
438 if (offset + sizeof(struct btrfs_free_space_entry) >=
439 PAGE_CACHE_SIZE)
440 break;
441 entry++;
442 }
443
444 /*
445 * We read an entry out of this page, so we need to move on to the
446 * next page.
447 */
448 if (need_loop) {
449 kunmap(page);
450 goto next;
451 }
452
453 /*
454 * The bitmaps are stored after all of the extent entries, in the
455 * same order that their bitmap entries were added to the cache.
456 */
457 e = list_entry(bitmaps.next, struct btrfs_free_space, list);
458 list_del_init(&e->list);
459 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
460 kunmap(page);
461 num_bitmaps--;
462next:
463 unlock_page(page);
464 page_cache_release(page);
465 index++;
466 }
467
468 ret = 1;
469out:
470 kfree(checksums);
471 kfree(disk_crcs);
472 iput(inode);
473 return ret;
474
475free_cache:
476 /* This cache is bogus, make sure it gets cleared */
477 spin_lock(&block_group->lock);
478 block_group->disk_cache_state = BTRFS_DC_CLEAR;
479 spin_unlock(&block_group->lock);
480 btrfs_remove_free_space_cache(block_group);
481 goto out;
482}
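
To make the parsing above easier to follow: page 0 of the cache file holds one u32 crc per page followed by a u64 generation, and the first free space entries start immediately after that header; later pages hold either packed entries or raw bitmaps. A standalone sketch of the offset arithmetic, assuming 4K pages and the packed 17-byte layout of btrfs_free_space_entry (the sizes here are illustrative, not taken from a real filesystem):

#include <stdint.h>
#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL	/* assumption: 4K pages */

struct free_space_entry {	/* mirrors btrfs_free_space_entry */
	uint64_t offset;
	uint64_t bytes;
	uint8_t type;
} __attribute__((packed));

int main(void)
{
	unsigned long cache_size = 16 * PAGE_CACHE_SIZE;  /* example i_size */
	unsigned long num_checksums = cache_size / PAGE_CACHE_SIZE;
	/* crc array plus the u64 generation, as in the code above */
	unsigned long first_page_offset =
		sizeof(uint32_t) * num_checksums + sizeof(uint64_t);
	unsigned long first_page_entries =
		(PAGE_CACHE_SIZE - first_page_offset) /
		sizeof(struct free_space_entry);

	printf("header bytes in page 0: %lu\n", first_page_offset);
	printf("entries in page 0:      %lu\n", first_page_entries);
	/* upper bound; the parse loop stops when the next entry no
	 * longer fits in the page */
	printf("entries per later page: %lu\n",
	       (unsigned long)(PAGE_CACHE_SIZE /
			       sizeof(struct free_space_entry)));
	return 0;
}
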
483
484int btrfs_write_out_cache(struct btrfs_root *root,
485 struct btrfs_trans_handle *trans,
486 struct btrfs_block_group_cache *block_group,
487 struct btrfs_path *path)
488{
489 struct btrfs_free_space_header *header;
490 struct extent_buffer *leaf;
491 struct inode *inode;
492 struct rb_node *node;
493 struct list_head *pos, *n;
494 struct page *page;
495 struct extent_state *cached_state = NULL;
496 struct list_head bitmap_list;
497 struct btrfs_key key;
498 u64 bytes = 0;
499 u32 *crc, *checksums;
500 pgoff_t index = 0, last_index = 0;
501 unsigned long first_page_offset;
502 int num_checksums;
503 int entries = 0;
504 int bitmaps = 0;
505 int ret = 0;
506
507 root = root->fs_info->tree_root;
508
509 INIT_LIST_HEAD(&bitmap_list);
510
511 spin_lock(&block_group->lock);
512 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
513 spin_unlock(&block_group->lock);
514 return 0;
515 }
516 spin_unlock(&block_group->lock);
517
518 inode = lookup_free_space_inode(root, block_group, path);
519 if (IS_ERR(inode))
520 return 0;
521
522 if (!i_size_read(inode)) {
523 iput(inode);
524 return 0;
525 }
526
527 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
528 filemap_write_and_wait(inode->i_mapping);
529 btrfs_wait_ordered_range(inode, inode->i_size &
530 ~(root->sectorsize - 1), (u64)-1);
531
532 /* We need a checksum per page. */
533 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
534 crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
535 if (!crc) {
536 iput(inode);
537 return 0;
538 }
539
540 /* Since the first page has all of our checksums and our generation, we
541 * need to calculate the offset into the page at which we can start writing
542 * our entries.
543 */
544 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
545
546 node = rb_first(&block_group->free_space_offset);
547 if (!node)
548 goto out_free;
549
550 /*
551 * Lock all pages first so we can lock the extent safely.
552 *
553 * NOTE: Because we hold the ref the entire time we're going to write to
554 * the page, find_get_page should never fail, so we don't do a check
555 * after find_get_page at this point. Just putting this here so people
556 * know and don't freak out.
557 */
558 while (index <= last_index) {
559 page = grab_cache_page(inode->i_mapping, index);
560 if (!page) {
561 pgoff_t i = 0;
562
563 while (i < index) {
564 page = find_get_page(inode->i_mapping, i);
565 unlock_page(page);
566 page_cache_release(page);
567 page_cache_release(page);
568 i++;
569 }
570 goto out_free;
571 }
572 index++;
573 }
574
575 index = 0;
576 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
577 0, &cached_state, GFP_NOFS);
578
579 /* Write out the extent entries */
580 do {
581 struct btrfs_free_space_entry *entry;
582 void *addr;
583 unsigned long offset = 0;
584 unsigned long start_offset = 0;
585
586 if (index == 0) {
587 start_offset = first_page_offset;
588 offset = start_offset;
589 }
590
591 page = find_get_page(inode->i_mapping, index);
592
593 addr = kmap(page);
594 entry = addr + start_offset;
595
596 memset(addr, 0, PAGE_CACHE_SIZE);
597 while (1) {
598 struct btrfs_free_space *e;
599
600 e = rb_entry(node, struct btrfs_free_space, offset_index);
601 entries++;
602
603 entry->offset = cpu_to_le64(e->offset);
604 entry->bytes = cpu_to_le64(e->bytes);
605 if (e->bitmap) {
606 entry->type = BTRFS_FREE_SPACE_BITMAP;
607 list_add_tail(&e->list, &bitmap_list);
608 bitmaps++;
609 } else {
610 entry->type = BTRFS_FREE_SPACE_EXTENT;
611 }
612 node = rb_next(node);
613 if (!node)
614 break;
615 offset += sizeof(struct btrfs_free_space_entry);
616 if (offset + sizeof(struct btrfs_free_space_entry) >=
617 PAGE_CACHE_SIZE)
618 break;
619 entry++;
620 }
621 *crc = ~(u32)0;
622 *crc = btrfs_csum_data(root, addr + start_offset, *crc,
623 PAGE_CACHE_SIZE - start_offset);
624 kunmap(page);
625
626 btrfs_csum_final(*crc, (char *)crc);
627 crc++;
628
629 bytes += PAGE_CACHE_SIZE;
630
631 ClearPageChecked(page);
632 set_page_extent_mapped(page);
633 SetPageUptodate(page);
634 set_page_dirty(page);
635
636 /*
637 * We need to release the reference we got from grab_cache_page,
638 * except for the first page, which will hold our checksums; we
639 * do that below.
640 */
641 if (index != 0) {
642 unlock_page(page);
643 page_cache_release(page);
644 }
645
646 page_cache_release(page);
647
648 index++;
649 } while (node);
650
651 /* Write out the bitmaps */
652 list_for_each_safe(pos, n, &bitmap_list) {
653 void *addr;
654 struct btrfs_free_space *entry =
655 list_entry(pos, struct btrfs_free_space, list);
656
657 page = find_get_page(inode->i_mapping, index);
658
659 addr = kmap(page);
660 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
661 *crc = ~(u32)0;
662 *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
663 kunmap(page);
664 btrfs_csum_final(*crc, (char *)crc);
665 crc++;
666 bytes += PAGE_CACHE_SIZE;
667
668 ClearPageChecked(page);
669 set_page_extent_mapped(page);
670 SetPageUptodate(page);
671 set_page_dirty(page);
672 unlock_page(page);
673 page_cache_release(page);
674 page_cache_release(page);
675 list_del_init(&entry->list);
676 index++;
677 }
678
679 /* Zero out the rest of the pages just to make sure */
680 while (index <= last_index) {
681 void *addr;
682
683 page = find_get_page(inode->i_mapping, index);
684
685 addr = kmap(page);
686 memset(addr, 0, PAGE_CACHE_SIZE);
687 kunmap(page);
688 ClearPageChecked(page);
689 set_page_extent_mapped(page);
690 SetPageUptodate(page);
691 set_page_dirty(page);
692 unlock_page(page);
693 page_cache_release(page);
694 page_cache_release(page);
695 bytes += PAGE_CACHE_SIZE;
696 index++;
697 }
698
699 btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
700
701 /* Write the checksums and trans id to the first page */
702 {
703 void *addr;
704 u64 *gen;
705
706 page = find_get_page(inode->i_mapping, 0);
707
708 addr = kmap(page);
709 memcpy(addr, checksums, sizeof(u32) * num_checksums);
710 gen = addr + (sizeof(u32) * num_checksums);
711 *gen = trans->transid;
712 kunmap(page);
713 ClearPageChecked(page);
714 set_page_extent_mapped(page);
715 SetPageUptodate(page);
716 set_page_dirty(page);
717 unlock_page(page);
718 page_cache_release(page);
719 page_cache_release(page);
720 }
721 BTRFS_I(inode)->generation = trans->transid;
722
723 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
724 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
725
726 filemap_write_and_wait(inode->i_mapping);
727
728 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
729 key.offset = block_group->key.objectid;
730 key.type = 0;
731
732 ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
733 if (ret < 0) {
734 ret = 0;
735 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
736 EXTENT_DIRTY | EXTENT_DELALLOC |
737 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
738 goto out_free;
739 }
740 leaf = path->nodes[0];
741 if (ret > 0) {
742 struct btrfs_key found_key;
743 BUG_ON(!path->slots[0]);
744 path->slots[0]--;
745 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
746 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
747 found_key.offset != block_group->key.objectid) {
748 ret = 0;
749 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
750 EXTENT_DIRTY | EXTENT_DELALLOC |
751 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
752 GFP_NOFS);
753 btrfs_release_path(root, path);
754 goto out_free;
755 }
756 }
757 header = btrfs_item_ptr(leaf, path->slots[0],
758 struct btrfs_free_space_header);
759 btrfs_set_free_space_entries(leaf, header, entries);
760 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
761 btrfs_set_free_space_generation(leaf, header, trans->transid);
762 btrfs_mark_buffer_dirty(leaf);
763 btrfs_release_path(root, path);
764
765 ret = 1;
766
767out_free:
768 if (ret == 0) {
769 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
770 spin_lock(&block_group->lock);
771 block_group->disk_cache_state = BTRFS_DC_ERROR;
772 spin_unlock(&block_group->lock);
773 BTRFS_I(inode)->generation = 0;
774 }
775 kfree(checksums);
776 btrfs_update_inode(trans, root, inode);
777 iput(inode);
778 return ret;
779}
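
For the checksum flow above: btrfs_csum_data() is a CRC32C seeded per page, and btrfs_csum_final() inverts the result before it is stored in page 0. A userspace model of one page's checksum computation, with a bitwise CRC32C standing in for the kernel helper (a sketch under that assumption, not the kernel implementation):

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* Bitwise CRC32C (Castagnoli, reflected polynomial 0x82F63B78); a
 * stand-in for btrfs_csum_data(). */
static uint32_t crc32c_update(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	uint8_t page[PAGE_SIZE];
	uint32_t crc;

	memset(page, 0xab, sizeof(page));

	/* Mirror the write path: seed with ~0, checksum the payload,
	 * then invert, which is what btrfs_csum_final() does. */
	crc = crc32c_update(~(uint32_t)0, page, sizeof(page));
	crc = ~crc;
	printf("page crc: 0x%08x\n", crc);
	return 0;
}
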
780
30 781 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
31 782 	u64 offset)
32 783 {
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 890a8e79011..e49ca5c321b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,24 @@ struct btrfs_free_space {
27 27 	struct list_head list;
28 28 };
29 29
30struct inode *lookup_free_space_inode(struct btrfs_root *root,
31 struct btrfs_block_group_cache
32 *block_group, struct btrfs_path *path);
33int create_free_space_inode(struct btrfs_root *root,
34 struct btrfs_trans_handle *trans,
35 struct btrfs_block_group_cache *block_group,
36 struct btrfs_path *path);
37
38int btrfs_truncate_free_space_cache(struct btrfs_root *root,
39 struct btrfs_trans_handle *trans,
40 struct btrfs_path *path,
41 struct inode *inode);
42int load_free_space_cache(struct btrfs_fs_info *fs_info,
43 struct btrfs_block_group_cache *block_group);
44int btrfs_write_out_cache(struct btrfs_root *root,
45 struct btrfs_trans_handle *trans,
46 struct btrfs_block_group_cache *block_group,
47 struct btrfs_path *path);
30 48 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
31 49 	u64 bytenr, u64 size);
32 50 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1bff92ad474..558cac2dfa5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -319,8 +319,6 @@ static noinline int compress_file_range(struct inode *inode,
319 struct btrfs_root *root = BTRFS_I(inode)->root; 319 struct btrfs_root *root = BTRFS_I(inode)->root;
320 struct btrfs_trans_handle *trans; 320 struct btrfs_trans_handle *trans;
321 u64 num_bytes; 321 u64 num_bytes;
322 u64 orig_start;
323 u64 disk_num_bytes;
324 u64 blocksize = root->sectorsize; 322 u64 blocksize = root->sectorsize;
325 u64 actual_end; 323 u64 actual_end;
326 u64 isize = i_size_read(inode); 324 u64 isize = i_size_read(inode);
@@ -335,8 +333,6 @@ static noinline int compress_file_range(struct inode *inode,
335 int i; 333 int i;
336 int will_compress; 334 int will_compress;
337 335
338 orig_start = start;
339
340 actual_end = min_t(u64, isize, end + 1); 336 actual_end = min_t(u64, isize, end + 1);
341again: 337again:
342 will_compress = 0; 338 will_compress = 0;
@@ -371,7 +367,6 @@ again:
371 total_compressed = min(total_compressed, max_uncompressed); 367 total_compressed = min(total_compressed, max_uncompressed);
372 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 368 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
373 num_bytes = max(blocksize, num_bytes); 369 num_bytes = max(blocksize, num_bytes);
374 disk_num_bytes = num_bytes;
375 total_in = 0; 370 total_in = 0;
376 ret = 0; 371 ret = 0;
377 372
@@ -467,7 +462,6 @@ again:
467 if (total_compressed >= total_in) { 462 if (total_compressed >= total_in) {
468 will_compress = 0; 463 will_compress = 0;
469 } else { 464 } else {
470 disk_num_bytes = total_compressed;
471 num_bytes = total_in; 465 num_bytes = total_in;
472 } 466 }
473 } 467 }
@@ -757,20 +751,17 @@ static noinline int cow_file_range(struct inode *inode,
757 u64 disk_num_bytes; 751 u64 disk_num_bytes;
758 u64 cur_alloc_size; 752 u64 cur_alloc_size;
759 u64 blocksize = root->sectorsize; 753 u64 blocksize = root->sectorsize;
760 u64 actual_end;
761 u64 isize = i_size_read(inode);
762 struct btrfs_key ins; 754 struct btrfs_key ins;
763 struct extent_map *em; 755 struct extent_map *em;
764 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 756 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
765 int ret = 0; 757 int ret = 0;
766 758
759 BUG_ON(root == root->fs_info->tree_root);
767 trans = btrfs_join_transaction(root, 1); 760 trans = btrfs_join_transaction(root, 1);
768 BUG_ON(!trans); 761 BUG_ON(!trans);
769 btrfs_set_trans_block_group(trans, inode); 762 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 763 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
771 764
772 actual_end = min_t(u64, isize, end + 1);
773
774 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 765 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
775 num_bytes = max(blocksize, num_bytes); 766 num_bytes = max(blocksize, num_bytes);
776 disk_num_bytes = num_bytes; 767 disk_num_bytes = num_bytes;
@@ -1035,10 +1026,16 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1035 int type; 1026 int type;
1036 int nocow; 1027 int nocow;
1037 int check_prev = 1; 1028 int check_prev = 1;
1029 bool nolock = false;
1038 1030
1039 path = btrfs_alloc_path(); 1031 path = btrfs_alloc_path();
1040 BUG_ON(!path); 1032 BUG_ON(!path);
1041 trans = btrfs_join_transaction(root, 1); 1033 if (root == root->fs_info->tree_root) {
1034 nolock = true;
1035 trans = btrfs_join_transaction_nolock(root, 1);
1036 } else {
1037 trans = btrfs_join_transaction(root, 1);
1038 }
1042 BUG_ON(!trans); 1039 BUG_ON(!trans);
1043 1040
1044 cow_start = (u64)-1; 1041 cow_start = (u64)-1;
@@ -1211,8 +1208,13 @@ out_check:
1211 BUG_ON(ret); 1208 BUG_ON(ret);
1212 } 1209 }
1213 1210
1214 ret = btrfs_end_transaction(trans, root); 1211 if (nolock) {
1215 BUG_ON(ret); 1212 ret = btrfs_end_transaction_nolock(trans, root);
1213 BUG_ON(ret);
1214 } else {
1215 ret = btrfs_end_transaction(trans, root);
1216 BUG_ON(ret);
1217 }
1216 btrfs_free_path(path); 1218 btrfs_free_path(path);
1217 return 0; 1219 return 0;
1218} 1220}
@@ -1289,6 +1291,8 @@ static int btrfs_set_bit_hook(struct inode *inode,
1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1291 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1290 struct btrfs_root *root = BTRFS_I(inode)->root; 1292 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start; 1293 u64 len = state->end + 1 - state->start;
1294 int do_list = (root->root_key.objectid !=
1295 BTRFS_ROOT_TREE_OBJECTID);
1292 1296
1293 if (*bits & EXTENT_FIRST_DELALLOC) 1297 if (*bits & EXTENT_FIRST_DELALLOC)
1294 *bits &= ~EXTENT_FIRST_DELALLOC; 1298 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1298,7 +1302,7 @@ static int btrfs_set_bit_hook(struct inode *inode,
1298 spin_lock(&root->fs_info->delalloc_lock); 1302 spin_lock(&root->fs_info->delalloc_lock);
1299 BTRFS_I(inode)->delalloc_bytes += len; 1303 BTRFS_I(inode)->delalloc_bytes += len;
1300 root->fs_info->delalloc_bytes += len; 1304 root->fs_info->delalloc_bytes += len;
1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1305 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1306 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1303 &root->fs_info->delalloc_inodes); 1307 &root->fs_info->delalloc_inodes);
1304 } 1308 }
@@ -1321,6 +1325,8 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1325 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1322 struct btrfs_root *root = BTRFS_I(inode)->root; 1326 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start; 1327 u64 len = state->end + 1 - state->start;
1328 int do_list = (root->root_key.objectid !=
1329 BTRFS_ROOT_TREE_OBJECTID);
1324 1330
1325 if (*bits & EXTENT_FIRST_DELALLOC) 1331 if (*bits & EXTENT_FIRST_DELALLOC)
1326 *bits &= ~EXTENT_FIRST_DELALLOC; 1332 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1330,14 +1336,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1330 if (*bits & EXTENT_DO_ACCOUNTING) 1336 if (*bits & EXTENT_DO_ACCOUNTING)
1331 btrfs_delalloc_release_metadata(inode, len); 1337 btrfs_delalloc_release_metadata(inode, len);
1332 1338
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) 1339 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1340 && do_list)
1334 btrfs_free_reserved_data_space(inode, len); 1341 btrfs_free_reserved_data_space(inode, len);
1335 1342
1336 spin_lock(&root->fs_info->delalloc_lock); 1343 spin_lock(&root->fs_info->delalloc_lock);
1337 root->fs_info->delalloc_bytes -= len; 1344 root->fs_info->delalloc_bytes -= len;
1338 BTRFS_I(inode)->delalloc_bytes -= len; 1345 BTRFS_I(inode)->delalloc_bytes -= len;
1339 1346
1340 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1347 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1348 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1349 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1343 } 1350 }
@@ -1372,7 +1379,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1372 1379
1373 if (map_length < length + size) 1380 if (map_length < length + size)
1374 return 1; 1381 return 1;
1375 return 0; 1382 return ret;
1376} 1383}
1377 1384
1378/* 1385/*
@@ -1426,10 +1433,13 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1426 1433
1427 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1434 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1428 1435
1429 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1436 if (root == root->fs_info->tree_root)
1437 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1438 else
1439 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1430 BUG_ON(ret); 1440 BUG_ON(ret);
1431 1441
1432 if (!(rw & (1 << BIO_RW))) { 1442 if (!(rw & REQ_WRITE)) {
1433 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1443 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1434 return btrfs_submit_compressed_read(inode, bio, 1444 return btrfs_submit_compressed_read(inode, bio,
1435 mirror_num, bio_flags); 1445 mirror_num, bio_flags);
@@ -1662,6 +1672,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1662 struct extent_state *cached_state = NULL; 1672 struct extent_state *cached_state = NULL;
1663 int compressed = 0; 1673 int compressed = 0;
1664 int ret; 1674 int ret;
1675 bool nolock = false;
1665 1676
1666 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 1677 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1667 end - start + 1); 1678 end - start + 1);
@@ -1669,11 +1680,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1669 return 0; 1680 return 0;
1670 BUG_ON(!ordered_extent); 1681 BUG_ON(!ordered_extent);
1671 1682
1683 nolock = (root == root->fs_info->tree_root);
1684
1672 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1685 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1673 BUG_ON(!list_empty(&ordered_extent->list)); 1686 BUG_ON(!list_empty(&ordered_extent->list));
1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1687 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1675 if (!ret) { 1688 if (!ret) {
1676 trans = btrfs_join_transaction(root, 1); 1689 if (nolock)
1690 trans = btrfs_join_transaction_nolock(root, 1);
1691 else
1692 trans = btrfs_join_transaction(root, 1);
1693 BUG_ON(!trans);
1677 btrfs_set_trans_block_group(trans, inode); 1694 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1695 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1679 ret = btrfs_update_inode(trans, root, inode); 1696 ret = btrfs_update_inode(trans, root, inode);
@@ -1686,7 +1703,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1686 ordered_extent->file_offset + ordered_extent->len - 1, 1703 ordered_extent->file_offset + ordered_extent->len - 1,
1687 0, &cached_state, GFP_NOFS); 1704 0, &cached_state, GFP_NOFS);
1688 1705
1689 trans = btrfs_join_transaction(root, 1); 1706 if (nolock)
1707 trans = btrfs_join_transaction_nolock(root, 1);
1708 else
1709 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode); 1710 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1711 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1692 1712
@@ -1700,6 +1720,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1700 ordered_extent->len); 1720 ordered_extent->len);
1701 BUG_ON(ret); 1721 BUG_ON(ret);
1702 } else { 1722 } else {
1723 BUG_ON(root == root->fs_info->tree_root);
1703 ret = insert_reserved_file_extent(trans, inode, 1724 ret = insert_reserved_file_extent(trans, inode,
1704 ordered_extent->file_offset, 1725 ordered_extent->file_offset,
1705 ordered_extent->start, 1726 ordered_extent->start,
@@ -1724,9 +1745,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1724 ret = btrfs_update_inode(trans, root, inode); 1745 ret = btrfs_update_inode(trans, root, inode);
1725 BUG_ON(ret); 1746 BUG_ON(ret);
1726out: 1747out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1748 if (nolock) {
1728 if (trans) 1749 if (trans)
1729 btrfs_end_transaction(trans, root); 1750 btrfs_end_transaction_nolock(trans, root);
1751 } else {
1752 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1753 if (trans)
1754 btrfs_end_transaction(trans, root);
1755 }
1756
1730 /* once for us */ 1757 /* once for us */
1731 btrfs_put_ordered_extent(ordered_extent); 1758 btrfs_put_ordered_extent(ordered_extent);
1732 /* once for the tree */ 1759 /* once for the tree */
@@ -1841,7 +1868,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1841 bio->bi_size = 0; 1868 bio->bi_size = 0;
1842 1869
1843 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 1870 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1844 if (failed_bio->bi_rw & (1 << BIO_RW)) 1871 if (failed_bio->bi_rw & REQ_WRITE)
1845 rw = WRITE; 1872 rw = WRITE;
1846 else 1873 else
1847 rw = READ; 1874 rw = READ;
@@ -2237,7 +2264,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2237{ 2264{
2238 struct btrfs_path *path; 2265 struct btrfs_path *path;
2239 struct extent_buffer *leaf; 2266 struct extent_buffer *leaf;
2240 struct btrfs_item *item;
2241 struct btrfs_key key, found_key; 2267 struct btrfs_key key, found_key;
2242 struct btrfs_trans_handle *trans; 2268 struct btrfs_trans_handle *trans;
2243 struct inode *inode; 2269 struct inode *inode;
@@ -2275,7 +2301,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2275 2301
2276 /* pull out the item */ 2302 /* pull out the item */
2277 leaf = path->nodes[0]; 2303 leaf = path->nodes[0];
2278 item = btrfs_item_nr(leaf, path->slots[0]);
2279 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2304 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2280 2305
2281 /* make sure the item matches what we want */ 2306 /* make sure the item matches what we want */
@@ -2651,7 +2676,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2651 2676
2652 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2677 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2653 dir, index); 2678 dir, index);
2654 BUG_ON(ret); 2679 if (ret == -ENOENT)
2680 ret = 0;
2655err: 2681err:
2656 btrfs_free_path(path); 2682 btrfs_free_path(path);
2657 if (ret) 2683 if (ret)
@@ -2672,8 +2698,8 @@ static int check_path_shared(struct btrfs_root *root,
2672{ 2698{
2673 struct extent_buffer *eb; 2699 struct extent_buffer *eb;
2674 int level; 2700 int level;
2675 int ret;
2676 u64 refs = 1; 2701 u64 refs = 1;
2702 int uninitialized_var(ret);
2677 2703
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2704 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level]) 2705 if (!path->nodes[level])
@@ -2686,7 +2712,7 @@ static int check_path_shared(struct btrfs_root *root,
2686 if (refs > 1) 2712 if (refs > 1)
2687 return 1; 2713 return 1;
2688 } 2714 }
2689 return 0; 2715 return ret; /* XXX callers? */
2690} 2716}
2691 2717
2692/* 2718/*
@@ -2938,7 +2964,6 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2938 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2964 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2939 ret = btrfs_update_inode(trans, root, dir); 2965 ret = btrfs_update_inode(trans, root, dir);
2940 BUG_ON(ret); 2966 BUG_ON(ret);
2941 dir->i_sb->s_dirt = 1;
2942 2967
2943 btrfs_free_path(path); 2968 btrfs_free_path(path);
2944 return 0; 2969 return 0;
@@ -3197,7 +3222,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3197 3222
3198 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3223 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3199 3224
3200 if (root->ref_cows) 3225 if (root->ref_cows || root == root->fs_info->tree_root)
3201 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3226 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
3202 3227
3203 path = btrfs_alloc_path(); 3228 path = btrfs_alloc_path();
@@ -3345,7 +3370,8 @@ delete:
3345 } else { 3370 } else {
3346 break; 3371 break;
3347 } 3372 }
3348 if (found_extent && root->ref_cows) { 3373 if (found_extent && (root->ref_cows ||
3374 root == root->fs_info->tree_root)) {
3349 btrfs_set_path_blocking(path); 3375 btrfs_set_path_blocking(path);
3350 ret = btrfs_free_extent(trans, root, extent_start, 3376 ret = btrfs_free_extent(trans, root, extent_start,
3351 extent_num_bytes, 0, 3377 extent_num_bytes, 0,
@@ -3656,17 +3682,19 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3656 if (err) 3682 if (err)
3657 return err; 3683 return err;
3658 } 3684 }
3659 attr->ia_valid &= ~ATTR_SIZE;
3660 3685
3661 if (attr->ia_valid) 3686 if (attr->ia_valid) {
3662 err = inode_setattr(inode, attr); 3687 setattr_copy(inode, attr);
3688 mark_inode_dirty(inode);
3689
3690 if (attr->ia_valid & ATTR_MODE)
3691 err = btrfs_acl_chmod(inode);
3692 }
3663 3693
3664 if (!err && ((attr->ia_valid & ATTR_MODE)))
3665 err = btrfs_acl_chmod(inode);
3666 return err; 3694 return err;
3667} 3695}
3668 3696
3669void btrfs_delete_inode(struct inode *inode) 3697void btrfs_evict_inode(struct inode *inode)
3670{ 3698{
3671 struct btrfs_trans_handle *trans; 3699 struct btrfs_trans_handle *trans;
3672 struct btrfs_root *root = BTRFS_I(inode)->root; 3700 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3674,10 +3702,15 @@ void btrfs_delete_inode(struct inode *inode)
3674 int ret; 3702 int ret;
3675 3703
3676 truncate_inode_pages(&inode->i_data, 0); 3704 truncate_inode_pages(&inode->i_data, 0);
3705 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3706 root == root->fs_info->tree_root))
3707 goto no_delete;
3708
3677 if (is_bad_inode(inode)) { 3709 if (is_bad_inode(inode)) {
3678 btrfs_orphan_del(NULL, inode); 3710 btrfs_orphan_del(NULL, inode);
3679 goto no_delete; 3711 goto no_delete;
3680 } 3712 }
3713 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
3681 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3714 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3682 3715
3683 if (root->fs_info->log_root_recovering) { 3716 if (root->fs_info->log_root_recovering) {
@@ -3727,7 +3760,7 @@ void btrfs_delete_inode(struct inode *inode)
3727 btrfs_end_transaction(trans, root); 3760 btrfs_end_transaction(trans, root);
3728 btrfs_btree_balance_dirty(root, nr); 3761 btrfs_btree_balance_dirty(root, nr);
3729no_delete: 3762no_delete:
3730 clear_inode(inode); 3763 end_writeback(inode);
3731 return; 3764 return;
3732} 3765}
3733 3766
@@ -3844,7 +3877,7 @@ again:
3844 p = &root->inode_tree.rb_node; 3877 p = &root->inode_tree.rb_node;
3845 parent = NULL; 3878 parent = NULL;
3846 3879
3847 if (hlist_unhashed(&inode->i_hash)) 3880 if (inode_unhashed(inode))
3848 return; 3881 return;
3849 3882
3850 spin_lock(&root->inode_lock); 3883 spin_lock(&root->inode_lock);
@@ -3858,7 +3891,7 @@ again:
3858 p = &parent->rb_right; 3891 p = &parent->rb_right;
3859 else { 3892 else {
3860 WARN_ON(!(entry->vfs_inode.i_state & 3893 WARN_ON(!(entry->vfs_inode.i_state &
3861 (I_WILL_FREE | I_FREEING | I_CLEAR))); 3894 (I_WILL_FREE | I_FREEING)));
3862 rb_erase(parent, &root->inode_tree); 3895 rb_erase(parent, &root->inode_tree);
3863 RB_CLEAR_NODE(parent); 3896 RB_CLEAR_NODE(parent);
3864 spin_unlock(&root->inode_lock); 3897 spin_unlock(&root->inode_lock);
@@ -3883,7 +3916,14 @@ static void inode_tree_del(struct inode *inode)
3883 } 3916 }
3884 spin_unlock(&root->inode_lock); 3917 spin_unlock(&root->inode_lock);
3885 3918
3886 if (empty && btrfs_root_refs(&root->root_item) == 0) { 3919 /*
3920 * The free space cache has inodes in the tree root, but the tree root
3921 * has a root_refs of 0, so this could end up dropping the tree root as
3922 * a snapshot; we need the extra root != fs_info->tree_root check to
3923 * make sure we don't drop it.
3924 */
3925 if (empty && btrfs_root_refs(&root->root_item) == 0 &&
3926 root != root->fs_info->tree_root) {
3887 synchronize_srcu(&root->fs_info->subvol_srcu); 3927 synchronize_srcu(&root->fs_info->subvol_srcu);
3888 spin_lock(&root->inode_lock); 3928 spin_lock(&root->inode_lock);
3889 empty = RB_EMPTY_ROOT(&root->inode_tree); 3929 empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -3937,7 +3977,7 @@ again:
3937 if (atomic_read(&inode->i_count) > 1) 3977 if (atomic_read(&inode->i_count) > 1)
3938 d_prune_aliases(inode); 3978 d_prune_aliases(inode);
3939 /* 3979 /*
3940 * btrfs_drop_inode will remove it from 3980 * btrfs_drop_inode will have it removed from
3941 * the inode cache when its usage count 3981 * the inode cache when its usage count
3942 * hits zero. 3982 * hits zero.
3943 */ 3983 */
@@ -4277,14 +4317,24 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4277 struct btrfs_root *root = BTRFS_I(inode)->root; 4317 struct btrfs_root *root = BTRFS_I(inode)->root;
4278 struct btrfs_trans_handle *trans; 4318 struct btrfs_trans_handle *trans;
4279 int ret = 0; 4319 int ret = 0;
4320 bool nolock = false;
4280 4321
4281 if (BTRFS_I(inode)->dummy_inode) 4322 if (BTRFS_I(inode)->dummy_inode)
4282 return 0; 4323 return 0;
4283 4324
4325 smp_mb();
4326 nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
4327
4284 if (wbc->sync_mode == WB_SYNC_ALL) { 4328 if (wbc->sync_mode == WB_SYNC_ALL) {
4285 trans = btrfs_join_transaction(root, 1); 4329 if (nolock)
4330 trans = btrfs_join_transaction_nolock(root, 1);
4331 else
4332 trans = btrfs_join_transaction(root, 1);
4286 btrfs_set_trans_block_group(trans, inode); 4333 btrfs_set_trans_block_group(trans, inode);
4287 ret = btrfs_commit_transaction(trans, root); 4334 if (nolock)
4335 ret = btrfs_end_transaction_nolock(trans, root);
4336 else
4337 ret = btrfs_commit_transaction(trans, root);
4288 } 4338 }
4289 return ret; 4339 return ret;
4290} 4340}
@@ -4753,7 +4803,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4753 } 4803 }
4754 4804
4755 btrfs_set_trans_block_group(trans, dir); 4805 btrfs_set_trans_block_group(trans, dir);
4756 atomic_inc(&inode->i_count); 4806 ihold(inode);
4757 4807
4758 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4808 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
4759 4809
@@ -5640,9 +5690,8 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5640 struct btrfs_root *root = BTRFS_I(inode)->root; 5690 struct btrfs_root *root = BTRFS_I(inode)->root;
5641 struct btrfs_dio_private *dip; 5691 struct btrfs_dio_private *dip;
5642 struct bio_vec *bvec = bio->bi_io_vec; 5692 struct bio_vec *bvec = bio->bi_io_vec;
5643 u64 start;
5644 int skip_sum; 5693 int skip_sum;
5645 int write = rw & (1 << BIO_RW); 5694 int write = rw & REQ_WRITE;
5646 int ret = 0; 5695 int ret = 0;
5647 5696
5648 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 5697 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -5666,7 +5715,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5666 dip->inode = inode; 5715 dip->inode = inode;
5667 dip->logical_offset = file_offset; 5716 dip->logical_offset = file_offset;
5668 5717
5669 start = dip->logical_offset;
5670 dip->bytes = 0; 5718 dip->bytes = 0;
5671 do { 5719 do {
5672 dip->bytes += bvec->bv_len; 5720 dip->bytes += bvec->bv_len;
@@ -6303,6 +6351,21 @@ void btrfs_destroy_inode(struct inode *inode)
6303 spin_unlock(&root->fs_info->ordered_extent_lock); 6351 spin_unlock(&root->fs_info->ordered_extent_lock);
6304 } 6352 }
6305 6353
6354 if (root == root->fs_info->tree_root) {
6355 struct btrfs_block_group_cache *block_group;
6356
6357 block_group = btrfs_lookup_block_group(root->fs_info,
6358 BTRFS_I(inode)->block_group);
6359 if (block_group && block_group->inode == inode) {
6360 spin_lock(&block_group->lock);
6361 block_group->inode = NULL;
6362 spin_unlock(&block_group->lock);
6363 btrfs_put_block_group(block_group);
6364 } else if (block_group) {
6365 btrfs_put_block_group(block_group);
6366 }
6367 }
6368
6306 spin_lock(&root->orphan_lock); 6369 spin_lock(&root->orphan_lock);
6307 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6370 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
6308 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6371 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
@@ -6331,13 +6394,15 @@ free:
6331 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6394 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6332} 6395}
6333 6396
6334void btrfs_drop_inode(struct inode *inode) 6397int btrfs_drop_inode(struct inode *inode)
6335{ 6398{
6336 struct btrfs_root *root = BTRFS_I(inode)->root; 6399 struct btrfs_root *root = BTRFS_I(inode)->root;
6337 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) 6400
6338 generic_delete_inode(inode); 6401 if (btrfs_root_refs(&root->root_item) == 0 &&
6402 root != root->fs_info->tree_root)
6403 return 1;
6339 else 6404 else
6340 generic_drop_inode(inode); 6405 return generic_drop_inode(inode);
6341} 6406}
6342 6407
6343static void init_once(void *foo) 6408static void init_once(void *foo)
@@ -6603,7 +6668,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
6603 return 0; 6668 return 0;
6604} 6669}
6605 6670
6606int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) 6671int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
6672 int sync)
6607{ 6673{
6608 struct btrfs_inode *binode; 6674 struct btrfs_inode *binode;
6609 struct inode *inode = NULL; 6675 struct inode *inode = NULL;
@@ -6625,7 +6691,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6625 spin_unlock(&root->fs_info->delalloc_lock); 6691 spin_unlock(&root->fs_info->delalloc_lock);
6626 6692
6627 if (inode) { 6693 if (inode) {
6628 write_inode_now(inode, 0); 6694 if (sync) {
6695 filemap_write_and_wait(inode->i_mapping);
6696 /*
6697 * We have to do this because compression doesn't
6698 * actually set PG_writeback until it submits the pages
6699 * for IO, which happens in an async thread, so we could
6700 * race and not actually wait for any writeback pages
6701 * because they've not been submitted yet. Technically
6702 * this could still be the case for the ordered stuff
6703 * since the async thread may not have started to do its
6704 * work yet. If this becomes the case then we need to
6705 * figure out a way to make sure that in writepage we
6706 * wait for any async pages to be submitted before
6707 * returning so that fdatawait does what it's supposed to
6708 * do.
6709 */
6710 btrfs_wait_ordered_range(inode, 0, (u64)-1);
6711 } else {
6712 filemap_flush(inode->i_mapping);
6713 }
6629 if (delay_iput) 6714 if (delay_iput)
6630 btrfs_add_delayed_iput(inode); 6715 btrfs_add_delayed_iput(inode);
6631 else 6716 else
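
The sync/async split introduced above (filemap_write_and_wait() versus filemap_flush()) maps loosely onto a userspace pair: fsync() for write-and-wait, and sync_file_range() with SYNC_FILE_RANGE_WRITE to merely kick off writeback. A hedged analogue, illustrative rather than equivalent (it cannot reproduce the ordered-extent wait discussed in the comment):

#define _GNU_SOURCE		/* sync_file_range() is Linux-specific */
#include <fcntl.h>
#include <unistd.h>

/* Write back dirty pages and wait, like filemap_write_and_wait(). */
static int writeback_sync(int fd)
{
	return fsync(fd);
}

/* Start writeback without waiting, roughly filemap_flush()'s
 * WB_SYNC_NONE behaviour; nbytes == 0 means "to end of file". */
static int writeback_async(int fd)
{
	return sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
}

int main(void)
{
	int fd = open("/tmp/wb-demo", O_CREAT | O_WRONLY, 0600);

	if (fd < 0)
		return 1;
	(void)writeback_async(fd);
	(void)writeback_sync(fd);
	close(fd);
	return 0;
}
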
@@ -6751,27 +6836,33 @@ out_unlock:
6751 return err; 6836 return err;
6752} 6837}
6753 6838
6754int btrfs_prealloc_file_range(struct inode *inode, int mode, 6839static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6755 u64 start, u64 num_bytes, u64 min_size, 6840 u64 start, u64 num_bytes, u64 min_size,
6756 loff_t actual_len, u64 *alloc_hint) 6841 loff_t actual_len, u64 *alloc_hint,
6842 struct btrfs_trans_handle *trans)
6757{ 6843{
6758 struct btrfs_trans_handle *trans;
6759 struct btrfs_root *root = BTRFS_I(inode)->root; 6844 struct btrfs_root *root = BTRFS_I(inode)->root;
6760 struct btrfs_key ins; 6845 struct btrfs_key ins;
6761 u64 cur_offset = start; 6846 u64 cur_offset = start;
6762 int ret = 0; 6847 int ret = 0;
6848 bool own_trans = true;
6763 6849
6850 if (trans)
6851 own_trans = false;
6764 while (num_bytes > 0) { 6852 while (num_bytes > 0) {
6765 trans = btrfs_start_transaction(root, 3); 6853 if (own_trans) {
6766 if (IS_ERR(trans)) { 6854 trans = btrfs_start_transaction(root, 3);
6767 ret = PTR_ERR(trans); 6855 if (IS_ERR(trans)) {
6768 break; 6856 ret = PTR_ERR(trans);
6857 break;
6858 }
6769 } 6859 }
6770 6860
6771 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 6861 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6772 0, *alloc_hint, (u64)-1, &ins, 1); 6862 0, *alloc_hint, (u64)-1, &ins, 1);
6773 if (ret) { 6863 if (ret) {
6774 btrfs_end_transaction(trans, root); 6864 if (own_trans)
6865 btrfs_end_transaction(trans, root);
6775 break; 6866 break;
6776 } 6867 }
6777 6868
@@ -6804,11 +6895,30 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode,
6804 ret = btrfs_update_inode(trans, root, inode); 6895 ret = btrfs_update_inode(trans, root, inode);
6805 BUG_ON(ret); 6896 BUG_ON(ret);
6806 6897
6807 btrfs_end_transaction(trans, root); 6898 if (own_trans)
6899 btrfs_end_transaction(trans, root);
6808 } 6900 }
6809 return ret; 6901 return ret;
6810} 6902}
6811 6903
6904int btrfs_prealloc_file_range(struct inode *inode, int mode,
6905 u64 start, u64 num_bytes, u64 min_size,
6906 loff_t actual_len, u64 *alloc_hint)
6907{
6908 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
6909 min_size, actual_len, alloc_hint,
6910 NULL);
6911}
6912
6913int btrfs_prealloc_file_range_trans(struct inode *inode,
6914 struct btrfs_trans_handle *trans, int mode,
6915 u64 start, u64 num_bytes, u64 min_size,
6916 loff_t actual_len, u64 *alloc_hint)
6917{
6918 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
6919 min_size, actual_len, alloc_hint, trans);
6920}
6921
6812static long btrfs_fallocate(struct inode *inode, int mode, 6922static long btrfs_fallocate(struct inode *inode, int mode,
6813 loff_t offset, loff_t len) 6923 loff_t offset, loff_t len)
6814{ 6924{
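
The __btrfs_prealloc_file_range() refactor above is the common "optional caller-supplied context" idiom: if the caller passes a transaction, use it and never end it; otherwise start and end one per iteration. A minimal standalone sketch of the pattern (hypothetical names, not the btrfs API):

#include <stdbool.h>
#include <stdio.h>

struct txn { int id; };		/* hypothetical transaction handle */

static struct txn *txn_start(void) { static struct txn t; return &t; }
static void txn_end(struct txn *t) { (void)t; }

/* If the caller owns the transaction we must not start or end one;
 * mirrors the own_trans flag in __btrfs_prealloc_file_range(). */
static int do_work(struct txn *trans, int steps)
{
	bool own_trans = (trans == NULL);

	for (int i = 0; i < steps; i++) {
		if (own_trans)
			trans = txn_start();
		printf("step %d in txn %p\n", i, (void *)trans);
		if (own_trans)
			txn_end(trans);
	}
	return 0;
}

int main(void)
{
	do_work(NULL, 2);	/* function manages its own transactions */
	return 0;
}
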
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9254b3d58db..463d91b4dd3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -224,7 +224,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
224 224
225static noinline int create_subvol(struct btrfs_root *root, 225static noinline int create_subvol(struct btrfs_root *root,
226 struct dentry *dentry, 226 struct dentry *dentry,
227 char *name, int namelen) 227 char *name, int namelen,
228 u64 *async_transid)
228{ 229{
229 struct btrfs_trans_handle *trans; 230 struct btrfs_trans_handle *trans;
230 struct btrfs_key key; 231 struct btrfs_key key;
@@ -338,13 +339,19 @@ static noinline int create_subvol(struct btrfs_root *root,
338 339
339 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 340 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
340fail: 341fail:
341 err = btrfs_commit_transaction(trans, root); 342 if (async_transid) {
343 *async_transid = trans->transid;
344 err = btrfs_commit_transaction_async(trans, root, 1);
345 } else {
346 err = btrfs_commit_transaction(trans, root);
347 }
342 if (err && !ret) 348 if (err && !ret)
343 ret = err; 349 ret = err;
344 return ret; 350 return ret;
345} 351}
346 352
347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) 353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
354 char *name, int namelen, u64 *async_transid)
348{ 355{
349 struct inode *inode; 356 struct inode *inode;
350 struct btrfs_pending_snapshot *pending_snapshot; 357 struct btrfs_pending_snapshot *pending_snapshot;
@@ -373,7 +380,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
373 380
374 list_add(&pending_snapshot->list, 381 list_add(&pending_snapshot->list,
375 &trans->transaction->pending_snapshots); 382 &trans->transaction->pending_snapshots);
376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); 383 if (async_transid) {
384 *async_transid = trans->transid;
385 ret = btrfs_commit_transaction_async(trans,
386 root->fs_info->extent_root, 1);
387 } else {
388 ret = btrfs_commit_transaction(trans,
389 root->fs_info->extent_root);
390 }
377 BUG_ON(ret); 391 BUG_ON(ret);
378 392
379 ret = pending_snapshot->error; 393 ret = pending_snapshot->error;
@@ -395,6 +409,76 @@ fail:
395 return ret; 409 return ret;
396} 410}
397 411
412/* copy of check_sticky() in fs/namei.c
413 * It's inline, so the penalty for filesystems that don't use the sticky
414 * bit is minimal.
415 */
416static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
417{
418 uid_t fsuid = current_fsuid();
419
420 if (!(dir->i_mode & S_ISVTX))
421 return 0;
422 if (inode->i_uid == fsuid)
423 return 0;
424 if (dir->i_uid == fsuid)
425 return 0;
426 return !capable(CAP_FOWNER);
427}
428
429/* copy of may_delete() in fs/namei.c
430 * Check whether we can remove a link victim from directory dir, check
431 * whether the type of victim is right.
432 * 1. We can't do it if dir is read-only (done in permission())
433 * 2. We should have write and exec permissions on dir
434 * 3. We can't remove anything from append-only dir
435 * 4. We can't do anything with immutable dir (done in permission())
436 * 5. If the sticky bit on dir is set we should either
437 * a. be owner of dir, or
438 * b. be owner of victim, or
439 * c. have CAP_FOWNER capability
440 * 6. If the victim is append-only or immutable we can't do anything with
441 * links pointing to it.
442 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
443 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
444 * 9. We can't remove a root or mountpoint.
445 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
446 * nfs_async_unlink().
447 */
448
449static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
450{
451 int error;
452
453 if (!victim->d_inode)
454 return -ENOENT;
455
456 BUG_ON(victim->d_parent->d_inode != dir);
457 audit_inode_child(victim, dir);
458
459 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
460 if (error)
461 return error;
462 if (IS_APPEND(dir))
463 return -EPERM;
464 if (btrfs_check_sticky(dir, victim->d_inode) ||
465 IS_APPEND(victim->d_inode) ||
466 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
467 return -EPERM;
468 if (isdir) {
469 if (!S_ISDIR(victim->d_inode->i_mode))
470 return -ENOTDIR;
471 if (IS_ROOT(victim))
472 return -EBUSY;
473 } else if (S_ISDIR(victim->d_inode->i_mode))
474 return -EISDIR;
475 if (IS_DEADDIR(dir))
476 return -ENOENT;
477 if (victim->d_flags & DCACHE_NFSFS_RENAMED)
478 return -EBUSY;
479 return 0;
480}
481
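
btrfs_check_sticky() above mirrors the VFS rule for sticky directories: deletion is allowed only for the victim's owner, the directory's owner, or a CAP_FOWNER holder. A userspace model of the same predicate (plain uids via struct stat instead of kernel credentials, capable() stubbed out as an assumption):

#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Stand-in for capable(CAP_FOWNER); a real implementation would
 * consult the task's capabilities. */
static bool has_cap_fowner(void)
{
	return false;
}

/* Mirrors btrfs_check_sticky(): in a sticky directory, only the
 * victim's owner, the directory's owner, or CAP_FOWNER may delete. */
static bool sticky_blocks_delete(const struct stat *dir,
				 const struct stat *victim, uid_t fsuid)
{
	if (!(dir->st_mode & S_ISVTX))
		return false;
	if (victim->st_uid == fsuid || dir->st_uid == fsuid)
		return false;
	return !has_cap_fowner();
}

int main(void)
{
	struct stat dir = { 0 }, victim = { 0 };

	dir.st_mode = S_IFDIR | S_ISVTX | 0777;	/* /tmp-style directory */
	dir.st_uid = 0;
	victim.st_uid = 1000;

	printf("uid 4242 blocked: %d\n",
	       sticky_blocks_delete(&dir, &victim, 4242));
	return 0;
}
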
398/* copy of may_create in fs/namei.c() */ 482/* copy of may_create in fs/namei.c() */
399static inline int btrfs_may_create(struct inode *dir, struct dentry *child) 483static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
400{ 484{
@@ -412,7 +496,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
412 */ 496 */
413static noinline int btrfs_mksubvol(struct path *parent, 497static noinline int btrfs_mksubvol(struct path *parent,
414 char *name, int namelen, 498 char *name, int namelen,
415 struct btrfs_root *snap_src) 499 struct btrfs_root *snap_src,
500 u64 *async_transid)
416{ 501{
417 struct inode *dir = parent->dentry->d_inode; 502 struct inode *dir = parent->dentry->d_inode;
418 struct dentry *dentry; 503 struct dentry *dentry;
@@ -443,10 +528,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
443 goto out_up_read; 528 goto out_up_read;
444 529
445 if (snap_src) { 530 if (snap_src) {
446 error = create_snapshot(snap_src, dentry); 531 error = create_snapshot(snap_src, dentry,
532 name, namelen, async_transid);
447 } else { 533 } else {
448 error = create_subvol(BTRFS_I(dir)->root, dentry, 534 error = create_subvol(BTRFS_I(dir)->root, dentry,
449 name, namelen); 535 name, namelen, async_transid);
450 } 536 }
451 if (!error) 537 if (!error)
452 fsnotify_mkdir(dir, dentry); 538 fsnotify_mkdir(dir, dentry);
@@ -708,7 +794,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
708 char *sizestr; 794 char *sizestr;
709 char *devstr = NULL; 795 char *devstr = NULL;
710 int ret = 0; 796 int ret = 0;
711 int namelen;
712 int mod = 0; 797 int mod = 0;
713 798
714 if (root->fs_info->sb->s_flags & MS_RDONLY) 799 if (root->fs_info->sb->s_flags & MS_RDONLY)
@@ -722,7 +807,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
722 return PTR_ERR(vol_args); 807 return PTR_ERR(vol_args);
723 808
724 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 809 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
725 namelen = strlen(vol_args->name);
726 810
727 mutex_lock(&root->fs_info->volume_mutex); 811 mutex_lock(&root->fs_info->volume_mutex);
728 sizestr = vol_args->name; 812 sizestr = vol_args->name;
@@ -801,11 +885,13 @@ out_unlock:
801 return ret; 885 return ret;
802} 886}
803 887
804static noinline int btrfs_ioctl_snap_create(struct file *file, 888static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
805 void __user *arg, int subvol) 889 char *name,
890 unsigned long fd,
891 int subvol,
892 u64 *transid)
806{ 893{
807 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 894 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
808 struct btrfs_ioctl_vol_args *vol_args;
809 struct file *src_file; 895 struct file *src_file;
810 int namelen; 896 int namelen;
811 int ret = 0; 897 int ret = 0;
@@ -813,23 +899,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
813 if (root->fs_info->sb->s_flags & MS_RDONLY) 899 if (root->fs_info->sb->s_flags & MS_RDONLY)
814 return -EROFS; 900 return -EROFS;
815 901
816 vol_args = memdup_user(arg, sizeof(*vol_args)); 902 namelen = strlen(name);
817 if (IS_ERR(vol_args)) 903 if (strchr(name, '/')) {
818 return PTR_ERR(vol_args);
819
820 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
821 namelen = strlen(vol_args->name);
822 if (strchr(vol_args->name, '/')) {
823 ret = -EINVAL; 904 ret = -EINVAL;
824 goto out; 905 goto out;
825 } 906 }
826 907
827 if (subvol) { 908 if (subvol) {
828 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 909 ret = btrfs_mksubvol(&file->f_path, name, namelen,
829 NULL); 910 NULL, transid);
830 } else { 911 } else {
831 struct inode *src_inode; 912 struct inode *src_inode;
832 src_file = fget(vol_args->fd); 913 src_file = fget(fd);
833 if (!src_file) { 914 if (!src_file) {
834 ret = -EINVAL; 915 ret = -EINVAL;
835 goto out; 916 goto out;
@@ -843,12 +924,56 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
843 fput(src_file); 924 fput(src_file);
844 goto out; 925 goto out;
845 } 926 }
846 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 927 ret = btrfs_mksubvol(&file->f_path, name, namelen,
847 BTRFS_I(src_inode)->root); 928 BTRFS_I(src_inode)->root,
929 transid);
848 fput(src_file); 930 fput(src_file);
849 } 931 }
850out: 932out:
933 return ret;
934}
935
936static noinline int btrfs_ioctl_snap_create(struct file *file,
937 void __user *arg, int subvol,
938 int async)
939{
940 struct btrfs_ioctl_vol_args *vol_args = NULL;
941 struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
942 char *name;
943 u64 fd;
944 u64 transid = 0;
945 int ret;
946
947 if (async) {
948 async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
949 if (IS_ERR(async_vol_args))
950 return PTR_ERR(async_vol_args);
951
952 name = async_vol_args->name;
953 fd = async_vol_args->fd;
954 async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
955 } else {
956 vol_args = memdup_user(arg, sizeof(*vol_args));
957 if (IS_ERR(vol_args))
958 return PTR_ERR(vol_args);
959 name = vol_args->name;
960 fd = vol_args->fd;
961 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
962 }
963
964 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
965 subvol, &transid);
966
967 if (!ret && async) {
968 if (copy_to_user(arg +
969 offsetof(struct btrfs_ioctl_async_vol_args,
970 transid), &transid, sizeof(transid)))
971 return -EFAULT;
972 }
973
851 kfree(vol_args); 974 kfree(vol_args);
975 kfree(async_vol_args);
976
852 return ret; 977 return ret;
853} 978}
854 979
@@ -1073,14 +1198,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
1073 if (!capable(CAP_SYS_ADMIN)) 1198 if (!capable(CAP_SYS_ADMIN))
1074 return -EPERM; 1199 return -EPERM;
1075 1200
1076 args = kmalloc(sizeof(*args), GFP_KERNEL); 1201 args = memdup_user(argp, sizeof(*args));
1077 if (!args) 1202 if (IS_ERR(args))
1078 return -ENOMEM; 1203 return PTR_ERR(args);
1079 1204
1080 if (copy_from_user(args, argp, sizeof(*args))) {
1081 kfree(args);
1082 return -EFAULT;
1083 }
1084 inode = fdentry(file)->d_inode; 1205 inode = fdentry(file)->d_inode;
1085 ret = search_ioctl(inode, args); 1206 ret = search_ioctl(inode, args);
1086 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 1207 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
@@ -1188,14 +1309,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1188 if (!capable(CAP_SYS_ADMIN)) 1309 if (!capable(CAP_SYS_ADMIN))
1189 return -EPERM; 1310 return -EPERM;
1190 1311
1191 args = kmalloc(sizeof(*args), GFP_KERNEL); 1312 args = memdup_user(argp, sizeof(*args));
1192 if (!args) 1313 if (IS_ERR(args))
1193 return -ENOMEM; 1314 return PTR_ERR(args);
1194 1315
1195 if (copy_from_user(args, argp, sizeof(*args))) {
1196 kfree(args);
1197 return -EFAULT;
1198 }
1199 inode = fdentry(file)->d_inode; 1316 inode = fdentry(file)->d_inode;
1200 1317
1201 if (args->treeid == 0) 1318 if (args->treeid == 0)
@@ -1227,9 +1344,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1227 int ret; 1344 int ret;
1228 int err = 0; 1345 int err = 0;
1229 1346
1230 if (!capable(CAP_SYS_ADMIN))
1231 return -EPERM;
1232
1233 vol_args = memdup_user(arg, sizeof(*vol_args)); 1347 vol_args = memdup_user(arg, sizeof(*vol_args));
1234 if (IS_ERR(vol_args)) 1348 if (IS_ERR(vol_args))
1235 return PTR_ERR(vol_args); 1349 return PTR_ERR(vol_args);
@@ -1259,13 +1373,51 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1259 } 1373 }
1260 1374
1261 inode = dentry->d_inode; 1375 inode = dentry->d_inode;
1376 dest = BTRFS_I(inode)->root;
1377 if (!capable(CAP_SYS_ADMIN)) {
1378 /*
1379 * Regular user. Only allow this with a special mount
1380 * option, when the user has write+exec access to the
1381 * subvol root, and when rmdir(2) would have been
1382 * allowed.
1383 *
1384 * Note that this is _not_ a check that the subvol is
1385 * empty or doesn't contain data that we wouldn't
1386 * otherwise be able to delete.
1387 *
1388 * Users who want to delete empty subvols should try
1389 * rmdir(2).
1390 */
1391 err = -EPERM;
1392 if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
1393 goto out_dput;
1394
1395 /*
1396 * Do not allow deletion if the parent dir is the same
1397 * as the dir to be deleted. That means the ioctl
1398 * must be called on the dentry referencing the root
1399 * of the subvol, not a random directory contained
1400 * within it.
1401 */
1402 err = -EINVAL;
1403 if (root == dest)
1404 goto out_dput;
1405
1406 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
1407 if (err)
1408 goto out_dput;
1409
1410 /* check if subvolume may be deleted by a non-root user */
1411 err = btrfs_may_delete(dir, dentry, 1);
1412 if (err)
1413 goto out_dput;
1414 }
1415
1262 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 1416 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
1263 err = -EINVAL; 1417 err = -EINVAL;
1264 goto out_dput; 1418 goto out_dput;
1265 } 1419 }
1266 1420
1267 dest = BTRFS_I(inode)->root;
1268
1269 mutex_lock(&inode->i_mutex); 1421 mutex_lock(&inode->i_mutex);
1270 err = d_invalidate(dentry); 1422 err = d_invalidate(dentry);
1271 if (err) 1423 if (err)
@@ -1304,7 +1456,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1304 BUG_ON(ret); 1456 BUG_ON(ret);
1305 } 1457 }
1306 1458
1307 ret = btrfs_commit_transaction(trans, root); 1459 ret = btrfs_end_transaction(trans, root);
1308 BUG_ON(ret); 1460 BUG_ON(ret);
1309 inode->i_flags |= S_DEAD; 1461 inode->i_flags |= S_DEAD;
1310out_up_write: 1462out_up_write:
@@ -1502,11 +1654,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1502 path->reada = 2; 1654 path->reada = 2;
1503 1655
1504 if (inode < src) { 1656 if (inode < src) {
1505 mutex_lock(&inode->i_mutex); 1657 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
1506 mutex_lock(&src->i_mutex); 1658 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
1507 } else { 1659 } else {
1508 mutex_lock(&src->i_mutex); 1660 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
1509 mutex_lock(&inode->i_mutex); 1661 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1510 } 1662 }
1511 1663
1512 /* determine range to clone */ 1664 /* determine range to clone */
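The relocking above is the standard double-i_mutex idiom: take the two locks in address order so concurrent clones in opposite directions cannot ABBA-deadlock, and give them distinct lockdep subclasses so the second acquisition is not flagged as recursive. Distilled into a sketch:

	static void lock_two_inodes(struct inode *a, struct inode *b)
	{
		if (a < b) {
			mutex_lock_nested(&a->i_mutex, I_MUTEX_PARENT);
			mutex_lock_nested(&b->i_mutex, I_MUTEX_CHILD);
		} else {
			mutex_lock_nested(&b->i_mutex, I_MUTEX_PARENT);
			mutex_lock_nested(&a->i_mutex, I_MUTEX_CHILD);
		}
	}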
@@ -1530,13 +1682,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1530 while (1) { 1682 while (1) {
1531 struct btrfs_ordered_extent *ordered; 1683 struct btrfs_ordered_extent *ordered;
1532 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1684 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1533 ordered = btrfs_lookup_first_ordered_extent(inode, off+len); 1685 ordered = btrfs_lookup_first_ordered_extent(src, off+len);
1534 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) 1686 if (!ordered &&
1687 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
1688 EXTENT_DELALLOC, 0, NULL))
1535 break; 1689 break;
1536 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1690 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1537 if (ordered) 1691 if (ordered)
1538 btrfs_put_ordered_extent(ordered); 1692 btrfs_put_ordered_extent(ordered);
1539 btrfs_wait_ordered_range(src, off, off+len); 1693 btrfs_wait_ordered_range(src, off, len);
1540 } 1694 }
1541 1695
1542 /* clone data */ 1696 /* clone data */
@@ -1605,7 +1759,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1605 } 1759 }
1606 btrfs_release_path(root, path); 1760 btrfs_release_path(root, path);
1607 1761
1608 if (key.offset + datal < off || 1762 if (key.offset + datal <= off ||
1609 key.offset >= off+len) 1763 key.offset >= off+len)
1610 goto next; 1764 goto next;
1611 1765
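The one-character change above tightens a half-open interval test: the extent [key.offset, key.offset + datal) overlaps the clone range [off, off + len) only when key.offset + datal > off and key.offset < off + len. An extent that merely ends at off contributes nothing, and the old '<' wrongly admitted exactly that boundary case.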
@@ -1879,6 +2033,22 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1879 return 0; 2033 return 0;
1880} 2034}
1881 2035
2036static void get_block_group_info(struct list_head *groups_list,
2037 struct btrfs_ioctl_space_info *space)
2038{
2039 struct btrfs_block_group_cache *block_group;
2040
2041 space->total_bytes = 0;
2042 space->used_bytes = 0;
2043 space->flags = 0;
2044 list_for_each_entry(block_group, groups_list, list) {
2045 space->flags = block_group->flags;
2046 space->total_bytes += block_group->key.offset;
2047 space->used_bytes +=
2048 btrfs_block_group_used(&block_group->item);
2049 }
2050}
2051
1882long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) 2052long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1883{ 2053{
1884 struct btrfs_ioctl_space_args space_args; 2054 struct btrfs_ioctl_space_args space_args;
@@ -1887,27 +2057,56 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1887 struct btrfs_ioctl_space_info *dest_orig; 2057 struct btrfs_ioctl_space_info *dest_orig;
1888 struct btrfs_ioctl_space_info *user_dest; 2058 struct btrfs_ioctl_space_info *user_dest;
1889 struct btrfs_space_info *info; 2059 struct btrfs_space_info *info;
2060 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2061 BTRFS_BLOCK_GROUP_SYSTEM,
2062 BTRFS_BLOCK_GROUP_METADATA,
2063 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2064 int num_types = 4;
1890 int alloc_size; 2065 int alloc_size;
1891 int ret = 0; 2066 int ret = 0;
1892 int slot_count = 0; 2067 int slot_count = 0;
2068 int i, c;
1893 2069
1894 if (copy_from_user(&space_args, 2070 if (copy_from_user(&space_args,
1895 (struct btrfs_ioctl_space_args __user *)arg, 2071 (struct btrfs_ioctl_space_args __user *)arg,
1896 sizeof(space_args))) 2072 sizeof(space_args)))
1897 return -EFAULT; 2073 return -EFAULT;
1898 2074
1899 /* first we count slots */ 2075 for (i = 0; i < num_types; i++) {
1900 rcu_read_lock(); 2076 struct btrfs_space_info *tmp;
1901 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) 2077
1902 slot_count++; 2078 info = NULL;
1903 rcu_read_unlock(); 2079 rcu_read_lock();
2080 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
2081 list) {
2082 if (tmp->flags == types[i]) {
2083 info = tmp;
2084 break;
2085 }
2086 }
2087 rcu_read_unlock();
2088
2089 if (!info)
2090 continue;
2091
2092 down_read(&info->groups_sem);
2093 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2094 if (!list_empty(&info->block_groups[c]))
2095 slot_count++;
2096 }
2097 up_read(&info->groups_sem);
2098 }
1904 2099
1905 /* space_slots == 0 means they are asking for a count */ 2100 /* space_slots == 0 means they are asking for a count */
1906 if (space_args.space_slots == 0) { 2101 if (space_args.space_slots == 0) {
1907 space_args.total_spaces = slot_count; 2102 space_args.total_spaces = slot_count;
1908 goto out; 2103 goto out;
1909 } 2104 }
2105
2106 slot_count = min_t(int, space_args.space_slots, slot_count);
2107
1910 alloc_size = sizeof(*dest) * slot_count; 2108 alloc_size = sizeof(*dest) * slot_count;
2109
1911 /* we generally have at most 6 or so space infos, one for each raid 2110 /* we generally have at most 6 or so space infos, one for each raid
1912 * level. So, a whole page should be more than enough for everyone 2111 * level. So, a whole page should be more than enough for everyone
1913 */ 2112 */
@@ -1921,27 +2120,34 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1921 dest_orig = dest; 2120 dest_orig = dest;
1922 2121
1923 /* now we have a buffer to copy into */ 2122 /* now we have a buffer to copy into */
1924 rcu_read_lock(); 2123 for (i = 0; i < num_types; i++) {
1925 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { 2124 struct btrfs_space_info *tmp;
1926 /* make sure we don't copy more than we allocated 2125
1927 * in our buffer 2126 info = NULL;
1928 */ 2127 rcu_read_lock();
1929 if (slot_count == 0) 2128 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
1930 break; 2129 list) {
1931 slot_count--; 2130 if (tmp->flags == types[i]) {
1932 2131 info = tmp;
1933 /* make sure userland has enough room in their buffer */ 2132 break;
1934 if (space_args.total_spaces >= space_args.space_slots) 2133 }
1935 break; 2134 }
2135 rcu_read_unlock();
1936 2136
1937 space.flags = info->flags; 2137 if (!info)
1938 space.total_bytes = info->total_bytes; 2138 continue;
1939 space.used_bytes = info->bytes_used; 2139 down_read(&info->groups_sem);
1940 memcpy(dest, &space, sizeof(space)); 2140 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
1941 dest++; 2141 if (!list_empty(&info->block_groups[c])) {
1942 space_args.total_spaces++; 2142 get_block_group_info(&info->block_groups[c],
2143 &space);
2144 memcpy(dest, &space, sizeof(space));
2145 dest++;
2146 space_args.total_spaces++;
2147 }
2148 }
2149 up_read(&info->groups_sem);
1943 } 2150 }
1944 rcu_read_unlock();
1945 2151
1946 user_dest = (struct btrfs_ioctl_space_info *) 2152 user_dest = (struct btrfs_ioctl_space_info *)
1947 (arg + sizeof(struct btrfs_ioctl_space_args)); 2153 (arg + sizeof(struct btrfs_ioctl_space_args));
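The rework above reports one slot per (block group type, raid profile) pair actually in use rather than one per space_info, and now respects the caller's slot budget via the min_t() clamp. The userspace protocol is unchanged: probe with space_slots == 0 to learn the count, then call again with room for that many entries. A hedged sketch, assuming a userspace header that mirrors btrfs_ioctl_space_args including its trailing spaces[] array:

	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <btrfs/ioctl.h>	/* assumed userspace copy of the ioctl ABI */

	static struct btrfs_ioctl_space_args *get_space_info(int fd)
	{
		struct btrfs_ioctl_space_args probe = { .space_slots = 0 };
		struct btrfs_ioctl_space_args *args;

		if (ioctl(fd, BTRFS_IOC_SPACE_INFO, &probe) < 0)
			return NULL;

		args = calloc(1, sizeof(*args) + probe.total_spaces *
			      sizeof(struct btrfs_ioctl_space_info));
		if (!args)
			return NULL;
		args->space_slots = probe.total_spaces;
		if (ioctl(fd, BTRFS_IOC_SPACE_INFO, args) < 0) {
			free(args);
			return NULL;
		}
		/* entries 0 .. total_spaces - 1 of args->spaces are valid */
		return args;
	}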
@@ -1984,6 +2190,36 @@ long btrfs_ioctl_trans_end(struct file *file)
1984 return 0; 2190 return 0;
1985} 2191}
1986 2192
2193static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
2194{
2195 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2196 struct btrfs_trans_handle *trans;
2197 u64 transid;
2198
2199 trans = btrfs_start_transaction(root, 0);
2200 transid = trans->transid;
2201 btrfs_commit_transaction_async(trans, root, 0);
2202
2203 if (argp)
2204 if (copy_to_user(argp, &transid, sizeof(transid)))
2205 return -EFAULT;
2206 return 0;
2207}
2208
2209static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
2210{
2211 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2212 u64 transid;
2213
2214 if (argp) {
2215 if (copy_from_user(&transid, argp, sizeof(transid)))
2216 return -EFAULT;
2217 } else {
2218 transid = 0; /* current trans */
2219 }
2220 return btrfs_wait_for_commit(root, transid);
2221}
2222
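These two handlers give userspace a cheap durability barrier: START_SYNC kicks off a commit without waiting and hands the transaction id back, and WAIT_SYNC blocks until that id (or, given 0, the currently committing transaction) is fully on disk. Note that btrfs_ioctl_start_sync() trusts btrfs_start_transaction() not to return an error pointer; a hardened version would check it. Hedged caller sketch:

	/* Commit barrier that lets other work overlap the commit. */
	static int commit_barrier(int fd)
	{
		__u64 transid;

		if (ioctl(fd, BTRFS_IOC_START_SYNC, &transid))
			return -1;
		/* ... useful work can proceed while the commit runs ... */
		return ioctl(fd, BTRFS_IOC_WAIT_SYNC, &transid);
	}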
1987long btrfs_ioctl(struct file *file, unsigned int 2223long btrfs_ioctl(struct file *file, unsigned int
1988 cmd, unsigned long arg) 2224 cmd, unsigned long arg)
1989{ 2225{
@@ -1998,9 +2234,11 @@ long btrfs_ioctl(struct file *file, unsigned int
1998 case FS_IOC_GETVERSION: 2234 case FS_IOC_GETVERSION:
1999 return btrfs_ioctl_getversion(file, argp); 2235 return btrfs_ioctl_getversion(file, argp);
2000 case BTRFS_IOC_SNAP_CREATE: 2236 case BTRFS_IOC_SNAP_CREATE:
2001 return btrfs_ioctl_snap_create(file, argp, 0); 2237 return btrfs_ioctl_snap_create(file, argp, 0, 0);
2238 case BTRFS_IOC_SNAP_CREATE_ASYNC:
2239 return btrfs_ioctl_snap_create(file, argp, 0, 1);
2002 case BTRFS_IOC_SUBVOL_CREATE: 2240 case BTRFS_IOC_SUBVOL_CREATE:
2003 return btrfs_ioctl_snap_create(file, argp, 1); 2241 return btrfs_ioctl_snap_create(file, argp, 1, 0);
2004 case BTRFS_IOC_SNAP_DESTROY: 2242 case BTRFS_IOC_SNAP_DESTROY:
2005 return btrfs_ioctl_snap_destroy(file, argp); 2243 return btrfs_ioctl_snap_destroy(file, argp);
2006 case BTRFS_IOC_DEFAULT_SUBVOL: 2244 case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -2034,6 +2272,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2034 case BTRFS_IOC_SYNC: 2272 case BTRFS_IOC_SYNC:
2035 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2273 btrfs_sync_fs(file->f_dentry->d_sb, 1);
2036 return 0; 2274 return 0;
2275 case BTRFS_IOC_START_SYNC:
2276 return btrfs_ioctl_start_sync(file, argp);
2277 case BTRFS_IOC_WAIT_SYNC:
2278 return btrfs_ioctl_wait_sync(file, argp);
2037 } 2279 }
2038 2280
2039 return -ENOTTY; 2281 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 424694aa517..17c99ebdf96 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,14 +22,21 @@
22 22
23#define BTRFS_IOCTL_MAGIC 0x94 23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255 24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 4087
26 25
27/* this should be 4k */ 26/* this should be 4k */
27#define BTRFS_PATH_NAME_MAX 4087
28struct btrfs_ioctl_vol_args { 28struct btrfs_ioctl_vol_args {
29 __s64 fd; 29 __s64 fd;
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_SNAPSHOT_NAME_MAX 4079
34struct btrfs_ioctl_async_vol_args {
35 __s64 fd;
36 __u64 transid;
37 char name[BTRFS_SNAPSHOT_NAME_MAX + 1];
38};
39
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080 40#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args { 41struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid; 42 __u64 treeid;
@@ -178,4 +185,8 @@ struct btrfs_ioctl_space_args {
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) 185#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ 186#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args) 187 struct btrfs_ioctl_space_args)
188#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
189#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
190#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
191 struct btrfs_ioctl_async_vol_args)
181#endif 192#endif
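The async variant is meant to pair with the sync ioctls above: SNAP_CREATE_ASYNC returns once the snapshot is staged in the current transaction, and, assuming the handler writes the started transaction id back into the struct's transid field (the field has no other obvious purpose, but the handler body is not shown in this diff), WAIT_SYNC can later make it durable. A hedged sketch:

	struct btrfs_ioctl_async_vol_args args;

	memset(&args, 0, sizeof(args));
	args.fd = subvol_fd;		/* fd of the subvolume to snapshot */
	strncpy(args.name, "snap1", BTRFS_SNAPSHOT_NAME_MAX);

	if (ioctl(destdir_fd, BTRFS_IOC_SNAP_CREATE_ASYNC, &args) == 0)
		ioctl(destdir_fd, BTRFS_IOC_WAIT_SYNC, &args.transid);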
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e56c72bc5ad..f4621f6deca 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -526,7 +526,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
526{ 526{
527 u64 end; 527 u64 end;
528 u64 orig_end; 528 u64 orig_end;
529 u64 wait_end;
530 struct btrfs_ordered_extent *ordered; 529 struct btrfs_ordered_extent *ordered;
531 int found; 530 int found;
532 531
@@ -537,7 +536,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
537 if (orig_end > INT_LIMIT(loff_t)) 536 if (orig_end > INT_LIMIT(loff_t))
538 orig_end = INT_LIMIT(loff_t); 537 orig_end = INT_LIMIT(loff_t);
539 } 538 }
540 wait_end = orig_end;
541again: 539again:
542 /* start IO across the range first to instantiate any delalloc 540 /* start IO across the range first to instantiate any delalloc
543 * extents 541 * extents
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b37d723b9d4..045c9c2b2d7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -29,6 +29,7 @@
29#include "locking.h" 29#include "locking.h"
30#include "btrfs_inode.h" 30#include "btrfs_inode.h"
31#include "async-thread.h" 31#include "async-thread.h"
32#include "free-space-cache.h"
32 33
33/* 34/*
34 * backref_node, mapping_node and tree_block start with this 35 * backref_node, mapping_node and tree_block start with this
@@ -178,8 +179,6 @@ struct reloc_control {
178 u64 search_start; 179 u64 search_start;
179 u64 extents_found; 180 u64 extents_found;
180 181
181 int block_rsv_retries;
182
183 unsigned int stage:8; 182 unsigned int stage:8;
184 unsigned int create_reloc_tree:1; 183 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1; 184 unsigned int merge_reloc_tree:1;
@@ -2133,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2133 LIST_HEAD(reloc_roots); 2132 LIST_HEAD(reloc_roots);
2134 u64 num_bytes = 0; 2133 u64 num_bytes = 0;
2135 int ret; 2134 int ret;
2136 int retries = 0;
2137 2135
2138 mutex_lock(&root->fs_info->trans_mutex); 2136 mutex_lock(&root->fs_info->trans_mutex);
2139 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2137 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
@@ -2143,7 +2141,7 @@ again:
2143 if (!err) { 2141 if (!err) {
2144 num_bytes = rc->merging_rsv_size; 2142 num_bytes = rc->merging_rsv_size;
2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2143 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2146 num_bytes, &retries); 2144 num_bytes);
2147 if (ret) 2145 if (ret)
2148 err = ret; 2146 err = ret;
2149 } 2147 }
@@ -2155,7 +2153,6 @@ again:
2155 btrfs_end_transaction(trans, rc->extent_root); 2153 btrfs_end_transaction(trans, rc->extent_root);
2156 btrfs_block_rsv_release(rc->extent_root, 2154 btrfs_block_rsv_release(rc->extent_root,
2157 rc->block_rsv, num_bytes); 2155 rc->block_rsv, num_bytes);
2158 retries = 0;
2159 goto again; 2156 goto again;
2160 } 2157 }
2161 } 2158 }
@@ -2405,15 +2402,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2402 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2406 2403
2407 trans->block_rsv = rc->block_rsv; 2404 trans->block_rsv = rc->block_rsv;
2408 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, 2405 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
2409 &rc->block_rsv_retries);
2410 if (ret) { 2406 if (ret) {
2411 if (ret == -EAGAIN) 2407 if (ret == -EAGAIN)
2412 rc->commit_transaction = 1; 2408 rc->commit_transaction = 1;
2413 return ret; 2409 return ret;
2414 } 2410 }
2415 2411
2416 rc->block_rsv_retries = 0;
2417 return 0; 2412 return 0;
2418} 2413}
2419 2414
@@ -3099,6 +3094,8 @@ static int add_tree_block(struct reloc_control *rc,
3099 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3094 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3100 ret = get_ref_objectid_v0(rc, path, extent_key, 3095 ret = get_ref_objectid_v0(rc, path, extent_key,
3101 &ref_owner, NULL); 3096 &ref_owner, NULL);
3097 if (ret < 0)
3098 return ret;
3102 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); 3099 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3103 level = (int)ref_owner; 3100 level = (int)ref_owner;
3104 /* FIXME: get real generation */ 3101 /* FIXME: get real generation */
@@ -3191,6 +3188,54 @@ static int block_use_full_backref(struct reloc_control *rc,
3191 return ret; 3188 return ret;
3192} 3189}
3193 3190
3191static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3192 struct inode *inode, u64 ino)
3193{
3194 struct btrfs_key key;
3195 struct btrfs_path *path;
3196 struct btrfs_root *root = fs_info->tree_root;
3197 struct btrfs_trans_handle *trans;
3198 unsigned long nr;
3199 int ret = 0;
3200
3201 if (inode)
3202 goto truncate;
3203
3204 key.objectid = ino;
3205 key.type = BTRFS_INODE_ITEM_KEY;
3206 key.offset = 0;
3207
3208 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3209 if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
3210 if (inode && !IS_ERR(inode))
3211 iput(inode);
3212 return -ENOENT;
3213 }
3214
3215truncate:
3216 path = btrfs_alloc_path();
3217 if (!path) {
3218 ret = -ENOMEM;
3219 goto out;
3220 }
3221
3222 trans = btrfs_join_transaction(root, 0);
3223 if (IS_ERR(trans)) {
3224 btrfs_free_path(path);
3225 goto out;
3226 }
3227
3228 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3229
3230 btrfs_free_path(path);
3231 nr = trans->blocks_used;
3232 btrfs_end_transaction(trans, root);
3233 btrfs_btree_balance_dirty(root, nr);
3234out:
3235 iput(inode);
3236 return ret;
3237}
3238
3194/* 3239/*
3195 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY 3240 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3196 * this function scans fs tree to find blocks reference the data extent 3241 * this function scans fs tree to find blocks reference the data extent
@@ -3217,15 +3262,27 @@ static int find_data_references(struct reloc_control *rc,
3217 int counted; 3262 int counted;
3218 int ret; 3263 int ret;
3219 3264
3220 path = btrfs_alloc_path();
3221 if (!path)
3222 return -ENOMEM;
3223
3224 ref_root = btrfs_extent_data_ref_root(leaf, ref); 3265 ref_root = btrfs_extent_data_ref_root(leaf, ref);
3225 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); 3266 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3226 ref_offset = btrfs_extent_data_ref_offset(leaf, ref); 3267 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3227 ref_count = btrfs_extent_data_ref_count(leaf, ref); 3268 ref_count = btrfs_extent_data_ref_count(leaf, ref);
3228 3269
3270 /*
 3271	 * This is an extent belonging to the free space cache; let's just delete
3272 * it and redo the search.
3273 */
3274 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3275 ret = delete_block_group_cache(rc->extent_root->fs_info,
3276 NULL, ref_objectid);
3277 if (ret != -ENOENT)
3278 return ret;
3279 ret = 0;
3280 }
3281
3282 path = btrfs_alloc_path();
3283 if (!path)
3284 return -ENOMEM;
3285
3229 root = read_fs_root(rc->extent_root->fs_info, ref_root); 3286 root = read_fs_root(rc->extent_root->fs_info, ref_root);
3230 if (IS_ERR(root)) { 3287 if (IS_ERR(root)) {
3231 err = PTR_ERR(root); 3288 err = PTR_ERR(root);
@@ -3554,8 +3611,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3554 * is no reservation in transaction handle. 3611 * is no reservation in transaction handle.
3555 */ 3612 */
3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3613 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3557 rc->extent_root->nodesize * 256, 3614 rc->extent_root->nodesize * 256);
3558 &rc->block_rsv_retries);
3559 if (ret) 3615 if (ret)
3560 return ret; 3616 return ret;
3561 3617
@@ -3567,7 +3623,6 @@ int prepare_to_relocate(struct reloc_control *rc)
3567 rc->extents_found = 0; 3623 rc->extents_found = 0;
3568 rc->nodes_relocated = 0; 3624 rc->nodes_relocated = 0;
3569 rc->merging_rsv_size = 0; 3625 rc->merging_rsv_size = 0;
3570 rc->block_rsv_retries = 0;
3571 3626
3572 rc->create_reloc_tree = 1; 3627 rc->create_reloc_tree = 1;
3573 set_reloc_control(rc); 3628 set_reloc_control(rc);
@@ -3860,6 +3915,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3860{ 3915{
3861 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3916 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3862 struct reloc_control *rc; 3917 struct reloc_control *rc;
3918 struct inode *inode;
3919 struct btrfs_path *path;
3863 int ret; 3920 int ret;
3864 int rw = 0; 3921 int rw = 0;
3865 int err = 0; 3922 int err = 0;
@@ -3882,6 +3939,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3882 rw = 1; 3939 rw = 1;
3883 } 3940 }
3884 3941
3942 path = btrfs_alloc_path();
3943 if (!path) {
3944 err = -ENOMEM;
3945 goto out;
3946 }
3947
3948 inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
3949 path);
3950 btrfs_free_path(path);
3951
3952 if (!IS_ERR(inode))
3953 ret = delete_block_group_cache(fs_info, inode, 0);
3954 else
3955 ret = PTR_ERR(inode);
3956
3957 if (ret && ret != -ENOENT) {
3958 err = ret;
3959 goto out;
3960 }
3961
3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3962 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3886 if (IS_ERR(rc->data_inode)) { 3963 if (IS_ERR(rc->data_inode)) {
3887 err = PTR_ERR(rc->data_inode); 3964 err = PTR_ERR(rc->data_inode);
@@ -4143,7 +4220,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4143 btrfs_add_ordered_sum(inode, ordered, sums); 4220 btrfs_add_ordered_sum(inode, ordered, sums);
4144 } 4221 }
4145 btrfs_put_ordered_extent(ordered); 4222 btrfs_put_ordered_extent(ordered);
4146 return 0; 4223 return ret;
4147} 4224}
4148 4225
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 4226void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2d958be761c..6a1086e83ff 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -181,7 +181,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
181int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) 181int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
182{ 182{
183 struct btrfs_root *dead_root; 183 struct btrfs_root *dead_root;
184 struct btrfs_item *item;
185 struct btrfs_root_item *ri; 184 struct btrfs_root_item *ri;
186 struct btrfs_key key; 185 struct btrfs_key key;
187 struct btrfs_key found_key; 186 struct btrfs_key found_key;
@@ -214,7 +213,6 @@ again:
214 nritems = btrfs_header_nritems(leaf); 213 nritems = btrfs_header_nritems(leaf);
215 slot = path->slots[0]; 214 slot = path->slots[0];
216 } 215 }
217 item = btrfs_item_nr(leaf, slot);
218 btrfs_item_key_to_cpu(leaf, &key, slot); 216 btrfs_item_key_to_cpu(leaf, &key, slot);
219 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) 217 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
220 goto next; 218 goto next;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f2393b39031..8299a25ffc8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,6 +61,8 @@ static void btrfs_put_super(struct super_block *sb)
61 61
62 ret = close_ctree(root); 62 ret = close_ctree(root);
63 sb->s_fs_info = NULL; 63 sb->s_fs_info = NULL;
64
65 (void)ret; /* FIXME: need to fix VFS to return error? */
64} 66}
65 67
66enum { 68enum {
@@ -68,7 +70,8 @@ enum {
68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 70 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 71 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 72 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
71 Opt_discard, Opt_err, 73 Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
74 Opt_user_subvol_rm_allowed,
72}; 75};
73 76
74static match_table_t tokens = { 77static match_table_t tokens = {
@@ -92,6 +95,9 @@ static match_table_t tokens = {
92 {Opt_flushoncommit, "flushoncommit"}, 95 {Opt_flushoncommit, "flushoncommit"},
93 {Opt_ratio, "metadata_ratio=%d"}, 96 {Opt_ratio, "metadata_ratio=%d"},
94 {Opt_discard, "discard"}, 97 {Opt_discard, "discard"},
98 {Opt_space_cache, "space_cache"},
99 {Opt_clear_cache, "clear_cache"},
100 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
95 {Opt_err, NULL}, 101 {Opt_err, NULL},
96}; 102};
97 103
@@ -235,6 +241,16 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
235 case Opt_discard: 241 case Opt_discard:
236 btrfs_set_opt(info->mount_opt, DISCARD); 242 btrfs_set_opt(info->mount_opt, DISCARD);
237 break; 243 break;
244 case Opt_space_cache:
245 printk(KERN_INFO "btrfs: enabling disk space caching\n");
246 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
247 case Opt_clear_cache:
248 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
249 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
250 break;
251 case Opt_user_subvol_rm_allowed:
252 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
253 break;
238 case Opt_err: 254 case Opt_err:
239 printk(KERN_INFO "btrfs: unrecognized mount option " 255 printk(KERN_INFO "btrfs: unrecognized mount option "
240 "'%s'\n", p); 256 "'%s'\n", p);
@@ -380,7 +396,7 @@ static struct dentry *get_default_root(struct super_block *sb,
380find_root: 396find_root:
381 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 397 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
382 if (IS_ERR(new_root)) 398 if (IS_ERR(new_root))
383 return ERR_PTR(PTR_ERR(new_root)); 399 return ERR_CAST(new_root);
384 400
385 if (btrfs_root_refs(&new_root->root_item) == 0) 401 if (btrfs_root_refs(&new_root->root_item) == 0)
386 return ERR_PTR(-ENOENT); 402 return ERR_PTR(-ENOENT);
@@ -436,7 +452,6 @@ static int btrfs_fill_super(struct super_block *sb,
436{ 452{
437 struct inode *inode; 453 struct inode *inode;
438 struct dentry *root_dentry; 454 struct dentry *root_dentry;
439 struct btrfs_super_block *disk_super;
440 struct btrfs_root *tree_root; 455 struct btrfs_root *tree_root;
441 struct btrfs_key key; 456 struct btrfs_key key;
442 int err; 457 int err;
@@ -458,7 +473,6 @@ static int btrfs_fill_super(struct super_block *sb,
458 return PTR_ERR(tree_root); 473 return PTR_ERR(tree_root);
459 } 474 }
460 sb->s_fs_info = tree_root; 475 sb->s_fs_info = tree_root;
461 disk_super = &tree_root->fs_info->super_copy;
462 476
463 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 477 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
464 key.type = BTRFS_INODE_ITEM_KEY; 478 key.type = BTRFS_INODE_ITEM_KEY;
@@ -560,8 +574,8 @@ static int btrfs_test_super(struct super_block *s, void *data)
560 * Note: This is based on get_sb_bdev from fs/super.c with a few additions 574 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
561 * for multiple device setup. Make sure to keep it in sync. 575 * for multiple device setup. Make sure to keep it in sync.
562 */ 576 */
563static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 577static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
564 const char *dev_name, void *data, struct vfsmount *mnt) 578 const char *dev_name, void *data)
565{ 579{
566 struct block_device *bdev = NULL; 580 struct block_device *bdev = NULL;
567 struct super_block *s; 581 struct super_block *s;
@@ -571,7 +585,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
571 char *subvol_name = NULL; 585 char *subvol_name = NULL;
572 u64 subvol_objectid = 0; 586 u64 subvol_objectid = 0;
573 int error = 0; 587 int error = 0;
574 int found = 0;
575 588
576 if (!(flags & MS_RDONLY)) 589 if (!(flags & MS_RDONLY))
577 mode |= FMODE_WRITE; 590 mode |= FMODE_WRITE;
@@ -580,7 +593,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
580 &subvol_name, &subvol_objectid, 593 &subvol_name, &subvol_objectid,
581 &fs_devices); 594 &fs_devices);
582 if (error) 595 if (error)
583 return error; 596 return ERR_PTR(error);
584 597
585 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); 598 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
586 if (error) 599 if (error)
@@ -607,7 +620,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
607 goto error_close_devices; 620 goto error_close_devices;
608 } 621 }
609 622
610 found = 1;
611 btrfs_close_devices(fs_devices); 623 btrfs_close_devices(fs_devices);
612 } else { 624 } else {
613 char b[BDEVNAME_SIZE]; 625 char b[BDEVNAME_SIZE];
@@ -629,7 +641,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
629 if (IS_ERR(root)) { 641 if (IS_ERR(root)) {
630 error = PTR_ERR(root); 642 error = PTR_ERR(root);
631 deactivate_locked_super(s); 643 deactivate_locked_super(s);
632 goto error; 644 goto error_free_subvol_name;
633 } 645 }
634 /* if they gave us a subvolume name bind mount into that */ 646 /* if they gave us a subvolume name bind mount into that */
635 if (strcmp(subvol_name, ".")) { 647 if (strcmp(subvol_name, ".")) {
@@ -643,24 +655,21 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
643 deactivate_locked_super(s); 655 deactivate_locked_super(s);
644 error = PTR_ERR(new_root); 656 error = PTR_ERR(new_root);
645 dput(root); 657 dput(root);
646 goto error_close_devices; 658 goto error_free_subvol_name;
647 } 659 }
648 if (!new_root->d_inode) { 660 if (!new_root->d_inode) {
649 dput(root); 661 dput(root);
650 dput(new_root); 662 dput(new_root);
651 deactivate_locked_super(s); 663 deactivate_locked_super(s);
652 error = -ENXIO; 664 error = -ENXIO;
653 goto error_close_devices; 665 goto error_free_subvol_name;
654 } 666 }
655 dput(root); 667 dput(root);
656 root = new_root; 668 root = new_root;
657 } 669 }
658 670
659 mnt->mnt_sb = s;
660 mnt->mnt_root = root;
661
662 kfree(subvol_name); 671 kfree(subvol_name);
663 return 0; 672 return root;
664 673
665error_s: 674error_s:
666 error = PTR_ERR(s); 675 error = PTR_ERR(s);
@@ -668,8 +677,7 @@ error_close_devices:
668 btrfs_close_devices(fs_devices); 677 btrfs_close_devices(fs_devices);
669error_free_subvol_name: 678error_free_subvol_name:
670 kfree(subvol_name); 679 kfree(subvol_name);
671error: 680 return ERR_PTR(error);
672 return error;
673} 681}
674 682
675static int btrfs_remount(struct super_block *sb, int *flags, char *data) 683static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -716,18 +724,25 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
716 struct list_head *head = &root->fs_info->space_info; 724 struct list_head *head = &root->fs_info->space_info;
717 struct btrfs_space_info *found; 725 struct btrfs_space_info *found;
718 u64 total_used = 0; 726 u64 total_used = 0;
727 u64 total_used_data = 0;
719 int bits = dentry->d_sb->s_blocksize_bits; 728 int bits = dentry->d_sb->s_blocksize_bits;
720 __be32 *fsid = (__be32 *)root->fs_info->fsid; 729 __be32 *fsid = (__be32 *)root->fs_info->fsid;
721 730
722 rcu_read_lock(); 731 rcu_read_lock();
723 list_for_each_entry_rcu(found, head, list) 732 list_for_each_entry_rcu(found, head, list) {
733 if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
734 BTRFS_BLOCK_GROUP_SYSTEM))
735 total_used_data += found->disk_total;
736 else
737 total_used_data += found->disk_used;
724 total_used += found->disk_used; 738 total_used += found->disk_used;
739 }
725 rcu_read_unlock(); 740 rcu_read_unlock();
726 741
727 buf->f_namelen = BTRFS_NAME_LEN; 742 buf->f_namelen = BTRFS_NAME_LEN;
728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 743 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
729 buf->f_bfree = buf->f_blocks - (total_used >> bits); 744 buf->f_bfree = buf->f_blocks - (total_used >> bits);
730 buf->f_bavail = buf->f_bfree; 745 buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
731 buf->f_bsize = dentry->d_sb->s_blocksize; 746 buf->f_bsize = dentry->d_sb->s_blocksize;
732 buf->f_type = BTRFS_SUPER_MAGIC; 747 buf->f_type = BTRFS_SUPER_MAGIC;
733 748
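The split accounting above makes f_bavail stop promising space that ordinary writes cannot use. In shorthand, summing over the space_infos walked above:

	f_bfree  = f_blocks - sum(disk_used, all groups)
	f_bavail = f_blocks - (sum(disk_total, metadata+system) + sum(disk_used, data))

So with a 1 GiB metadata chunk of which only 100 MiB is used, the idle 900 MiB still counts toward f_bfree, but the whole 1 GiB allocation is excluded from f_bavail, since file data can only land in data block groups.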
@@ -746,7 +761,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
746static struct file_system_type btrfs_fs_type = { 761static struct file_system_type btrfs_fs_type = {
747 .owner = THIS_MODULE, 762 .owner = THIS_MODULE,
748 .name = "btrfs", 763 .name = "btrfs",
749 .get_sb = btrfs_get_sb, 764 .mount = btrfs_mount,
750 .kill_sb = kill_anon_super, 765 .kill_sb = kill_anon_super,
751 .fs_flags = FS_REQUIRES_DEV, 766 .fs_flags = FS_REQUIRES_DEV,
752}; 767};
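This follows the tree-wide .get_sb to .mount conversion: the handler now returns the root dentry (or an ERR_PTR) instead of filling in a vfsmount, which is why the mnt->mnt_sb/mnt_root assignments disappear above. btrfs open-codes the superblock setup only because of its multi-device scan; for a single-device filesystem the whole conversion collapses to the mount_bdev() helper. A hedged sketch of that simple case (example_fill_super is a hypothetical fill_super callback):

	static struct dentry *example_mount(struct file_system_type *fs_type,
					    int flags, const char *dev_name,
					    void *data)
	{
		return mount_bdev(fs_type, flags, dev_name, data,
				  example_fill_super);
	}

	static struct file_system_type example_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "example",
		.mount		= example_mount,
		.kill_sb	= kill_block_super,
		.fs_flags	= FS_REQUIRES_DEV,
	};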
@@ -797,7 +812,7 @@ static int btrfs_unfreeze(struct super_block *sb)
797 812
798static const struct super_operations btrfs_super_ops = { 813static const struct super_operations btrfs_super_ops = {
799 .drop_inode = btrfs_drop_inode, 814 .drop_inode = btrfs_drop_inode,
800 .delete_inode = btrfs_delete_inode, 815 .evict_inode = btrfs_evict_inode,
801 .put_super = btrfs_put_super, 816 .put_super = btrfs_put_super,
802 .sync_fs = btrfs_sync_fs, 817 .sync_fs = btrfs_sync_fs,
803 .show_options = btrfs_show_options, 818 .show_options = btrfs_show_options,
@@ -815,6 +830,7 @@ static const struct file_operations btrfs_ctl_fops = {
815 .unlocked_ioctl = btrfs_control_ioctl, 830 .unlocked_ioctl = btrfs_control_ioctl,
816 .compat_ioctl = btrfs_control_ioctl, 831 .compat_ioctl = btrfs_control_ioctl,
817 .owner = THIS_MODULE, 832 .owner = THIS_MODULE,
833 .llseek = noop_llseek,
818}; 834};
819 835
820static struct miscdevice btrfs_misc = { 836static struct miscdevice btrfs_misc = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66e4c66cc63..1fffbc017bd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,6 +163,7 @@ enum btrfs_trans_type {
163 TRANS_START, 163 TRANS_START,
164 TRANS_JOIN, 164 TRANS_JOIN,
165 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166 TRANS_JOIN_NOLOCK,
166}; 167};
167 168
168static int may_wait_transaction(struct btrfs_root *root, int type) 169static int may_wait_transaction(struct btrfs_root *root, int type)
@@ -179,14 +180,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
179{ 180{
180 struct btrfs_trans_handle *h; 181 struct btrfs_trans_handle *h;
181 struct btrfs_transaction *cur_trans; 182 struct btrfs_transaction *cur_trans;
182 int retries = 0;
183 int ret; 183 int ret;
184again: 184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 186 if (!h)
187 return ERR_PTR(-ENOMEM); 187 return ERR_PTR(-ENOMEM);
188 188
189 mutex_lock(&root->fs_info->trans_mutex); 189 if (type != TRANS_JOIN_NOLOCK)
190 mutex_lock(&root->fs_info->trans_mutex);
190 if (may_wait_transaction(root, type)) 191 if (may_wait_transaction(root, type))
191 wait_current_trans(root); 192 wait_current_trans(root);
192 193
@@ -195,7 +196,8 @@ again:
195 196
196 cur_trans = root->fs_info->running_transaction; 197 cur_trans = root->fs_info->running_transaction;
197 cur_trans->use_count++; 198 cur_trans->use_count++;
198 mutex_unlock(&root->fs_info->trans_mutex); 199 if (type != TRANS_JOIN_NOLOCK)
200 mutex_unlock(&root->fs_info->trans_mutex);
199 201
200 h->transid = cur_trans->transid; 202 h->transid = cur_trans->transid;
201 h->transaction = cur_trans; 203 h->transaction = cur_trans;
@@ -212,8 +214,7 @@ again:
212 } 214 }
213 215
214 if (num_items > 0) { 216 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items, 217 ret = btrfs_trans_reserve_metadata(h, root, num_items);
216 &retries);
217 if (ret == -EAGAIN) { 218 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root); 219 btrfs_commit_transaction(h, root);
219 goto again; 220 goto again;
@@ -224,9 +225,11 @@ again:
224 } 225 }
225 } 226 }
226 227
227 mutex_lock(&root->fs_info->trans_mutex); 228 if (type != TRANS_JOIN_NOLOCK)
229 mutex_lock(&root->fs_info->trans_mutex);
228 record_root_in_trans(h, root); 230 record_root_in_trans(h, root);
229 mutex_unlock(&root->fs_info->trans_mutex); 231 if (type != TRANS_JOIN_NOLOCK)
232 mutex_unlock(&root->fs_info->trans_mutex);
230 233
231 if (!current->journal_info && type != TRANS_USERSPACE) 234 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h; 235 current->journal_info = h;
@@ -244,6 +247,12 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
244 return start_transaction(root, 0, TRANS_JOIN); 247 return start_transaction(root, 0, TRANS_JOIN);
245} 248}
246 249
250struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
251 int num_blocks)
252{
253 return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
254}
255
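The _nolock join/end variants exist for code that runs while the commit machinery already holds trans_mutex, chiefly the free space cache writeout added elsewhere in this series, where re-taking the mutex would self-deadlock. A hedged sketch of the intended pairing:

	trans = btrfs_join_transaction_nolock(root, 1);
	/* ... dirty and write back the free space cache inode ... */
	btrfs_end_transaction_nolock(trans, root);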
247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 256struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
248 int num_blocks) 257 int num_blocks)
249{ 258{
@@ -270,6 +279,58 @@ static noinline int wait_for_commit(struct btrfs_root *root,
270 return 0; 279 return 0;
271} 280}
272 281
282int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
283{
284 struct btrfs_transaction *cur_trans = NULL, *t;
285 int ret;
286
287 mutex_lock(&root->fs_info->trans_mutex);
288
289 ret = 0;
290 if (transid) {
291 if (transid <= root->fs_info->last_trans_committed)
292 goto out_unlock;
293
294 /* find specified transaction */
295 list_for_each_entry(t, &root->fs_info->trans_list, list) {
296 if (t->transid == transid) {
297 cur_trans = t;
298 break;
299 }
300 if (t->transid > transid)
301 break;
302 }
303 ret = -EINVAL;
304 if (!cur_trans)
305 goto out_unlock; /* bad transid */
306 } else {
307 /* find newest transaction that is committing | committed */
308 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
309 list) {
310 if (t->in_commit) {
311 if (t->commit_done)
312 goto out_unlock;
313 cur_trans = t;
314 break;
315 }
316 }
317 if (!cur_trans)
318 goto out_unlock; /* nothing committing|committed */
319 }
320
321 cur_trans->use_count++;
322 mutex_unlock(&root->fs_info->trans_mutex);
323
324 wait_for_commit(root, cur_trans);
325
326 mutex_lock(&root->fs_info->trans_mutex);
327 put_transaction(cur_trans);
328 ret = 0;
329out_unlock:
330 mutex_unlock(&root->fs_info->trans_mutex);
331 return ret;
332}
333
273#if 0 334#if 0
274/* 335/*
275 * rate limit against the drop_snapshot code. This helps to slow down new 336 * rate limit against the drop_snapshot code. This helps to slow down new
@@ -348,7 +409,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
348} 409}
349 410
350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 411static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
351 struct btrfs_root *root, int throttle) 412 struct btrfs_root *root, int throttle, int lock)
352{ 413{
353 struct btrfs_transaction *cur_trans = trans->transaction; 414 struct btrfs_transaction *cur_trans = trans->transaction;
354 struct btrfs_fs_info *info = root->fs_info; 415 struct btrfs_fs_info *info = root->fs_info;
@@ -376,26 +437,29 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
376 437
377 btrfs_trans_release_metadata(trans, root); 438 btrfs_trans_release_metadata(trans, root);
378 439
379 if (!root->fs_info->open_ioctl_trans && 440 if (lock && !root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root)) 441 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1; 442 trans->transaction->blocked = 1;
382 443
383 if (cur_trans->blocked && !cur_trans->in_commit) { 444 if (lock && cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle) 445 if (throttle)
385 return btrfs_commit_transaction(trans, root); 446 return btrfs_commit_transaction(trans, root);
386 else 447 else
387 wake_up_process(info->transaction_kthread); 448 wake_up_process(info->transaction_kthread);
388 } 449 }
389 450
390 mutex_lock(&info->trans_mutex); 451 if (lock)
452 mutex_lock(&info->trans_mutex);
391 WARN_ON(cur_trans != info->running_transaction); 453 WARN_ON(cur_trans != info->running_transaction);
392 WARN_ON(cur_trans->num_writers < 1); 454 WARN_ON(cur_trans->num_writers < 1);
393 cur_trans->num_writers--; 455 cur_trans->num_writers--;
394 456
457 smp_mb();
395 if (waitqueue_active(&cur_trans->writer_wait)) 458 if (waitqueue_active(&cur_trans->writer_wait))
396 wake_up(&cur_trans->writer_wait); 459 wake_up(&cur_trans->writer_wait);
397 put_transaction(cur_trans); 460 put_transaction(cur_trans);
398 mutex_unlock(&info->trans_mutex); 461 if (lock)
462 mutex_unlock(&info->trans_mutex);
399 463
400 if (current->journal_info == trans) 464 if (current->journal_info == trans)
401 current->journal_info = NULL; 465 current->journal_info = NULL;
@@ -411,13 +475,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
411int btrfs_end_transaction(struct btrfs_trans_handle *trans, 475int btrfs_end_transaction(struct btrfs_trans_handle *trans,
412 struct btrfs_root *root) 476 struct btrfs_root *root)
413{ 477{
414 return __btrfs_end_transaction(trans, root, 0); 478 return __btrfs_end_transaction(trans, root, 0, 1);
415} 479}
416 480
417int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 481int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 482 struct btrfs_root *root)
419{ 483{
420 return __btrfs_end_transaction(trans, root, 1); 484 return __btrfs_end_transaction(trans, root, 1, 1);
485}
486
487int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
488 struct btrfs_root *root)
489{
490 return __btrfs_end_transaction(trans, root, 0, 0);
421} 491}
422 492
423/* 493/*
@@ -836,7 +906,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
836 struct extent_buffer *tmp; 906 struct extent_buffer *tmp;
837 struct extent_buffer *old; 907 struct extent_buffer *old;
838 int ret; 908 int ret;
839 int retries = 0;
840 u64 to_reserve = 0; 909 u64 to_reserve = 0;
841 u64 index = 0; 910 u64 index = 0;
842 u64 objectid; 911 u64 objectid;
@@ -858,7 +927,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
858 927
859 if (to_reserve > 0) { 928 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 929 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries); 930 to_reserve);
862 if (ret) { 931 if (ret) {
863 pending->error = ret; 932 pending->error = ret;
864 goto fail; 933 goto fail;
@@ -966,6 +1035,8 @@ static void update_super_roots(struct btrfs_root *root)
966 super->root = root_item->bytenr; 1035 super->root = root_item->bytenr;
967 super->generation = root_item->generation; 1036 super->generation = root_item->generation;
968 super->root_level = root_item->level; 1037 super->root_level = root_item->level;
1038 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
1039 super->cache_generation = root_item->generation;
969} 1040}
970 1041
971int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1042int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -988,11 +1059,127 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
988 return ret; 1059 return ret;
989} 1060}
990 1061
1062/*
1063 * wait for the current transaction commit to start and block subsequent
1064 * transaction joins
1065 */
1066static void wait_current_trans_commit_start(struct btrfs_root *root,
1067 struct btrfs_transaction *trans)
1068{
1069 DEFINE_WAIT(wait);
1070
1071 if (trans->in_commit)
1072 return;
1073
1074 while (1) {
1075 prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
1076 TASK_UNINTERRUPTIBLE);
1077 if (trans->in_commit) {
1078 finish_wait(&root->fs_info->transaction_blocked_wait,
1079 &wait);
1080 break;
1081 }
1082 mutex_unlock(&root->fs_info->trans_mutex);
1083 schedule();
1084 mutex_lock(&root->fs_info->trans_mutex);
1085 finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
1086 }
1087}
1088
1089/*
1090 * wait for the current transaction to start and then become unblocked.
1091 * caller holds ref.
1092 */
1093static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1094 struct btrfs_transaction *trans)
1095{
1096 DEFINE_WAIT(wait);
1097
1098 if (trans->commit_done || (trans->in_commit && !trans->blocked))
1099 return;
1100
1101 while (1) {
1102 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
1103 TASK_UNINTERRUPTIBLE);
1104 if (trans->commit_done ||
1105 (trans->in_commit && !trans->blocked)) {
1106 finish_wait(&root->fs_info->transaction_wait,
1107 &wait);
1108 break;
1109 }
1110 mutex_unlock(&root->fs_info->trans_mutex);
1111 schedule();
1112 mutex_lock(&root->fs_info->trans_mutex);
1113 finish_wait(&root->fs_info->transaction_wait,
1114 &wait);
1115 }
1116}
1117
1118/*
1119 * commit transactions asynchronously. once btrfs_commit_transaction_async
1120 * returns, any subsequent transaction will not be allowed to join.
1121 */
1122struct btrfs_async_commit {
1123 struct btrfs_trans_handle *newtrans;
1124 struct btrfs_root *root;
1125 struct delayed_work work;
1126};
1127
1128static void do_async_commit(struct work_struct *work)
1129{
1130 struct btrfs_async_commit *ac =
1131 container_of(work, struct btrfs_async_commit, work.work);
1132
1133 btrfs_commit_transaction(ac->newtrans, ac->root);
1134 kfree(ac);
1135}
1136
1137int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1138 struct btrfs_root *root,
1139 int wait_for_unblock)
1140{
1141 struct btrfs_async_commit *ac;
1142 struct btrfs_transaction *cur_trans;
1143
1144 ac = kmalloc(sizeof(*ac), GFP_NOFS);
1145 BUG_ON(!ac);
1146
1147 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1148 ac->root = root;
1149 ac->newtrans = btrfs_join_transaction(root, 0);
1150
1151 /* take transaction reference */
1152 mutex_lock(&root->fs_info->trans_mutex);
1153 cur_trans = trans->transaction;
1154 cur_trans->use_count++;
1155 mutex_unlock(&root->fs_info->trans_mutex);
1156
1157 btrfs_end_transaction(trans, root);
1158 schedule_delayed_work(&ac->work, 0);
1159
1160 /* wait for transaction to start and unblock */
1161 mutex_lock(&root->fs_info->trans_mutex);
1162 if (wait_for_unblock)
1163 wait_current_trans_commit_start_and_unblock(root, cur_trans);
1164 else
1165 wait_current_trans_commit_start(root, cur_trans);
1166 put_transaction(cur_trans);
1167 mutex_unlock(&root->fs_info->trans_mutex);
1168
1169 return 0;
1170}
1171
1172/*
1173 * btrfs_transaction state sequence:
1174 * in_commit = 0, blocked = 0 (initial)
1175 * in_commit = 1, blocked = 1
1176 * blocked = 0
1177 * commit_done = 1
1178 */
991int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1179int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
992 struct btrfs_root *root) 1180 struct btrfs_root *root)
993{ 1181{
994 unsigned long joined = 0; 1182 unsigned long joined = 0;
995 unsigned long timeout = 1;
996 struct btrfs_transaction *cur_trans; 1183 struct btrfs_transaction *cur_trans;
997 struct btrfs_transaction *prev_trans = NULL; 1184 struct btrfs_transaction *prev_trans = NULL;
998 DEFINE_WAIT(wait); 1185 DEFINE_WAIT(wait);
@@ -1039,6 +1226,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1039 1226
1040 trans->transaction->in_commit = 1; 1227 trans->transaction->in_commit = 1;
1041 trans->transaction->blocked = 1; 1228 trans->transaction->blocked = 1;
1229 wake_up(&root->fs_info->transaction_blocked_wait);
1230
1042 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1231 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1043 prev_trans = list_entry(cur_trans->list.prev, 1232 prev_trans = list_entry(cur_trans->list.prev,
1044 struct btrfs_transaction, list); 1233 struct btrfs_transaction, list);
@@ -1063,11 +1252,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1063 snap_pending = 1; 1252 snap_pending = 1;
1064 1253
1065 WARN_ON(cur_trans != trans->transaction); 1254 WARN_ON(cur_trans != trans->transaction);
1066 if (cur_trans->num_writers > 1)
1067 timeout = MAX_SCHEDULE_TIMEOUT;
1068 else if (should_grow)
1069 timeout = 1;
1070
1071 mutex_unlock(&root->fs_info->trans_mutex); 1255 mutex_unlock(&root->fs_info->trans_mutex);
1072 1256
1073 if (flush_on_commit || snap_pending) { 1257 if (flush_on_commit || snap_pending) {
@@ -1089,8 +1273,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1089 TASK_UNINTERRUPTIBLE); 1273 TASK_UNINTERRUPTIBLE);
1090 1274
1091 smp_mb(); 1275 smp_mb();
1092 if (cur_trans->num_writers > 1 || should_grow) 1276 if (cur_trans->num_writers > 1)
1093 schedule_timeout(timeout); 1277 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1278 else if (should_grow)
1279 schedule_timeout(1);
1094 1280
1095 mutex_lock(&root->fs_info->trans_mutex); 1281 mutex_lock(&root->fs_info->trans_mutex);
1096 finish_wait(&cur_trans->writer_wait, &wait); 1282 finish_wait(&cur_trans->writer_wait, &wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e104986d0bf..f104b57ad4e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -87,12 +87,17 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
87 87
88int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
89 struct btrfs_root *root); 89 struct btrfs_root *root);
90int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
91 int num_items); 93 int num_items);
92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 94struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
93 int num_blocks); 95 int num_blocks);
96struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
97 int num_blocks);
94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 98struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
95 int num_blocks); 99 int num_blocks);
100int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 101int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root); 102 struct btrfs_root *root);
98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 103int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -104,6 +109,9 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
104int btrfs_clean_old_snapshots(struct btrfs_root *root); 109int btrfs_clean_old_snapshots(struct btrfs_root *root);
105int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 110int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
106 struct btrfs_root *root); 111 struct btrfs_root *root);
112int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
113 struct btrfs_root *root,
114 int wait_for_unblock);
107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 115int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 116 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 117int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f7ac8e013ed..992ab425599 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
36 int ret = 0; 36 int ret = 0;
37 int wret; 37 int wret;
38 int level; 38 int level;
39 int orig_level;
40 int is_extent = 0; 39 int is_extent = 0;
41 int next_key_ret = 0; 40 int next_key_ret = 0;
42 u64 last_ret = 0; 41 u64 last_ret = 0;
@@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
64 return -ENOMEM; 63 return -ENOMEM;
65 64
66 level = btrfs_header_level(root->node); 65 level = btrfs_header_level(root->node);
67 orig_level = level;
68 66
69 if (level == 0) 67 if (level == 0)
70 goto out; 68 goto out;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fb102a9aee9..a29f19384a2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -786,7 +786,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
786{ 786{
787 struct inode *dir; 787 struct inode *dir;
788 int ret; 788 int ret;
789 struct btrfs_key location;
790 struct btrfs_inode_ref *ref; 789 struct btrfs_inode_ref *ref;
791 struct btrfs_dir_item *di; 790 struct btrfs_dir_item *di;
792 struct inode *inode; 791 struct inode *inode;
@@ -795,10 +794,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
795 unsigned long ref_ptr; 794 unsigned long ref_ptr;
796 unsigned long ref_end; 795 unsigned long ref_end;
797 796
798 location.objectid = key->objectid;
799 location.type = BTRFS_INODE_ITEM_KEY;
800 location.offset = 0;
801
802 /* 797 /*
803 * it is possible that we didn't log all the parent directories 798 * it is possible that we didn't log all the parent directories
804 * for a given inode. If we don't find the dir, just don't 799 * for a given inode. If we don't find the dir, just don't
@@ -1583,7 +1578,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1583 struct btrfs_path *path; 1578 struct btrfs_path *path;
1584 struct btrfs_root *root = wc->replay_dest; 1579 struct btrfs_root *root = wc->replay_dest;
1585 struct btrfs_key key; 1580 struct btrfs_key key;
1586 u32 item_size;
1587 int level; 1581 int level;
1588 int i; 1582 int i;
1589 int ret; 1583 int ret;
@@ -1601,7 +1595,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1601 nritems = btrfs_header_nritems(eb); 1595 nritems = btrfs_header_nritems(eb);
1602 for (i = 0; i < nritems; i++) { 1596 for (i = 0; i < nritems; i++) {
1603 btrfs_item_key_to_cpu(eb, &key, i); 1597 btrfs_item_key_to_cpu(eb, &key, i);
1604 item_size = btrfs_item_size_nr(eb, i);
1605 1598
1606 /* inode keys are done during the first stage */ 1599 /* inode keys are done during the first stage */
1607 if (key.type == BTRFS_INODE_ITEM_KEY && 1600 if (key.type == BTRFS_INODE_ITEM_KEY &&
@@ -1668,7 +1661,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1668 struct walk_control *wc) 1661 struct walk_control *wc)
1669{ 1662{
1670 u64 root_owner; 1663 u64 root_owner;
1671 u64 root_gen;
1672 u64 bytenr; 1664 u64 bytenr;
1673 u64 ptr_gen; 1665 u64 ptr_gen;
1674 struct extent_buffer *next; 1666 struct extent_buffer *next;
@@ -1698,7 +1690,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1698 1690
1699 parent = path->nodes[*level]; 1691 parent = path->nodes[*level];
1700 root_owner = btrfs_header_owner(parent); 1692 root_owner = btrfs_header_owner(parent);
1701 root_gen = btrfs_header_generation(parent);
1702 1693
1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1704 1695
@@ -1749,7 +1740,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1749 struct walk_control *wc) 1740 struct walk_control *wc)
1750{ 1741{
1751 u64 root_owner; 1742 u64 root_owner;
1752 u64 root_gen;
1753 int i; 1743 int i;
1754 int slot; 1744 int slot;
1755 int ret; 1745 int ret;
@@ -1757,8 +1747,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1747 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1758 slot = path->slots[i]; 1748 slot = path->slots[i];
1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 1749 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1760 struct extent_buffer *node;
1761 node = path->nodes[i];
1762 path->slots[i]++; 1750 path->slots[i]++;
1763 *level = i; 1751 *level = i;
1764 WARN_ON(*level == 0); 1752 WARN_ON(*level == 0);
@@ -1771,7 +1759,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1771 parent = path->nodes[*level + 1]; 1759 parent = path->nodes[*level + 1];
1772 1760
1773 root_owner = btrfs_header_owner(parent); 1761 root_owner = btrfs_header_owner(parent);
1774 root_gen = btrfs_header_generation(parent);
1775 wc->process_func(root, path->nodes[*level], wc, 1762 wc->process_func(root, path->nodes[*level], wc,
1776 btrfs_header_generation(path->nodes[*level])); 1763 btrfs_header_generation(path->nodes[*level]));
1777 if (wc->free) { 1764 if (wc->free) {
@@ -2273,7 +2260,7 @@ fail:
2273 } 2260 }
2274 btrfs_end_log_trans(root); 2261 btrfs_end_log_trans(root);
2275 2262
2276 return 0; 2263 return err;
2277} 2264}
2278 2265
2279/* see comments for btrfs_del_dir_entries_in_log */ 2266/* see comments for btrfs_del_dir_entries_in_log */
@@ -2729,7 +2716,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2729 struct btrfs_key max_key; 2716 struct btrfs_key max_key;
2730 struct btrfs_root *log = root->log_root; 2717 struct btrfs_root *log = root->log_root;
2731 struct extent_buffer *src = NULL; 2718 struct extent_buffer *src = NULL;
2732 u32 size;
2733 int err = 0; 2719 int err = 0;
2734 int ret; 2720 int ret;
2735 int nritems; 2721 int nritems;
@@ -2793,7 +2779,6 @@ again:
2793 break; 2779 break;
2794 2780
2795 src = path->nodes[0]; 2781 src = path->nodes[0];
2796 size = btrfs_item_size_nr(src, path->slots[0]);
2797 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2782 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2798 ins_nr++; 2783 ins_nr++;
2799 goto next_slot; 2784 goto next_slot;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d6e3af8be95..cc04dc1445d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -258,7 +258,7 @@ loop_lock:
258 258
259 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
260 260
261 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 261 if (cur->bi_rw & REQ_SYNC)
262 num_sync_run++; 262 num_sync_run++;
263 263
264 submit_bio(cur->bi_rw, cur); 264 submit_bio(cur->bi_rw, cur);
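The volumes.c hunks here track the block-layer flag unification: the old per-bio BIO_RW_* bit numbers (tested via bio_rw_flagged() or open-coded 1 << BIO_RW shifts) were merged with the request flags into a single REQ_* namespace in <linux/blk_types.h>. A minimal before/after sketch, assuming those definitions:

	#include <linux/bio.h>
	#include <linux/blk_types.h>

	/* Old style: bio-private flag bits, distinct from request flags.
	 *	if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) ...
	 *	if (rw & (1 << BIO_RW)) ...
	 * New style: bi_rw carries REQ_* flags shared with requests. */
	static inline bool sketch_bio_is_sync(struct bio *bio)
	{
		return bio->bi_rw & REQ_SYNC;
	}

	static inline bool sketch_rw_is_write(int rw)
	{
		return rw & REQ_WRITE;
	}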
@@ -398,7 +398,6 @@ static noinline int device_list_add(const char *path,
398 device->work.func = pending_bios_fn; 398 device->work.func = pending_bios_fn;
399 memcpy(device->uuid, disk_super->dev_item.uuid, 399 memcpy(device->uuid, disk_super->dev_item.uuid,
400 BTRFS_UUID_SIZE); 400 BTRFS_UUID_SIZE);
401 device->barriers = 1;
402 spin_lock_init(&device->io_lock); 401 spin_lock_init(&device->io_lock);
403 device->name = kstrdup(path, GFP_NOFS); 402 device->name = kstrdup(path, GFP_NOFS);
404 if (!device->name) { 403 if (!device->name) {
@@ -462,7 +461,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
462 device->devid = orig_dev->devid; 461 device->devid = orig_dev->devid;
463 device->work.func = pending_bios_fn; 462 device->work.func = pending_bios_fn;
464 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 463 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
465 device->barriers = 1;
466 spin_lock_init(&device->io_lock); 464 spin_lock_init(&device->io_lock);
467 INIT_LIST_HEAD(&device->dev_list); 465 INIT_LIST_HEAD(&device->dev_list);
468 INIT_LIST_HEAD(&device->dev_alloc_list); 466 INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -1489,7 +1487,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1489 trans = btrfs_start_transaction(root, 0); 1487 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1488 lock_chunks(root);
1491 1489
1492 device->barriers = 1;
1493 device->writeable = 1; 1490 device->writeable = 1;
1494 device->work.func = pending_bios_fn; 1491 device->work.func = pending_bios_fn;
1495 generate_random_uuid(device->uuid); 1492 generate_random_uuid(device->uuid);
@@ -1901,7 +1898,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
1901 u64 size_to_free; 1898 u64 size_to_free;
1902 struct btrfs_path *path; 1899 struct btrfs_path *path;
1903 struct btrfs_key key; 1900 struct btrfs_key key;
1904 struct btrfs_chunk *chunk;
1905 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 1901 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1906 struct btrfs_trans_handle *trans; 1902 struct btrfs_trans_handle *trans;
1907 struct btrfs_key found_key; 1903 struct btrfs_key found_key;
@@ -1965,9 +1961,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
1965 if (found_key.objectid != key.objectid) 1961 if (found_key.objectid != key.objectid)
1966 break; 1962 break;
1967 1963
1968 chunk = btrfs_item_ptr(path->nodes[0],
1969 path->slots[0],
1970 struct btrfs_chunk);
1971 /* chunk zero is special */ 1964 /* chunk zero is special */
1972 if (found_key.offset == 0) 1965 if (found_key.offset == 0)
1973 break; 1966 break;
@@ -2651,7 +2644,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2651 int max_errors = 0; 2644 int max_errors = 0;
2652 struct btrfs_multi_bio *multi = NULL; 2645 struct btrfs_multi_bio *multi = NULL;
2653 2646
2654 if (multi_ret && !(rw & (1 << BIO_RW))) 2647 if (multi_ret && !(rw & REQ_WRITE))
2655 stripes_allocated = 1; 2648 stripes_allocated = 1;
2656again: 2649again:
2657 if (multi_ret) { 2650 if (multi_ret) {
@@ -2687,7 +2680,7 @@ again:
2687 mirror_num = 0; 2680 mirror_num = 0;
2688 2681
2689 /* if our multi bio struct is too small, back off and try again */ 2682 /* if our multi bio struct is too small, back off and try again */
2690 if (rw & (1 << BIO_RW)) { 2683 if (rw & REQ_WRITE) {
2691 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2684 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2692 BTRFS_BLOCK_GROUP_DUP)) { 2685 BTRFS_BLOCK_GROUP_DUP)) {
2693 stripes_required = map->num_stripes; 2686 stripes_required = map->num_stripes;
@@ -2697,7 +2690,7 @@ again:
2697 max_errors = 1; 2690 max_errors = 1;
2698 } 2691 }
2699 } 2692 }
2700 if (multi_ret && (rw & (1 << BIO_RW)) && 2693 if (multi_ret && (rw & REQ_WRITE) &&
2701 stripes_allocated < stripes_required) { 2694 stripes_allocated < stripes_required) {
2702 stripes_allocated = map->num_stripes; 2695 stripes_allocated = map->num_stripes;
2703 free_extent_map(em); 2696 free_extent_map(em);
@@ -2733,7 +2726,7 @@ again:
2733 num_stripes = 1; 2726 num_stripes = 1;
2734 stripe_index = 0; 2727 stripe_index = 0;
2735 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2728 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2736 if (unplug_page || (rw & (1 << BIO_RW))) 2729 if (unplug_page || (rw & REQ_WRITE))
2737 num_stripes = map->num_stripes; 2730 num_stripes = map->num_stripes;
2738 else if (mirror_num) 2731 else if (mirror_num)
2739 stripe_index = mirror_num - 1; 2732 stripe_index = mirror_num - 1;
@@ -2744,7 +2737,7 @@ again:
2744 } 2737 }
2745 2738
2746 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2739 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2747 if (rw & (1 << BIO_RW)) 2740 if (rw & REQ_WRITE)
2748 num_stripes = map->num_stripes; 2741 num_stripes = map->num_stripes;
2749 else if (mirror_num) 2742 else if (mirror_num)
2750 stripe_index = mirror_num - 1; 2743 stripe_index = mirror_num - 1;
@@ -2755,7 +2748,7 @@ again:
2755 stripe_index = do_div(stripe_nr, factor); 2748 stripe_index = do_div(stripe_nr, factor);
2756 stripe_index *= map->sub_stripes; 2749 stripe_index *= map->sub_stripes;
2757 2750
2758 if (unplug_page || (rw & (1 << BIO_RW))) 2751 if (unplug_page || (rw & REQ_WRITE))
2759 num_stripes = map->sub_stripes; 2752 num_stripes = map->sub_stripes;
2760 else if (mirror_num) 2753 else if (mirror_num)
2761 stripe_index += mirror_num - 1; 2754 stripe_index += mirror_num - 1;
@@ -2945,7 +2938,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2945 struct btrfs_pending_bios *pending_bios; 2938 struct btrfs_pending_bios *pending_bios;
2946 2939
2947 /* don't bother with additional async steps for reads, right now */ 2940 /* don't bother with additional async steps for reads, right now */
2948 if (!(rw & (1 << BIO_RW))) { 2941 if (!(rw & REQ_WRITE)) {
2949 bio_get(bio); 2942 bio_get(bio);
2950 submit_bio(rw, bio); 2943 submit_bio(rw, bio);
2951 bio_put(bio); 2944 bio_put(bio);
@@ -2964,7 +2957,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2964 bio->bi_rw |= rw; 2957 bio->bi_rw |= rw;
2965 2958
2966 spin_lock(&device->io_lock); 2959 spin_lock(&device->io_lock);
2967 if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) 2960 if (bio->bi_rw & REQ_SYNC)
2968 pending_bios = &device->pending_sync_bios; 2961 pending_bios = &device->pending_sync_bios;
2969 else 2962 else
2970 pending_bios = &device->pending_bios; 2963 pending_bios = &device->pending_bios;
@@ -3034,8 +3027,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3034 } 3027 }
3035 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3028 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3036 dev = multi->stripes[dev_nr].dev; 3029 dev = multi->stripes[dev_nr].dev;
3037 BUG_ON(rw == WRITE && !dev->writeable); 3030 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3038 if (dev && dev->bdev) {
3039 bio->bi_bdev = dev->bdev; 3031 bio->bi_bdev = dev->bdev;
3040 if (async_submit) 3032 if (async_submit)
3041 schedule_bio(root, dev, rw, bio); 3033 schedule_bio(root, dev, rw, bio);
@@ -3084,7 +3076,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3084 return NULL; 3076 return NULL;
3085 list_add(&device->dev_list, 3077 list_add(&device->dev_list,
3086 &fs_devices->devices); 3078 &fs_devices->devices);
3087 device->barriers = 1;
3088 device->dev_root = root->fs_info->dev_root; 3079 device->dev_root = root->fs_info->dev_root;
3089 device->devid = devid; 3080 device->devid = devid;
3090 device->work.func = pending_bios_fn; 3081 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2e..2b638b6e4ee 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,7 +42,6 @@ struct btrfs_device {
42 int running_pending; 42 int running_pending;
43 u64 generation; 43 u64 generation;
44 44
45 int barriers;
46 int writeable; 45 int writeable;
47 int in_fs_metadata; 46 int in_fs_metadata;
48 47
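Removing the per-device barriers flag here (together with the BH_Eopnotsupp and WRITE_BARRIER deletions in fs/buffer.c further down) appears to belong to the series that retires the barrier probe-and-fallback dance: callers no longer test whether a device honours barriers and silently retry without them on -EOPNOTSUPP. A sketch of the pattern being deleted, with hypothetical helper names:

	/* Old fallback (removed): per-device barrier probing. */
	if (device->barriers) {
		ret = submit_barrier_write(device, bh);	/* hypothetical */
		if (ret == -EOPNOTSUPP) {
			device->barriers = 0;	/* never try again */
			ret = submit_plain_write(device, bh);	/* hypothetical */
		}
	} else {
		ret = submit_plain_write(device, bh);
	}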
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 88ecbb21587..698fdd2c739 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -178,7 +178,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
178 struct inode *inode = dentry->d_inode; 178 struct inode *inode = dentry->d_inode;
179 struct btrfs_root *root = BTRFS_I(inode)->root; 179 struct btrfs_root *root = BTRFS_I(inode)->root;
180 struct btrfs_path *path; 180 struct btrfs_path *path;
181 struct btrfs_item *item;
182 struct extent_buffer *leaf; 181 struct extent_buffer *leaf;
183 struct btrfs_dir_item *di; 182 struct btrfs_dir_item *di;
184 int ret = 0, slot, advance; 183 int ret = 0, slot, advance;
@@ -234,7 +233,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
234 } 233 }
235 advance = 1; 234 advance = 1;
236 235
237 item = btrfs_item_nr(leaf, slot);
238 btrfs_item_key_to_cpu(leaf, &found_key, slot); 236 btrfs_item_key_to_cpu(leaf, &found_key, slot);
239 237
240 /* check to make sure this item is what we want */ 238 /* check to make sure this item is what we want */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 3e2b90eaa23..b9cd5445f71 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -199,8 +199,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
199 int nr_pages = 0; 199 int nr_pages = 0;
200 struct page *in_page = NULL; 200 struct page *in_page = NULL;
201 struct page *out_page = NULL; 201 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left; 202 unsigned long bytes_left;
205 203
206 *out_pages = 0; 204 *out_pages = 0;
@@ -233,9 +231,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 231 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 232 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235 233
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) { 234 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 235 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) { 236 if (ret != Z_OK) {
diff --git a/fs/buffer.c b/fs/buffer.c
index d54812b198e..5930e382959 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
156 if (uptodate) { 156 if (uptodate) {
157 set_buffer_uptodate(bh); 157 set_buffer_uptodate(bh);
158 } else { 158 } else {
159 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { 159 if (!quiet_error(bh)) {
160 buffer_io_error(bh); 160 buffer_io_error(bh);
161 printk(KERN_WARNING "lost page write due to " 161 printk(KERN_WARNING "lost page write due to "
162 "I/O error on %s\n", 162 "I/O error on %s\n",
@@ -770,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
770 spin_unlock(lock); 770 spin_unlock(lock);
771 /* 771 /*
772 * Ensure any pending I/O completes so that 772 * Ensure any pending I/O completes so that
773 * ll_rw_block() actually writes the current 773 * write_dirty_buffer() actually writes the
774 * contents - it is a noop if I/O is still in 774 * current contents - it is a noop if I/O is
775 * flight on potentially older contents. 775 * still in flight on potentially older
776 * contents.
776 */ 777 */
777 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); 778 write_dirty_buffer(bh, WRITE_SYNC_PLUG);
778 779
779 /* 780 /*
780 * Kick off IO for the previous mapping. Note 781 * Kick off IO for the previous mapping. Note
@@ -904,7 +905,6 @@ try_again:
904 905
905 bh->b_state = 0; 906 bh->b_state = 0;
906 atomic_set(&bh->b_count, 0); 907 atomic_set(&bh->b_count, 0);
907 bh->b_private = NULL;
908 bh->b_size = size; 908 bh->b_size = size;
909 909
910 /* Link the buffer to its page */ 910 /* Link the buffer to its page */
@@ -1705,7 +1705,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1705 * and kswapd activity, but those code paths have their own 1705 * and kswapd activity, but those code paths have their own
1706 * higher-level throttling. 1706 * higher-level throttling.
1707 */ 1707 */
1708 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1708 if (wbc->sync_mode != WB_SYNC_NONE) {
1709 lock_buffer(bh); 1709 lock_buffer(bh);
1710 } else if (!trylock_buffer(bh)) { 1710 } else if (!trylock_buffer(bh)) {
1711 redirty_page_for_writepage(wbc, page); 1711 redirty_page_for_writepage(wbc, page);
@@ -1833,9 +1833,12 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1833} 1833}
1834EXPORT_SYMBOL(page_zero_new_buffers); 1834EXPORT_SYMBOL(page_zero_new_buffers);
1835 1835
1836static int __block_prepare_write(struct inode *inode, struct page *page, 1836int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1837 unsigned from, unsigned to, get_block_t *get_block) 1837 get_block_t *get_block)
1838{ 1838{
1839 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1840 unsigned to = from + len;
1841 struct inode *inode = page->mapping->host;
1839 unsigned block_start, block_end; 1842 unsigned block_start, block_end;
1840 sector_t block; 1843 sector_t block;
1841 int err = 0; 1844 int err = 0;
@@ -1908,10 +1911,13 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
1908 if (!buffer_uptodate(*wait_bh)) 1911 if (!buffer_uptodate(*wait_bh))
1909 err = -EIO; 1912 err = -EIO;
1910 } 1913 }
1911 if (unlikely(err)) 1914 if (unlikely(err)) {
1912 page_zero_new_buffers(page, from, to); 1915 page_zero_new_buffers(page, from, to);
1916 ClearPageUptodate(page);
1917 }
1913 return err; 1918 return err;
1914} 1919}
1920EXPORT_SYMBOL(__block_write_begin);
1915 1921
1916static int __block_commit_write(struct inode *inode, struct page *page, 1922static int __block_commit_write(struct inode *inode, struct page *page,
1917 unsigned from, unsigned to) 1923 unsigned from, unsigned to)
@@ -1949,90 +1955,32 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1949} 1955}
1950 1956
1951/* 1957/*
1952 * Filesystems implementing the new truncate sequence should use the 1958 * block_write_begin takes care of the basic task of block allocation and
1953 * _newtrunc postfix variant which won't incorrectly call vmtruncate. 1959 * bringing partial write blocks uptodate first.
1960 *
1954 * The filesystem needs to handle block truncation upon failure. 1961 * The filesystem needs to handle block truncation upon failure.
1955 */ 1962 */
1956int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, 1963int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1957 loff_t pos, unsigned len, unsigned flags, 1964 unsigned flags, struct page **pagep, get_block_t *get_block)
1958 struct page **pagep, void **fsdata,
1959 get_block_t *get_block)
1960{ 1965{
1961 struct inode *inode = mapping->host; 1966 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1962 int status = 0;
1963 struct page *page; 1967 struct page *page;
1964 pgoff_t index; 1968 int status;
1965 unsigned start, end;
1966 int ownpage = 0;
1967 1969
1968 index = pos >> PAGE_CACHE_SHIFT; 1970 page = grab_cache_page_write_begin(mapping, index, flags);
1969 start = pos & (PAGE_CACHE_SIZE - 1); 1971 if (!page)
1970 end = start + len; 1972 return -ENOMEM;
1971
1972 page = *pagep;
1973 if (page == NULL) {
1974 ownpage = 1;
1975 page = grab_cache_page_write_begin(mapping, index, flags);
1976 if (!page) {
1977 status = -ENOMEM;
1978 goto out;
1979 }
1980 *pagep = page;
1981 } else
1982 BUG_ON(!PageLocked(page));
1983 1973
1984 status = __block_prepare_write(inode, page, start, end, get_block); 1974 status = __block_write_begin(page, pos, len, get_block);
1985 if (unlikely(status)) { 1975 if (unlikely(status)) {
1986 ClearPageUptodate(page); 1976 unlock_page(page);
1987 1977 page_cache_release(page);
1988 if (ownpage) { 1978 page = NULL;
1989 unlock_page(page);
1990 page_cache_release(page);
1991 *pagep = NULL;
1992 }
1993 } 1979 }
1994 1980
1995out: 1981 *pagep = page;
1996 return status; 1982 return status;
1997} 1983}
1998EXPORT_SYMBOL(block_write_begin_newtrunc);
1999
2000/*
2001 * block_write_begin takes care of the basic task of block allocation and
2002 * bringing partial write blocks uptodate first.
2003 *
2004 * If *pagep is not NULL, then block_write_begin uses the locked page
2005 * at *pagep rather than allocating its own. In this case, the page will
2006 * not be unlocked or deallocated on failure.
2007 */
2008int block_write_begin(struct file *file, struct address_space *mapping,
2009 loff_t pos, unsigned len, unsigned flags,
2010 struct page **pagep, void **fsdata,
2011 get_block_t *get_block)
2012{
2013 int ret;
2014
2015 ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
2016 pagep, fsdata, get_block);
2017
2018 /*
2019 * prepare_write() may have instantiated a few blocks
2020 * outside i_size. Trim these off again. Don't need
2021 * i_size_read because we hold i_mutex.
2022 *
2023 * Filesystems which pass down their own page also cannot
2024 * call into vmtruncate here because it would lead to lock
2025 * inversion problems (*pagep is locked). This is a further
2026 * example of where the old truncate sequence is inadequate.
2027 */
2028 if (unlikely(ret) && *pagep == NULL) {
2029 loff_t isize = mapping->host->i_size;
2030 if (pos + len > isize)
2031 vmtruncate(mapping->host, isize);
2032 }
2033
2034 return ret;
2035}
2036EXPORT_SYMBOL(block_write_begin); 1984EXPORT_SYMBOL(block_write_begin);
2037 1985
2038int block_write_end(struct file *file, struct address_space *mapping, 1986int block_write_end(struct file *file, struct address_space *mapping,
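With the _newtrunc transition complete, block_write_begin() drops both the vmtruncate() call and the bring-your-own-page mode, and __block_write_begin() becomes the exported core for callers that already hold a locked page. A sketch of a filesystem ->write_begin built on the new helper; myfs_write_begin and myfs_get_block are illustrative names, and the failure handling shows the convention that now falls on the caller rather than the helper:

	static int myfs_write_begin(struct file *file,
				    struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned flags,
				    struct page **pagep, void **fsdata)
	{
		int ret;

		ret = block_write_begin(mapping, pos, len, flags, pagep,
					myfs_get_block);
		if (unlikely(ret)) {
			/* The helper no longer truncates for us: trim any
			 * blocks instantiated beyond i_size on failure. */
			loff_t isize = mapping->host->i_size;

			if (pos + len > isize)
				vmtruncate(mapping->host, isize);
		}
		return ret;
	}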
@@ -2351,7 +2299,7 @@ out:
2351 * For moronic filesystems that do not allow holes in file. 2299 * For moronic filesystems that do not allow holes in file.
2352 * We may have to extend the file. 2300 * We may have to extend the file.
2353 */ 2301 */
2354int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, 2302int cont_write_begin(struct file *file, struct address_space *mapping,
2355 loff_t pos, unsigned len, unsigned flags, 2303 loff_t pos, unsigned len, unsigned flags,
2356 struct page **pagep, void **fsdata, 2304 struct page **pagep, void **fsdata,
2357 get_block_t *get_block, loff_t *bytes) 2305 get_block_t *get_block, loff_t *bytes)
@@ -2363,7 +2311,7 @@ int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2363 2311
2364 err = cont_expand_zero(file, mapping, pos, bytes); 2312 err = cont_expand_zero(file, mapping, pos, bytes);
2365 if (err) 2313 if (err)
2366 goto out; 2314 return err;
2367 2315
2368 zerofrom = *bytes & ~PAGE_CACHE_MASK; 2316 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2369 if (pos+len > *bytes && zerofrom & (blocksize-1)) { 2317 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
@@ -2371,44 +2319,10 @@ int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2371 (*bytes)++; 2319 (*bytes)++;
2372 } 2320 }
2373 2321
2374 *pagep = NULL; 2322 return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2375 err = block_write_begin_newtrunc(file, mapping, pos, len,
2376 flags, pagep, fsdata, get_block);
2377out:
2378 return err;
2379}
2380EXPORT_SYMBOL(cont_write_begin_newtrunc);
2381
2382int cont_write_begin(struct file *file, struct address_space *mapping,
2383 loff_t pos, unsigned len, unsigned flags,
2384 struct page **pagep, void **fsdata,
2385 get_block_t *get_block, loff_t *bytes)
2386{
2387 int ret;
2388
2389 ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
2390 pagep, fsdata, get_block, bytes);
2391 if (unlikely(ret)) {
2392 loff_t isize = mapping->host->i_size;
2393 if (pos + len > isize)
2394 vmtruncate(mapping->host, isize);
2395 }
2396
2397 return ret;
2398} 2323}
2399EXPORT_SYMBOL(cont_write_begin); 2324EXPORT_SYMBOL(cont_write_begin);
2400 2325
2401int block_prepare_write(struct page *page, unsigned from, unsigned to,
2402 get_block_t *get_block)
2403{
2404 struct inode *inode = page->mapping->host;
2405 int err = __block_prepare_write(inode, page, from, to, get_block);
2406 if (err)
2407 ClearPageUptodate(page);
2408 return err;
2409}
2410EXPORT_SYMBOL(block_prepare_write);
2411
2412int block_commit_write(struct page *page, unsigned from, unsigned to) 2326int block_commit_write(struct page *page, unsigned from, unsigned to)
2413{ 2327{
2414 struct inode *inode = page->mapping->host; 2328 struct inode *inode = page->mapping->host;
@@ -2457,7 +2371,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2457 else 2371 else
2458 end = PAGE_CACHE_SIZE; 2372 end = PAGE_CACHE_SIZE;
2459 2373
2460 ret = block_prepare_write(page, 0, end, get_block); 2374 ret = __block_write_begin(page, 0, end, get_block);
2461 if (!ret) 2375 if (!ret)
2462 ret = block_commit_write(page, 0, end); 2376 ret = block_commit_write(page, 0, end);
2463 2377
@@ -2510,11 +2424,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2510} 2424}
2511 2425
2512/* 2426/*
2513 * Filesystems implementing the new truncate sequence should use the 2427 * On entry, the page is fully not uptodate.
2514 * _newtrunc postfix variant which won't incorrectly call vmtruncate. 2428 * On exit the page is fully uptodate in the areas outside (from,to)
2515 * The filesystem needs to handle block truncation upon failure. 2429 * The filesystem needs to handle block truncation upon failure.
2516 */ 2430 */
2517int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, 2431int nobh_write_begin(struct address_space *mapping,
2518 loff_t pos, unsigned len, unsigned flags, 2432 loff_t pos, unsigned len, unsigned flags,
2519 struct page **pagep, void **fsdata, 2433 struct page **pagep, void **fsdata,
2520 get_block_t *get_block) 2434 get_block_t *get_block)
@@ -2544,11 +2458,10 @@ int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2544 *fsdata = NULL; 2458 *fsdata = NULL;
2545 2459
2546 if (page_has_buffers(page)) { 2460 if (page_has_buffers(page)) {
2547 unlock_page(page); 2461 ret = __block_write_begin(page, pos, len, get_block);
2548 page_cache_release(page); 2462 if (unlikely(ret))
2549 *pagep = NULL; 2463 goto out_release;
2550 return block_write_begin_newtrunc(file, mapping, pos, len, 2464 return ret;
2551 flags, pagep, fsdata, get_block);
2552 } 2465 }
2553 2466
2554 if (PageMappedToDisk(page)) 2467 if (PageMappedToDisk(page))
@@ -2654,35 +2567,6 @@ out_release:
2654 2567
2655 return ret; 2568 return ret;
2656} 2569}
2657EXPORT_SYMBOL(nobh_write_begin_newtrunc);
2658
2659/*
2660 * On entry, the page is fully not uptodate.
2661 * On exit the page is fully uptodate in the areas outside (from,to)
2662 */
2663int nobh_write_begin(struct file *file, struct address_space *mapping,
2664 loff_t pos, unsigned len, unsigned flags,
2665 struct page **pagep, void **fsdata,
2666 get_block_t *get_block)
2667{
2668 int ret;
2669
2670 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
2671 pagep, fsdata, get_block);
2672
2673 /*
2674 * prepare_write() may have instantiated a few blocks
2675 * outside i_size. Trim these off again. Don't need
2676 * i_size_read because we hold i_mutex.
2677 */
2678 if (unlikely(ret)) {
2679 loff_t isize = mapping->host->i_size;
2680 if (pos + len > isize)
2681 vmtruncate(mapping->host, isize);
2682 }
2683
2684 return ret;
2685}
2686EXPORT_SYMBOL(nobh_write_begin); 2570EXPORT_SYMBOL(nobh_write_begin);
2687 2571
2688int nobh_write_end(struct file *file, struct address_space *mapping, 2572int nobh_write_end(struct file *file, struct address_space *mapping,
@@ -2998,7 +2882,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2998 2882
2999 if (err == -EOPNOTSUPP) { 2883 if (err == -EOPNOTSUPP) {
3000 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2884 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
3001 set_bit(BH_Eopnotsupp, &bh->b_state);
3002 } 2885 }
3003 2886
3004 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) 2887 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3020,13 +2903,6 @@ int submit_bh(int rw, struct buffer_head * bh)
3020 BUG_ON(buffer_unwritten(bh)); 2903 BUG_ON(buffer_unwritten(bh));
3021 2904
3022 /* 2905 /*
3023 * Mask in barrier bit for a write (could be either a WRITE or a
3024 * WRITE_SYNC
3025 */
3026 if (buffer_ordered(bh) && (rw & WRITE))
3027 rw |= WRITE_BARRIER;
3028
3029 /*
3030 * Only clear out a write error when rewriting 2906 * Only clear out a write error when rewriting
3031 */ 2907 */
3032 if (test_set_buffer_req(bh) && (rw & WRITE)) 2908 if (test_set_buffer_req(bh) && (rw & WRITE))
@@ -3064,22 +2940,21 @@ EXPORT_SYMBOL(submit_bh);
3064 2940
3065/** 2941/**
3066 * ll_rw_block: low-level access to block devices (DEPRECATED) 2942 * ll_rw_block: low-level access to block devices (DEPRECATED)
3067 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) 2943 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3068 * @nr: number of &struct buffer_heads in the array 2944 * @nr: number of &struct buffer_heads in the array
3069 * @bhs: array of pointers to &struct buffer_head 2945 * @bhs: array of pointers to &struct buffer_head
3070 * 2946 *
3071 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and 2947 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3072 * requests an I/O operation on them, either a %READ or a %WRITE. The third 2948 * requests an I/O operation on them, either a %READ or a %WRITE. The third
3073 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers 2949 * %READA option is described in the documentation for generic_make_request()
3074 * are sent to disk. The fourth %READA option is described in the documentation 2950 * which ll_rw_block() calls.
3075 * for generic_make_request() which ll_rw_block() calls.
3076 * 2951 *
3077 * This function drops any buffer that it cannot get a lock on (with the 2952 * This function drops any buffer that it cannot get a lock on (with the
3078 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be 2953 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3079 * clean when doing a write request, and any buffer that appears to be 2954 * request, and any buffer that appears to be up-to-date when doing a read
3080 * up-to-date when doing a read request. Further it marks as clean buffers that 2955 * request. Further it marks as clean buffers that are processed for
3081 * are processed for writing (the buffer cache won't assume that they are 2956 * writing (the buffer cache won't assume that they are actually clean
3082 * actually clean until the buffer gets unlocked). 2957 * until the buffer gets unlocked).
3083 * 2958 *
3084 * ll_rw_block sets b_end_io to simple completion handler that marks 2959 * ll_rw_block sets b_end_io to simple completion handler that marks
3085 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes 2960 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
@@ -3095,20 +2970,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3095 for (i = 0; i < nr; i++) { 2970 for (i = 0; i < nr; i++) {
3096 struct buffer_head *bh = bhs[i]; 2971 struct buffer_head *bh = bhs[i];
3097 2972
3098 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) 2973 if (!trylock_buffer(bh))
3099 lock_buffer(bh);
3100 else if (!trylock_buffer(bh))
3101 continue; 2974 continue;
3102 2975 if (rw == WRITE) {
3103 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3104 rw == SWRITE_SYNC_PLUG) {
3105 if (test_clear_buffer_dirty(bh)) { 2976 if (test_clear_buffer_dirty(bh)) {
3106 bh->b_end_io = end_buffer_write_sync; 2977 bh->b_end_io = end_buffer_write_sync;
3107 get_bh(bh); 2978 get_bh(bh);
3108 if (rw == SWRITE_SYNC) 2979 submit_bh(WRITE, bh);
3109 submit_bh(WRITE_SYNC, bh);
3110 else
3111 submit_bh(WRITE, bh);
3112 continue; 2980 continue;
3113 } 2981 }
3114 } else { 2982 } else {
@@ -3124,12 +2992,25 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3124} 2992}
3125EXPORT_SYMBOL(ll_rw_block); 2993EXPORT_SYMBOL(ll_rw_block);
3126 2994
2995void write_dirty_buffer(struct buffer_head *bh, int rw)
2996{
2997 lock_buffer(bh);
2998 if (!test_clear_buffer_dirty(bh)) {
2999 unlock_buffer(bh);
3000 return;
3001 }
3002 bh->b_end_io = end_buffer_write_sync;
3003 get_bh(bh);
3004 submit_bh(rw, bh);
3005}
3006EXPORT_SYMBOL(write_dirty_buffer);
3007
3127/* 3008/*
3128 * For a data-integrity writeout, we need to wait upon any in-progress I/O 3009 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3129 * and then start new I/O and then wait upon it. The caller must have a ref on 3010 * and then start new I/O and then wait upon it. The caller must have a ref on
3130 * the buffer_head. 3011 * the buffer_head.
3131 */ 3012 */
3132int sync_dirty_buffer(struct buffer_head *bh) 3013int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3133{ 3014{
3134 int ret = 0; 3015 int ret = 0;
3135 3016
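write_dirty_buffer(), added in the hunk above, replaces the SWRITE* modes just deleted from ll_rw_block(): it takes the buffer lock unconditionally (so the *current* contents are what reach the disk), writes only if the buffer is still dirty, and leaves the rw flags to the caller. A usage sketch mirroring the fsync_buffers_list() conversion earlier in this file:

	/* Before: ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); */
	write_dirty_buffer(bh, WRITE_SYNC_PLUG);

	/* Completion is still the caller's business: */
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh))
		err = -EIO;	/* 'err' assumed declared by the caller */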
@@ -3138,12 +3019,8 @@ int sync_dirty_buffer(struct buffer_head *bh)
3138 if (test_clear_buffer_dirty(bh)) { 3019 if (test_clear_buffer_dirty(bh)) {
3139 get_bh(bh); 3020 get_bh(bh);
3140 bh->b_end_io = end_buffer_write_sync; 3021 bh->b_end_io = end_buffer_write_sync;
3141 ret = submit_bh(WRITE_SYNC, bh); 3022 ret = submit_bh(rw, bh);
3142 wait_on_buffer(bh); 3023 wait_on_buffer(bh);
3143 if (buffer_eopnotsupp(bh)) {
3144 clear_buffer_eopnotsupp(bh);
3145 ret = -EOPNOTSUPP;
3146 }
3147 if (!ret && !buffer_uptodate(bh)) 3024 if (!ret && !buffer_uptodate(bh))
3148 ret = -EIO; 3025 ret = -EIO;
3149 } else { 3026 } else {
@@ -3151,6 +3028,12 @@ int sync_dirty_buffer(struct buffer_head *bh)
3151 } 3028 }
3152 return ret; 3029 return ret;
3153} 3030}
3031EXPORT_SYMBOL(__sync_dirty_buffer);
3032
3033int sync_dirty_buffer(struct buffer_head *bh)
3034{
3035 return __sync_dirty_buffer(bh, WRITE_SYNC);
3036}
3154EXPORT_SYMBOL(sync_dirty_buffer); 3037EXPORT_SYMBOL(sync_dirty_buffer);
3155 3038
3156/* 3039/*
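sync_dirty_buffer() keeps its WRITE_SYNC behaviour but is now a one-line wrapper; __sync_dirty_buffer() exposes the rw argument for callers (journalling code, for instance) that want different flags. The -EOPNOTSUPP translation disappears along with the barrier machinery, so write failures now surface as plain -EIO. A usage sketch; the flag choice is illustrative:

	/* Synchronous write of a dirty metadata buffer without forcing
	 * a queue unplug per buffer: */
	int err = __sync_dirty_buffer(bh, WRITE_SYNC_PLUG);
	if (err)
		return err;	/* -EIO covers former -EOPNOTSUPP cases */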
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 2906077ac79..a2603e7c0bb 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -146,7 +146,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
146 goto error_unsupported; 146 goto error_unsupported;
147 147
148 /* get the cache size and blocksize */ 148 /* get the cache size and blocksize */
149 ret = vfs_statfs(root, &stats); 149 ret = vfs_statfs(&path, &stats);
150 if (ret < 0) 150 if (ret < 0)
151 goto error_unsupported; 151 goto error_unsupported;
152 152
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index c2413561ea7..0a1467b1551 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -55,6 +55,7 @@ const struct file_operations cachefiles_daemon_fops = {
55 .read = cachefiles_daemon_read, 55 .read = cachefiles_daemon_read,
56 .write = cachefiles_daemon_write, 56 .write = cachefiles_daemon_write,
57 .poll = cachefiles_daemon_poll, 57 .poll = cachefiles_daemon_poll,
58 .llseek = noop_llseek,
58}; 59};
59 60
60struct cachefiles_daemon_cmd { 61struct cachefiles_daemon_cmd {
@@ -552,8 +553,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
552 */ 553 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) 554static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{ 555{
555 struct fs_struct *fs; 556 struct path path;
556 struct dentry *dir;
557 const struct cred *saved_cred; 557 const struct cred *saved_cred;
558 int ret; 558 int ret;
559 559
@@ -573,24 +573,21 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
573 } 573 }
574 574
575 /* extract the directory dentry from the cwd */ 575 /* extract the directory dentry from the cwd */
576 fs = current->fs; 576 get_fs_pwd(current->fs, &path);
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580 577
581 if (!S_ISDIR(dir->d_inode->i_mode)) 578 if (!S_ISDIR(path.dentry->d_inode->i_mode))
582 goto notdir; 579 goto notdir;
583 580
584 cachefiles_begin_secure(cache, &saved_cred); 581 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args); 582 ret = cachefiles_cull(cache, path.dentry, args);
586 cachefiles_end_secure(cache, saved_cred); 583 cachefiles_end_secure(cache, saved_cred);
587 584
588 dput(dir); 585 path_put(&path);
589 _leave(" = %d", ret); 586 _leave(" = %d", ret);
590 return ret; 587 return ret;
591 588
592notdir: 589notdir:
593 dput(dir); 590 path_put(&path);
594 kerror("cull command requires dirfd to be a directory"); 591 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR; 592 return -ENOTDIR;
596 593
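This hunk (and the matching inuse hunk below) switches to get_fs_pwd() from <linux/fs_struct.h>, which snapshots the working directory as a struct path, taking the fs_struct lock and a reference internally, instead of open-coding the locking and pinning only the dentry. The matching release is path_put(). Sketch:

	struct path pwd;

	get_fs_pwd(current->fs, &pwd);	/* locked snapshot + reference */
	if (S_ISDIR(pwd.dentry->d_inode->i_mode))
		consume_dir(&pwd);	/* hypothetical consumer */
	path_put(&pwd);			/* drops both mnt and dentry refs */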
@@ -628,8 +625,7 @@ inval:
628 */ 625 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) 626static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{ 627{
631 struct fs_struct *fs; 628 struct path path;
632 struct dentry *dir;
633 const struct cred *saved_cred; 629 const struct cred *saved_cred;
634 int ret; 630 int ret;
635 631
@@ -649,24 +645,21 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
649 } 645 }
650 646
651 /* extract the directory dentry from the cwd */ 647 /* extract the directory dentry from the cwd */
652 fs = current->fs; 648 get_fs_pwd(current->fs, &path);
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656 649
657 if (!S_ISDIR(dir->d_inode->i_mode)) 650 if (!S_ISDIR(path.dentry->d_inode->i_mode))
658 goto notdir; 651 goto notdir;
659 652
660 cachefiles_begin_secure(cache, &saved_cred); 653 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args); 654 ret = cachefiles_check_in_use(cache, path.dentry, args);
662 cachefiles_end_secure(cache, saved_cred); 655 cachefiles_end_secure(cache, saved_cred);
663 656
664 dput(dir); 657 path_put(&path);
665 //_leave(" = %d", ret); 658 //_leave(" = %d", ret);
666 return ret; 659 return ret;
667 660
668notdir: 661notdir:
669 dput(dir); 662 path_put(&path);
670 kerror("inuse command requires dirfd to be a directory"); 663 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR; 664 return -ENOTDIR;
672 665
@@ -683,6 +676,10 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
683 unsigned fnr, unsigned bnr) 676 unsigned fnr, unsigned bnr)
684{ 677{
685 struct kstatfs stats; 678 struct kstatfs stats;
679 struct path path = {
680 .mnt = cache->mnt,
681 .dentry = cache->mnt->mnt_root,
682 };
686 int ret; 683 int ret;
687 684
688 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", 685 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
@@ -697,7 +694,7 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
697 /* find out how many pages of blockdev are available */ 694 /* find out how many pages of blockdev are available */
698 memset(&stats, 0, sizeof(stats)); 695 memset(&stats, 0, sizeof(stats));
699 696
700 ret = vfs_statfs(cache->mnt->mnt_root, &stats); 697 ret = vfs_statfs(&path, &stats);
701 if (ret < 0) { 698 if (ret < 0) {
702 if (ret == -EIO) 699 if (ret == -EIO)
703 cachefiles_io_error(cache, "statfs failed"); 700 cachefiles_io_error(cache, "statfs failed");
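vfs_statfs() now takes a struct path, a (vfsmount, dentry) pair, rather than a bare dentry, so statfs can see the mount as well as the filesystem; callers holding only a vfsmount build the path on the stack exactly as the hunk above does. Sketch, assuming 'mnt' is a held vfsmount:

	struct kstatfs stats;
	struct path path = {
		.mnt	= mnt,
		.dentry	= mnt->mnt_root,
	};
	int ret = vfs_statfs(&path, &stats);

	if (ret == 0)
		pr_info("blocks available: %llu\n",
			(unsigned long long)stats.f_bavail);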
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index a8cd821226d..bd6bc1bde2d 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -267,13 +267,6 @@ do { \
267#define dbgprintk(FMT, ...) \ 267#define dbgprintk(FMT, ...) \
268 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) 268 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
269 269
270/* make sure we maintain the format strings, even when debugging is disabled */
271static inline void _dbprintk(const char *fmt, ...)
272 __attribute__((format(printf, 1, 2)));
273static inline void _dbprintk(const char *fmt, ...)
274{
275}
276
277#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 270#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
278#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 271#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
279#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) 272#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
@@ -304,9 +297,9 @@ do { \
304} while (0) 297} while (0)
305 298
306#else 299#else
307#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 300#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
308#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 301#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
309#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) 302#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
310#endif 303#endif
311 304
312#if 1 /* defined(__KDEBUGALL) */ 305#if 1 /* defined(__KDEBUGALL) */
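Swapping the hand-rolled _dbprintk() stub for no_printk() (from <linux/kernel.h>) keeps compile-time printf-format checking on the disabled debug macros while generating no object code. The pattern, sketched with an illustrative guard (the real guards in internal.h are cachefiles' own debug symbols):

	#if defined(MY_DEBUG)	/* illustrative config symbol */
	#define _debug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
	#else
	#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
	#endif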
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f4a7840bf42..42c7fafc8bf 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
37 37
38 printk(KERN_ERR "%sobject: OBJ%x\n", 38 printk(KERN_ERR "%sobject: OBJ%x\n",
39 prefix, object->fscache.debug_id); 39 prefix, object->fscache.debug_id);
40 printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", 40 printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
41 prefix, fscache_object_states[object->fscache.state], 41 prefix, fscache_object_states[object->fscache.state],
42 object->fscache.flags, object->fscache.work.flags, 42 object->fscache.flags, work_busy(&object->fscache.work),
43 object->fscache.events, 43 object->fscache.events,
44 object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); 44 object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
45 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", 45 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -212,7 +212,7 @@ wait_for_old_object:
212 212
213 /* if the object we're waiting for is queued for processing, 213 /* if the object we're waiting for is queued for processing,
214 * then just put ourselves on the queue behind it */ 214 * then just put ourselves on the queue behind it */
215 if (slow_work_is_queued(&xobject->fscache.work)) { 215 if (work_pending(&xobject->fscache.work)) {
216 _debug("queue OBJ%x behind OBJ%x immediately", 216 _debug("queue OBJ%x behind OBJ%x immediately",
217 object->fscache.debug_id, 217 object->fscache.debug_id,
218 xobject->fscache.debug_id); 218 xobject->fscache.debug_id);
@@ -220,8 +220,7 @@ wait_for_old_object:
220 } 220 }
221 221
222 /* otherwise we sleep until either the object we're waiting for 222 /* otherwise we sleep until either the object we're waiting for
223 * is done, or the slow-work facility wants the thread back to 223 * is done, or the fscache_object is congested */
224 * do other work */
225 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); 224 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
226 init_wait(&wait); 225 init_wait(&wait);
227 requeue = false; 226 requeue = false;
@@ -229,8 +228,8 @@ wait_for_old_object:
229 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); 228 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
230 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) 229 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
231 break; 230 break;
232 requeue = slow_work_sleep_till_thread_needed( 231
233 &object->fscache.work, &timeout); 232 requeue = fscache_object_sleep_till_congested(&timeout);
234 } while (timeout > 0 && !requeue); 233 } while (timeout > 0 && !requeue);
235 finish_wait(wq, &wait); 234 finish_wait(wq, &wait);
236 235
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 0f0d41fbb03..0e3c0924cc3 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
422 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; 422 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
423 423
424 op->op.flags &= FSCACHE_OP_KEEP_FLAGS; 424 op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
425 op->op.flags |= FSCACHE_OP_FAST; 425 op->op.flags |= FSCACHE_OP_ASYNC;
426 op->op.processor = cachefiles_read_copier; 426 op->op.processor = cachefiles_read_copier;
427 427
428 pagevec_init(&pagevec, 0); 428 pagevec_init(&pagevec, 0);
@@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
729 pagevec_init(&pagevec, 0); 729 pagevec_init(&pagevec, 0);
730 730
731 op->op.flags &= FSCACHE_OP_KEEP_FLAGS; 731 op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
732 op->op.flags |= FSCACHE_OP_FAST; 732 op->op.flags |= FSCACHE_OP_ASYNC;
733 op->op.processor = cachefiles_read_copier; 733 op->op.processor = cachefiles_read_copier;
734 734
735 INIT_LIST_HEAD(&backpages); 735 INIT_LIST_HEAD(&backpages);
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index bc87b9c1d27..9eb134ea6eb 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,8 +1,11 @@
1config CEPH_FS 1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)" 2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select CEPH_LIB
4 select LIBCRC32C 5 select LIBCRC32C
5 select CRYPTO_AES 6 select CRYPTO_AES
7 select CRYPTO
8 default n
6 help 9 help
7 Choose Y or M here to include support for mounting the 10 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely 11 experimental Ceph distributed file system. Ceph is an extremely
@@ -13,15 +16,3 @@ config CEPH_FS
13 16
14 If unsure, say N. 17 If unsure, say N.
15 18
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
22 line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
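The new select CEPH_LIB points at the shared core that this series splits out of fs/ceph (the messenger, mon/osd clients, auth, and CRUSH code, as the Makefile hunk below shows). A plausible shape for such a library entry, based on the dependencies the filesystem itself selects; illustrative, not quoted from the actual net/ceph Kconfig:

	config CEPH_LIB
		tristate "Ceph core library"
		depends on INET
		select LIBCRC32C
		select CRYPTO_AES
		select CRYPTO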
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 6a660e610be..9e6c4f2e8ff 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,17 +6,10 @@ ifneq ($(KERNELRELEASE),)
6 6
7obj-$(CONFIG_CEPH_FS) += ceph.o 7obj-$(CONFIG_CEPH_FS) += ceph.o
8 8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ 9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 11 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 mds_client.o mdsmap.o \ 12 debugfs.o
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20 13
21else 14else
22#Otherwise we were called directly from the command 15#Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c..00000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d9c60b84949..e9c874abc9e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -10,7 +10,8 @@
10#include <linux/task_io_accounting_ops.h> 10#include <linux/task_io_accounting_ops.h>
11 11
12#include "super.h" 12#include "super.h"
13#include "osd_client.h" 13#include "mds_client.h"
14#include <linux/ceph/osd_client.h>
14 15
15/* 16/*
16 * Ceph address space ops. 17 * Ceph address space ops.
@@ -87,7 +88,7 @@ static int ceph_set_page_dirty(struct page *page)
87 88
88 /* dirty the head */ 89 /* dirty the head */
89 spin_lock(&inode->i_lock); 90 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0) 91 if (ci->i_head_snapc == NULL)
91 ci->i_head_snapc = ceph_get_snap_context(snapc); 92 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head; 93 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0) 94 if (ci->i_wrbuffer_ref == 0)
@@ -105,13 +106,7 @@ static int ceph_set_page_dirty(struct page *page)
105 spin_lock_irq(&mapping->tree_lock); 106 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */ 107 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page)); 108 WARN_ON_ONCE(!PageUptodate(page));
108 109 account_page_dirtied(page, page->mapping);
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree, 110 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY); 111 page_index(page), PAGECACHE_TAG_DIRTY);
117 112
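The open-coded dirty-page accounting above (zone counter, bdi stat, and task I/O accounting, all guarded by mapping_cap_account_dirty()) collapses into account_page_dirtied() from mm/page-writeback.c, which performs the same checks and updates. As in the hunk, it runs under the mapping's tree_lock with interrupts disabled:

	spin_lock_irq(&mapping->tree_lock);
	if (page->mapping) {	/* not truncated under us */
		WARN_ON_ONCE(!PageUptodate(page));
		account_page_dirtied(page, page->mapping);
		radix_tree_tag_set(&mapping->page_tree,
				   page_index(page), PAGECACHE_TAG_DIRTY);
	}
	spin_unlock_irq(&mapping->tree_lock);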
@@ -199,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
199{ 194{
200 struct inode *inode = filp->f_dentry->d_inode; 195 struct inode *inode = filp->f_dentry->d_inode;
201 struct ceph_inode_info *ci = ceph_inode(inode); 196 struct ceph_inode_info *ci = ceph_inode(inode);
202 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 197 struct ceph_osd_client *osdc =
198 &ceph_inode_to_client(inode)->client->osdc;
203 int err = 0; 199 int err = 0;
204 u64 len = PAGE_CACHE_SIZE; 200 u64 len = PAGE_CACHE_SIZE;
205 201
@@ -271,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
271{ 267{
272 struct inode *inode = file->f_dentry->d_inode; 268 struct inode *inode = file->f_dentry->d_inode;
273 struct ceph_inode_info *ci = ceph_inode(inode); 269 struct ceph_inode_info *ci = ceph_inode(inode);
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 270 struct ceph_osd_client *osdc =
271 &ceph_inode_to_client(inode)->client->osdc;
275 int rc = 0; 272 int rc = 0;
276 struct page **pages; 273 struct page **pages;
277 loff_t offset; 274 loff_t offset;
@@ -309,7 +306,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
309 zero_user_segment(page, s, PAGE_CACHE_SIZE); 306 zero_user_segment(page, s, PAGE_CACHE_SIZE);
310 } 307 }
311 308
312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { 309 if (add_to_page_cache_lru(page, mapping, page->index,
310 GFP_NOFS)) {
313 page_cache_release(page); 311 page_cache_release(page);
314 dout("readpages %p add_to_page_cache failed %p\n", 312 dout("readpages %p add_to_page_cache failed %p\n",
315 inode, page); 313 inode, page);
@@ -351,7 +349,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
351 break; 349 break;
352 } 350 }
353 } 351 }
354 if (!snapc && ci->i_head_snapc) { 352 if (!snapc && ci->i_wrbuffer_ref_head) {
355 snapc = ceph_get_snap_context(ci->i_head_snapc); 353 snapc = ceph_get_snap_context(ci->i_head_snapc);
356 dout(" head snapc %p has %d dirty pages\n", 354 dout(" head snapc %p has %d dirty pages\n",
357 snapc, ci->i_wrbuffer_ref_head); 355 snapc, ci->i_wrbuffer_ref_head);
@@ -370,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
370{ 368{
371 struct inode *inode; 369 struct inode *inode;
372 struct ceph_inode_info *ci; 370 struct ceph_inode_info *ci;
373 struct ceph_client *client; 371 struct ceph_fs_client *fsc;
374 struct ceph_osd_client *osdc; 372 struct ceph_osd_client *osdc;
375 loff_t page_off = page->index << PAGE_CACHE_SHIFT; 373 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
376 int len = PAGE_CACHE_SIZE; 374 int len = PAGE_CACHE_SIZE;
@@ -388,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
388 } 386 }
389 inode = page->mapping->host; 387 inode = page->mapping->host;
390 ci = ceph_inode(inode); 388 ci = ceph_inode(inode);
391 client = ceph_inode_to_client(inode); 389 fsc = ceph_inode_to_client(inode);
392 osdc = &client->osdc; 390 osdc = &fsc->client->osdc;
393 391
394 /* verify this is a writeable snap context */ 392 /* verify this is a writeable snap context */
395 snapc = (void *)page->private; 393 snapc = (void *)page->private;
@@ -416,13 +414,13 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
416 if (i_size < page_off + len) 414 if (i_size < page_off + len)
417 len = i_size - page_off; 415 len = i_size - page_off;
418 416
419 dout("writepage %p page %p index %lu on %llu~%u\n", 417 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
420 inode, page, page->index, page_off, len); 418 inode, page, page->index, page_off, len, snapc);
421 419
422 writeback_stat = atomic_long_inc_return(&client->writeback_count); 420 writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
423 if (writeback_stat > 421 if (writeback_stat >
424 CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) 422 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
425 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 423 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
426 424
427 set_page_writeback(page); 425 set_page_writeback(page);
428 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 426 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -501,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
501 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
502 __s32 rc = -EIO; 500 __s32 rc = -EIO;
503 u64 bytes = 0; 501 u64 bytes = 0;
504 struct ceph_client *client = ceph_inode_to_client(inode); 502 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
505 long writeback_stat; 503 long writeback_stat;
506 unsigned issued = ceph_caps_issued(ci); 504 unsigned issued = ceph_caps_issued(ci);
507 505
@@ -534,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
534 WARN_ON(!PageUptodate(page)); 532 WARN_ON(!PageUptodate(page));
535 533
536 writeback_stat = 534 writeback_stat =
537 atomic_long_dec_return(&client->writeback_count); 535 atomic_long_dec_return(&fsc->writeback_count);
538 if (writeback_stat < 536 if (writeback_stat <
539 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) 537 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
540 clear_bdi_congested(&client->backing_dev_info, 538 clear_bdi_congested(&fsc->backing_dev_info,
541 BLK_RW_ASYNC); 539 BLK_RW_ASYNC);
542 540
543 ceph_put_snap_context((void *)page->private); 541 ceph_put_snap_context((void *)page->private);
@@ -552,7 +550,7 @@ static void writepages_finish(struct ceph_osd_request *req,
552 * page truncation thread, possibly losing some data that 550 * page truncation thread, possibly losing some data that
553 * raced its way in 551 * raced its way in
554 */ 552 */
555 if ((issued & CEPH_CAP_FILE_CACHE) == 0) 553 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
556 generic_error_remove_page(inode->i_mapping, page); 554 generic_error_remove_page(inode->i_mapping, page);
557 555
558 unlock_page(page); 556 unlock_page(page);
@@ -574,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
574 * mempool. we avoid the mempool if we can because req->r_num_pages 572 * mempool. we avoid the mempool if we can because req->r_num_pages
575 * may be less than the maximum write size. 573 * may be less than the maximum write size.
576 */ 574 */
577static void alloc_page_vec(struct ceph_client *client, 575static void alloc_page_vec(struct ceph_fs_client *fsc,
578 struct ceph_osd_request *req) 576 struct ceph_osd_request *req)
579{ 577{
580 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, 578 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
581 GFP_NOFS); 579 GFP_NOFS);
582 if (!req->r_pages) { 580 if (!req->r_pages) {
583 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); 581 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
584 req->r_pages_from_pool = 1; 582 req->r_pages_from_pool = 1;
585 WARN_ON(!req->r_pages); 583 WARN_ON(!req->r_pages);
586 } 584 }
@@ -593,9 +591,8 @@ static int ceph_writepages_start(struct address_space *mapping,
593 struct writeback_control *wbc) 591 struct writeback_control *wbc)
594{ 592{
595 struct inode *inode = mapping->host; 593 struct inode *inode = mapping->host;
596 struct backing_dev_info *bdi = mapping->backing_dev_info;
597 struct ceph_inode_info *ci = ceph_inode(inode); 594 struct ceph_inode_info *ci = ceph_inode(inode);
598 struct ceph_client *client; 595 struct ceph_fs_client *fsc;
599 pgoff_t index, start, end; 596 pgoff_t index, start, end;
600 int range_whole = 0; 597 int range_whole = 0;
601 int should_loop = 1; 598 int should_loop = 1;
@@ -622,26 +619,19 @@ static int ceph_writepages_start(struct address_space *mapping,
622 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 619 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
623 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 620 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
624 621
625 client = ceph_inode_to_client(inode); 622 fsc = ceph_inode_to_client(inode);
626 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { 623 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
627 pr_warning("writepage_start %p on forced umount\n", inode); 624 pr_warning("writepage_start %p on forced umount\n", inode);
628 return -EIO; /* we're in a forced umount, don't write! */ 625 return -EIO; /* we're in a forced umount, don't write! */
629 } 626 }
630 if (client->mount_args->wsize && client->mount_args->wsize < wsize) 627 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
631 wsize = client->mount_args->wsize; 628 wsize = fsc->mount_options->wsize;
632 if (wsize < PAGE_CACHE_SIZE) 629 if (wsize < PAGE_CACHE_SIZE)
633 wsize = PAGE_CACHE_SIZE; 630 wsize = PAGE_CACHE_SIZE;
634 max_pages_ever = wsize >> PAGE_CACHE_SHIFT; 631 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
635 632
636 pagevec_init(&pvec, 0); 633 pagevec_init(&pvec, 0);
637 634
638 /* ?? */
639 if (wbc->nonblocking && bdi_write_congested(bdi)) {
640 dout(" writepages congested\n");
641 wbc->encountered_congestion = 1;
642 goto out_final;
643 }
644
645 /* where to start/end? */ 635 /* where to start/end? */
646 if (wbc->range_cyclic) { 636 if (wbc->range_cyclic) {
647 start = mapping->writeback_index; /* Start from prev offset */ 637 start = mapping->writeback_index; /* Start from prev offset */
@@ -771,9 +761,10 @@ get_more_pages:
771 /* ok */ 761 /* ok */
772 if (locked_pages == 0) { 762 if (locked_pages == 0) {
773 /* prepare async write request */ 763 /* prepare async write request */
774 offset = page->index << PAGE_CACHE_SHIFT; 764 offset = (unsigned long long)page->index
765 << PAGE_CACHE_SHIFT;
775 len = wsize; 766 len = wsize;
776 req = ceph_osdc_new_request(&client->osdc, 767 req = ceph_osdc_new_request(&fsc->client->osdc,
777 &ci->i_layout, 768 &ci->i_layout,
778 ceph_vino(inode), 769 ceph_vino(inode),
779 offset, &len, 770 offset, &len,
@@ -786,7 +777,7 @@ get_more_pages:
786 &inode->i_mtime, true, 1); 777 &inode->i_mtime, true, 1);
787 max_pages = req->r_num_pages; 778 max_pages = req->r_num_pages;
788 779
789 alloc_page_vec(client, req); 780 alloc_page_vec(fsc, req);
790 req->r_callback = writepages_finish; 781 req->r_callback = writepages_finish;
791 req->r_inode = inode; 782 req->r_inode = inode;
792 } 783 }
@@ -797,9 +788,12 @@ get_more_pages:
797 dout("%p will write page %p idx %lu\n", 788 dout("%p will write page %p idx %lu\n",
798 inode, page, page->index); 789 inode, page, page->index);
799 790
800 writeback_stat = atomic_long_inc_return(&client->writeback_count); 791 writeback_stat =
801 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { 792 atomic_long_inc_return(&fsc->writeback_count);
802 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 793 if (writeback_stat > CONGESTION_ON_THRESH(
794 fsc->mount_options->congestion_kb)) {
795 set_bdi_congested(&fsc->backing_dev_info,
796 BLK_RW_ASYNC);
803 } 797 }
804 798
805 set_page_writeback(page); 799 set_page_writeback(page);
@@ -847,7 +841,7 @@ get_more_pages:
847 op->payload_len = cpu_to_le32(len); 841 op->payload_len = cpu_to_le32(len);
848 req->r_request->hdr.data_len = cpu_to_le32(len); 842 req->r_request->hdr.data_len = cpu_to_le32(len);
849 843
850 ceph_osdc_start_request(&client->osdc, req, true); 844 ceph_osdc_start_request(&fsc->client->osdc, req, true);
851 req = NULL; 845 req = NULL;
852 846
853 /* continue? */ 847 /* continue? */
@@ -883,7 +877,6 @@ out:
883 rc = 0; /* vfs expects us to return 0 */ 877 rc = 0; /* vfs expects us to return 0 */
884 ceph_put_snap_context(snapc); 878 ceph_put_snap_context(snapc);
885 dout("writepages done, rc = %d\n", rc); 879 dout("writepages done, rc = %d\n", rc);
886out_final:
887 return rc; 880 return rc;
888} 881}
889 882
@@ -916,7 +909,7 @@ static int ceph_update_writeable_page(struct file *file,
916{ 909{
917 struct inode *inode = file->f_dentry->d_inode; 910 struct inode *inode = file->f_dentry->d_inode;
918 struct ceph_inode_info *ci = ceph_inode(inode); 911 struct ceph_inode_info *ci = ceph_inode(inode);
919 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 912 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
920 loff_t page_off = pos & PAGE_CACHE_MASK; 913 loff_t page_off = pos & PAGE_CACHE_MASK;
921 int pos_in_page = pos & ~PAGE_CACHE_MASK; 914 int pos_in_page = pos & ~PAGE_CACHE_MASK;
922 int end_in_page = pos_in_page + len; 915 int end_in_page = pos_in_page + len;
@@ -1036,7 +1029,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1036 *pagep = page; 1029 *pagep = page;
1037 1030
1038 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1031 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1039 inode, page, (int)pos, (int)len); 1032 inode, page, (int)pos, (int)len);
1040 1033
1041 r = ceph_update_writeable_page(file, pos, len, page); 1034 r = ceph_update_writeable_page(file, pos, len, page);
1042 } while (r == -EAGAIN); 1035 } while (r == -EAGAIN);
@@ -1054,8 +1047,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1054 struct page *page, void *fsdata) 1047 struct page *page, void *fsdata)
1055{ 1048{
1056 struct inode *inode = file->f_dentry->d_inode; 1049 struct inode *inode = file->f_dentry->d_inode;
1057 struct ceph_client *client = ceph_inode_to_client(inode); 1050 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1058 struct ceph_mds_client *mdsc = &client->mdsc; 1051 struct ceph_mds_client *mdsc = fsc->mdsc;
1059 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1052 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1060 int check_cap = 0; 1053 int check_cap = 0;
1061 1054
@@ -1124,7 +1117,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1124{ 1117{
1125 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1118 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1126 struct page *page = vmf->page; 1119 struct page *page = vmf->page;
1127 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1120 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1128 loff_t off = page->index << PAGE_CACHE_SHIFT; 1121 loff_t off = page->index << PAGE_CACHE_SHIFT;
1129 loff_t size, len; 1122 loff_t size, len;
1130 int ret; 1123 int ret;
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
deleted file mode 100644
index 67b2c030924..00000000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,99 +0,0 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
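
For readers auditing the removal of armor.c, here is the 3-bytes-to-4-characters packing ceph_armor() implements, as a standalone userspace sketch. It uses the same 64-character alphabet but is an illustration only; unlike the kernel function it omits the newline emitted every 64 output characters and handles just one quad:

/* Minimal sketch of base64 armoring: pack up to 3 input bytes into
 * 4 output characters, padding the tail with '='. */
#include <stdio.h>

static const char pem_key[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

static void armor3(const unsigned char *s, int n, char out[5])
{
	out[0] = pem_key[s[0] >> 2];
	out[1] = pem_key[((s[0] & 3) << 4) | (n > 1 ? s[1] >> 4 : 0)];
	out[2] = n > 1 ? pem_key[((s[1] & 15) << 2) | (n > 2 ? s[2] >> 6 : 0)]
		       : '=';
	out[3] = n > 2 ? pem_key[s[2] & 63] : '=';
	out[4] = '\0';
}

int main(void)
{
	const unsigned char msg[] = "key";
	char quad[5];

	armor3(msg, 3, quad);
	printf("%s\n", quad);	/* prints "a2V5", the base64 of "key" */
	return 0;
}
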
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
deleted file mode 100644
index 89490beaf53..00000000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/slab.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with the monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
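
The shape of the handshake that ceph_auth_build_hello(), ceph_handle_auth_reply(), and ceph_build_auth_request() implement is: the hello lists every supported protocol, the monitor selects one, and handle_reply's -EAGAIN return drives further protocol-specific rounds until authentication completes. A toy standalone simulation of that loop (the monitor side and the two-round count are invented for illustration):

/* Fake monitor drives the same negotiate-then-retry loop that
 * -EAGAIN from handle_reply produces in the real code. */
#include <stdio.h>

#define EAGAIN 11

enum { AUTH_NONE = 1, AUTH_CEPHX = 2 };

static int monitor_reply(int round, int *protocol)
{
	if (round == 0) {
		*protocol = AUTH_CEPHX;	/* server picks a protocol */
		return -EAGAIN;		/* more handshake needed */
	}
	return 0;			/* authenticated */
}

int main(void)
{
	int protocol = 0, ret, round = 0;

	do {
		if (!protocol)
			printf("round %d: send hello, list protocols\n", round);
		else
			printf("round %d: send %s request\n", round,
			       protocol == AUTH_CEPHX ? "cephx" : "none");
		ret = monitor_reply(round++, &protocol);
	} while (ret == -EAGAIN);

	printf("authenticated: %s\n", ret == 0 ? "yes" : "no");
	return ret;
}
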
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
deleted file mode 100644
index d38a2fb4a13..00000000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 const char *name;
19
20 /*
21 * true if we are authenticated and can connect to
22 * services.
23 */
24 int (*is_authenticated)(struct ceph_auth_client *ac);
25
26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
33 * build requests and process replies during monitor
34 * handshake. if handle_reply returns -EAGAIN, we build
35 * another request.
36 */
37 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
38 int (*handle_reply)(struct ceph_auth_client *ac, int result,
39 void *buf, void *end);
40
41 /*
42 * Create authorizer for connecting to a service, and verify
43 * the response to authenticate the service.
44 */
45 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
46 struct ceph_authorizer **a,
47 void **buf, size_t *len,
48 void **reply_buf, size_t *reply_len);
49 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
50 struct ceph_authorizer *a, size_t len);
51 void (*destroy_authorizer)(struct ceph_auth_client *ac,
52 struct ceph_authorizer *a);
53 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
54 int peer_type);
55
56 /* reset when we (re)connect to a monitor */
57 void (*reset)(struct ceph_auth_client *ac);
58
59 void (*destroy)(struct ceph_auth_client *ac);
60};
61
62struct ceph_auth_client {
63 u32 protocol; /* CEPH_AUTH_* */
64 void *private; /* for use by protocol implementation */
65 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
66
67 bool negotiating; /* true if negotiating protocol */
68 const char *name; /* entity name */
69 u64 global_id; /* our unique id in system */
70 const char *secret; /* our secret key */
71 unsigned want_keys; /* which services we want */
72};
73
74extern struct ceph_auth_client *ceph_auth_init(const char *name,
75 const char *secret);
76extern void ceph_auth_destroy(struct ceph_auth_client *ac);
77
78extern void ceph_auth_reset(struct ceph_auth_client *ac);
79
80extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
81 void *buf, size_t len);
82extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
83 void *buf, size_t len,
84 void *reply_buf, size_t reply_len);
85extern int ceph_entity_name_encode(const char *name, void **p, void *end);
86
87extern int ceph_build_auth(struct ceph_auth_client *ac,
88 void *msg_buf, size_t msg_len);
89
90extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
91
92#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
deleted file mode 100644
index ad1dc21286c..00000000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
41/*
42 * the generic auth code decodes the global_id, and we carry no actual
43 * authentication state, so nothing happens here.
44 */
45static int handle_reply(struct ceph_auth_client *ac, int result,
46 void *buf, void *end)
47{
48 struct ceph_auth_none_info *xi = ac->private;
49
50 xi->starting = false;
51 return result;
52}
53
54/*
55 * build an 'authorizer' with our entity_name and global_id. we can
56 * reuse a single static copy since it is identical for all services
57 * we connect to.
58 */
59static int ceph_auth_none_create_authorizer(
60 struct ceph_auth_client *ac, int peer_type,
61 struct ceph_authorizer **a,
62 void **buf, size_t *len,
63 void **reply_buf, size_t *reply_len)
64{
65 struct ceph_auth_none_info *ai = ac->private;
66 struct ceph_none_authorizer *au = &ai->au;
67 void *p, *end;
68 int ret;
69
70 if (!ai->built_authorizer) {
71 p = au->buf;
72 end = p + sizeof(au->buf);
73 ceph_encode_8(&p, 1);
74 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
75 if (ret < 0)
76 goto bad;
77 ceph_decode_need(&p, end, sizeof(u64), bad2);
78 ceph_encode_64(&p, ac->global_id);
79 au->buf_len = p - (void *)au->buf;
80 ai->built_authorizer = true;
81 dout("built authorizer len %d\n", au->buf_len);
82 }
83
84 *a = (struct ceph_authorizer *)au;
85 *buf = au->buf;
86 *len = au->buf_len;
87 *reply_buf = au->reply_buf;
88 *reply_len = sizeof(au->reply_buf);
89 return 0;
90
91bad2:
92 ret = -ERANGE;
93bad:
94 return ret;
95}
96
97static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
98 struct ceph_authorizer *a)
99{
100 /* nothing to do */
101}
102
103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
105 .reset = reset,
106 .destroy = destroy,
107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
109 .handle_reply = handle_reply,
110 .create_authorizer = ceph_auth_none_create_authorizer,
111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
112};
113
114int ceph_auth_none_init(struct ceph_auth_client *ac)
115{
116 struct ceph_auth_none_info *xi;
117
118 dout("ceph_auth_none_init %p\n", ac);
119 xi = kzalloc(sizeof(*xi), GFP_NOFS);
120 if (!xi)
121 return -ENOMEM;
122
123 xi->starting = true;
124 xi->built_authorizer = false;
125
126 ac->protocol = CEPH_AUTH_NONE;
127 ac->private = xi;
128 ac->ops = &ceph_auth_none_ops;
129 return 0;
130}
131
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
deleted file mode 100644
index 8164df1a08b..00000000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5
6#include "auth.h"
7
8/*
9 * null security mode.
10 *
11 * we use a single static authorizer that simply encodes our entity name
12 * and global id.
13 */
14
15struct ceph_none_authorizer {
16 char buf[128];
17 int buf_len;
18 char reply_buf[0];
19};
20
21struct ceph_auth_none_info {
22 bool starting;
23 bool built_authorizer;
24 struct ceph_none_authorizer au; /* we only need one; it's static */
25};
26
27extern int ceph_auth_none_init(struct ceph_auth_client *ac);
28
29#endif
30
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
deleted file mode 100644
index 6d44053ecff..00000000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,684 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15#define TEMP_TICKET_BUF_LEN 256
16
17static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
18
19static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
20{
21 struct ceph_x_info *xi = ac->private;
22 int need;
23
24 ceph_x_validate_tickets(ac, &need);
25 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
26 ac->want_keys, need, xi->have_keys);
27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28}
29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
41static int ceph_x_encrypt_buflen(int ilen)
42{
43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
44 sizeof(u32);
45}
46
47static int ceph_x_encrypt(struct ceph_crypto_key *secret,
48 void *ibuf, int ilen, void *obuf, size_t olen)
49{
50 struct ceph_x_encrypt_header head = {
51 .struct_v = 1,
52 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
53 };
54 size_t len = olen - sizeof(u32);
55 int ret;
56
57 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
58 &head, sizeof(head), ibuf, ilen);
59 if (ret)
60 return ret;
61 ceph_encode_32(&obuf, len);
62 return len + sizeof(u32);
63}
64
65static int ceph_x_decrypt(struct ceph_crypto_key *secret,
66 void **p, void *end, void *obuf, size_t olen)
67{
68 struct ceph_x_encrypt_header head;
69 size_t head_len = sizeof(head);
70 int len, ret;
71
72 len = ceph_decode_32(p);
73 if (*p + len > end)
74 return -EINVAL;
75
76 dout("ceph_x_decrypt len %d\n", len);
77 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
78 *p, len);
79 if (ret)
80 return ret;
81 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
82 return -EPERM;
83 *p += len;
84 return olen;
85}
86
87/*
88 * get existing (or insert new) ticket handler
89 */
90struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
91 int service)
92{
93 struct ceph_x_ticket_handler *th;
94 struct ceph_x_info *xi = ac->private;
95 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
96
97 while (*p) {
98 parent = *p;
99 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
100 if (service < th->service)
101 p = &(*p)->rb_left;
102 else if (service > th->service)
103 p = &(*p)->rb_right;
104 else
105 return th;
106 }
107
108 /* add it */
109 th = kzalloc(sizeof(*th), GFP_NOFS);
110 if (!th)
111 return ERR_PTR(-ENOMEM);
112 th->service = service;
113 rb_link_node(&th->node, parent, p);
114 rb_insert_color(&th->node, &xi->ticket_handlers);
115 return th;
116}
117
118static void remove_ticket_handler(struct ceph_auth_client *ac,
119 struct ceph_x_ticket_handler *th)
120{
121 struct ceph_x_info *xi = ac->private;
122
123 dout("remove_ticket_handler %p %d\n", th, th->service);
124 rb_erase(&th->node, &xi->ticket_handlers);
125 ceph_crypto_key_destroy(&th->session_key);
126 if (th->ticket_blob)
127 ceph_buffer_put(th->ticket_blob);
128 kfree(th);
129}
130
131static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
132 struct ceph_crypto_key *secret,
133 void *buf, void *end)
134{
135 struct ceph_x_info *xi = ac->private;
136 int num;
137 void *p = buf;
138 int ret;
139 char *dbuf;
140 char *ticket_buf;
141 u8 reply_struct_v;
142
143 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
144 if (!dbuf)
145 return -ENOMEM;
146
147 ret = -ENOMEM;
148 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
149 if (!ticket_buf)
150 goto out_dbuf;
151
152 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
153 reply_struct_v = ceph_decode_8(&p);
154 if (reply_struct_v != 1)
155 goto bad;
156 num = ceph_decode_32(&p);
157 dout("%d tickets\n", num);
158 while (num--) {
159 int type;
160 u8 tkt_struct_v, blob_struct_v;
161 struct ceph_x_ticket_handler *th;
162 void *dp, *dend;
163 int dlen;
164 char is_enc;
165 struct timespec validity;
166 struct ceph_crypto_key old_key;
167 void *tp, *tpend;
168 struct ceph_timespec new_validity;
169 struct ceph_crypto_key new_session_key;
170 struct ceph_buffer *new_ticket_blob;
171 unsigned long new_expires, new_renew_after;
172 u64 new_secret_id;
173
174 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
175
176 type = ceph_decode_32(&p);
177 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
178
179 tkt_struct_v = ceph_decode_8(&p);
180 if (tkt_struct_v != 1)
181 goto bad;
182
183 th = get_ticket_handler(ac, type);
184 if (IS_ERR(th)) {
185 ret = PTR_ERR(th);
186 goto out;
187 }
188
189 /* blob for me */
190 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
191 TEMP_TICKET_BUF_LEN);
192 if (dlen <= 0) {
193 ret = dlen;
194 goto out;
195 }
196 dout(" decrypted %d bytes\n", dlen);
197 dend = dbuf + dlen;
198 dp = dbuf;
199
200 tkt_struct_v = ceph_decode_8(&dp);
201 if (tkt_struct_v != 1)
202 goto bad;
203
204 memcpy(&old_key, &th->session_key, sizeof(old_key));
205 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
206 if (ret)
207 goto out;
208
209 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
210 ceph_decode_timespec(&validity, &new_validity);
211 new_expires = get_seconds() + validity.tv_sec;
212 new_renew_after = new_expires - (validity.tv_sec / 4);
213 dout(" expires=%lu renew_after=%lu\n", new_expires,
214 new_renew_after);
215
216 /* ticket blob for service */
217 ceph_decode_8_safe(&p, end, is_enc, bad);
218 tp = ticket_buf;
219 if (is_enc) {
220 /* encrypted */
221 dout(" encrypted ticket\n");
222 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
223 TEMP_TICKET_BUF_LEN);
224 if (dlen < 0) {
225 ret = dlen;
226 goto out;
227 }
228 dlen = ceph_decode_32(&tp);
229 } else {
230 /* unencrypted */
231 ceph_decode_32_safe(&p, end, dlen, bad);
232 ceph_decode_need(&p, end, dlen, bad);
233 ceph_decode_copy(&p, ticket_buf, dlen);
234 }
235 tpend = tp + dlen;
236 dout(" ticket blob is %d bytes\n", dlen);
237 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
238 blob_struct_v = ceph_decode_8(&tp);
239 new_secret_id = ceph_decode_64(&tp);
240 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
241 if (ret)
242 goto out;
243
244 /* all is well, update our ticket */
245 ceph_crypto_key_destroy(&th->session_key);
246 if (th->ticket_blob)
247 ceph_buffer_put(th->ticket_blob);
248 th->session_key = new_session_key;
249 th->ticket_blob = new_ticket_blob;
250 th->validity = new_validity;
251 th->secret_id = new_secret_id;
252 th->expires = new_expires;
253 th->renew_after = new_renew_after;
254 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
255 type, ceph_entity_type_name(type), th->secret_id,
256 (int)th->ticket_blob->vec.iov_len);
257 xi->have_keys |= th->service;
258 }
259
260 ret = 0;
261out:
262 kfree(ticket_buf);
263out_dbuf:
264 kfree(dbuf);
265 return ret;
266
267bad:
268 ret = -EINVAL;
269 goto out;
270}
271
272static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
273 struct ceph_x_ticket_handler *th,
274 struct ceph_x_authorizer *au)
275{
276 int maxlen;
277 struct ceph_x_authorize_a *msg_a;
278 struct ceph_x_authorize_b msg_b;
279 void *p, *end;
280 int ret;
281 int ticket_blob_len =
282 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
283
284 dout("build_authorizer for %s %p\n",
285 ceph_entity_type_name(th->service), au);
286
287 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
288 ceph_x_encrypt_buflen(ticket_blob_len);
289 dout(" need len %d\n", maxlen);
290 if (au->buf && au->buf->alloc_len < maxlen) {
291 ceph_buffer_put(au->buf);
292 au->buf = NULL;
293 }
294 if (!au->buf) {
295 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
296 if (!au->buf)
297 return -ENOMEM;
298 }
299 au->service = th->service;
300
301 msg_a = au->buf->vec.iov_base;
302 msg_a->struct_v = 1;
303 msg_a->global_id = cpu_to_le64(ac->global_id);
304 msg_a->service_id = cpu_to_le32(th->service);
305 msg_a->ticket_blob.struct_v = 1;
306 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
307 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
308 if (ticket_blob_len) {
309 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
310 th->ticket_blob->vec.iov_len);
311 }
312 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
313 le64_to_cpu(msg_a->ticket_blob.secret_id));
314
315 p = msg_a + 1;
316 p += ticket_blob_len;
317 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
318
319 get_random_bytes(&au->nonce, sizeof(au->nonce));
320 msg_b.struct_v = 1;
321 msg_b.nonce = cpu_to_le64(au->nonce);
322 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
323 p, end - p);
324 if (ret < 0)
325 goto out_buf;
326 p += ret;
327 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
328 dout(" built authorizer nonce %llx len %d\n", au->nonce,
329 (int)au->buf->vec.iov_len);
330 BUG_ON(au->buf->vec.iov_len > maxlen);
331 return 0;
332
333out_buf:
334 ceph_buffer_put(au->buf);
335 au->buf = NULL;
336 return ret;
337}
338
339static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
340 void **p, void *end)
341{
342 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
343 ceph_encode_8(p, 1);
344 ceph_encode_64(p, th->secret_id);
345 if (th->ticket_blob) {
346 const char *buf = th->ticket_blob->vec.iov_base;
347 u32 len = th->ticket_blob->vec.iov_len;
348
349 ceph_encode_32_safe(p, end, len, bad);
350 ceph_encode_copy_safe(p, end, buf, len, bad);
351 } else {
352 ceph_encode_32_safe(p, end, 0, bad);
353 }
354
355 return 0;
356bad:
357 return -ERANGE;
358}
359
360static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
361{
362 int want = ac->want_keys;
363 struct ceph_x_info *xi = ac->private;
364 int service;
365
366 *pneed = ac->want_keys & ~(xi->have_keys);
367
368 for (service = 1; service <= want; service <<= 1) {
369 struct ceph_x_ticket_handler *th;
370
371 if (!(ac->want_keys & service))
372 continue;
373
374 if (*pneed & service)
375 continue;
376
377 th = get_ticket_handler(ac, service);
378
379 if (!th) {
380 *pneed |= service;
381 continue;
382 }
383
384 if (get_seconds() >= th->renew_after)
385 *pneed |= service;
386 if (get_seconds() >= th->expires)
387 xi->have_keys &= ~service;
388 }
389}
390
391
392static int ceph_x_build_request(struct ceph_auth_client *ac,
393 void *buf, void *end)
394{
395 struct ceph_x_info *xi = ac->private;
396 int need;
397 struct ceph_x_request_header *head = buf;
398 int ret;
399 struct ceph_x_ticket_handler *th =
400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
401
402 ceph_x_validate_tickets(ac, &need);
403
404 dout("build_request want %x have %x need %x\n",
405 ac->want_keys, xi->have_keys, need);
406
407 if (need & CEPH_ENTITY_TYPE_AUTH) {
408 struct ceph_x_authenticate *auth = (void *)(head + 1);
409 void *p = auth + 1;
410 struct ceph_x_challenge_blob tmp;
411 char tmp_enc[40];
412 u64 *u;
413
414 if (p > end)
415 return -ERANGE;
416
417 dout(" get_auth_session_key\n");
418 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
419
420 /* encrypt and hash */
421 get_random_bytes(&auth->client_challenge, sizeof(u64));
422 tmp.client_challenge = auth->client_challenge;
423 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
424 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
425 tmp_enc, sizeof(tmp_enc));
426 if (ret < 0)
427 return ret;
428
429 auth->struct_v = 1;
430 auth->key = 0;
431 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
432 auth->key ^= *u;
433 dout(" server_challenge %llx client_challenge %llx key %llx\n",
434 xi->server_challenge, le64_to_cpu(auth->client_challenge),
435 le64_to_cpu(auth->key));
436
437 /* now encode the old ticket if it exists */
438 ret = ceph_x_encode_ticket(th, &p, end);
439 if (ret < 0)
440 return ret;
441
442 return p - buf;
443 }
444
445 if (need) {
446 void *p = head + 1;
447 struct ceph_x_service_ticket_request *req;
448
449 if (p > end)
450 return -ERANGE;
451 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
452
453 BUG_ON(!th);
454 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
455 if (ret)
456 return ret;
457 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
458 xi->auth_authorizer.buf->vec.iov_len);
459
460 req = p;
461 req->keys = cpu_to_le32(need);
462 p += sizeof(*req);
463 return p - buf;
464 }
465
466 return 0;
467}
468
469static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
470 void *buf, void *end)
471{
472 struct ceph_x_info *xi = ac->private;
473 struct ceph_x_reply_header *head = buf;
474 struct ceph_x_ticket_handler *th;
475 int len = end - buf;
476 int op;
477 int ret;
478
479 if (result)
480 return result; /* XXX hmm? */
481
482 if (xi->starting) {
483 /* it's a hello */
484 struct ceph_x_server_challenge *sc = buf;
485
486 if (len != sizeof(*sc))
487 return -EINVAL;
488 xi->server_challenge = le64_to_cpu(sc->server_challenge);
489 dout("handle_reply got server challenge %llx\n",
490 xi->server_challenge);
491 xi->starting = false;
492 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
493 return -EAGAIN;
494 }
495
496 op = le16_to_cpu(head->op);
497 result = le32_to_cpu(head->result);
498 dout("handle_reply op %d result %d\n", op, result);
499 switch (op) {
500 case CEPHX_GET_AUTH_SESSION_KEY:
501 /* verify auth key */
502 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
503 buf + sizeof(*head), end);
504 break;
505
506 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
507 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
508 BUG_ON(!th);
509 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
510 buf + sizeof(*head), end);
511 break;
512
513 default:
514 return -EINVAL;
515 }
516 if (ret)
517 return ret;
518 if (ac->want_keys == xi->have_keys)
519 return 0;
520 return -EAGAIN;
521}
522
523static int ceph_x_create_authorizer(
524 struct ceph_auth_client *ac, int peer_type,
525 struct ceph_authorizer **a,
526 void **buf, size_t *len,
527 void **reply_buf, size_t *reply_len)
528{
529 struct ceph_x_authorizer *au;
530 struct ceph_x_ticket_handler *th;
531 int ret;
532
533 th = get_ticket_handler(ac, peer_type);
534 if (IS_ERR(th))
535 return PTR_ERR(th);
536
537 au = kzalloc(sizeof(*au), GFP_NOFS);
538 if (!au)
539 return -ENOMEM;
540
541 ret = ceph_x_build_authorizer(ac, th, au);
542 if (ret) {
543 kfree(au);
544 return ret;
545 }
546
547 *a = (struct ceph_authorizer *)au;
548 *buf = au->buf->vec.iov_base;
549 *len = au->buf->vec.iov_len;
550 *reply_buf = au->reply_buf;
551 *reply_len = sizeof(au->reply_buf);
552 return 0;
553}
554
555static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
556 struct ceph_authorizer *a, size_t len)
557{
558 struct ceph_x_authorizer *au = (void *)a;
559 struct ceph_x_ticket_handler *th;
560 int ret = 0;
561 struct ceph_x_authorize_reply reply;
562 void *p = au->reply_buf;
563 void *end = p + sizeof(au->reply_buf);
564
565 th = get_ticket_handler(ac, au->service);
566 if (!th)
567 return -EIO; /* hrm! */
568 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
569 if (ret < 0)
570 return ret;
571 if (ret != sizeof(reply))
572 return -EPERM;
573
574 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
575 ret = -EPERM;
576 else
577 ret = 0;
578 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
579 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
580 return ret;
581}
582
583static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
584 struct ceph_authorizer *a)
585{
586 struct ceph_x_authorizer *au = (void *)a;
587
588 ceph_buffer_put(au->buf);
589 kfree(au);
590}
591
592
593static void ceph_x_reset(struct ceph_auth_client *ac)
594{
595 struct ceph_x_info *xi = ac->private;
596
597 dout("reset\n");
598 xi->starting = true;
599 xi->server_challenge = 0;
600}
601
602static void ceph_x_destroy(struct ceph_auth_client *ac)
603{
604 struct ceph_x_info *xi = ac->private;
605 struct rb_node *p;
606
607 dout("ceph_x_destroy %p\n", ac);
608 ceph_crypto_key_destroy(&xi->secret);
609
610 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
611 struct ceph_x_ticket_handler *th =
612 rb_entry(p, struct ceph_x_ticket_handler, node);
613 remove_ticket_handler(ac, th);
614 }
615
616 if (xi->auth_authorizer.buf)
617 ceph_buffer_put(xi->auth_authorizer.buf);
618
619 kfree(ac->private);
620 ac->private = NULL;
621}
622
623static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
624 int peer_type)
625{
626 struct ceph_x_ticket_handler *th;
627
628 th = get_ticket_handler(ac, peer_type);
629 if (th && !IS_ERR(th))
630 remove_ticket_handler(ac, th);
631}
632
633
634static const struct ceph_auth_client_ops ceph_x_ops = {
635 .name = "x",
636 .is_authenticated = ceph_x_is_authenticated,
637 .should_authenticate = ceph_x_should_authenticate,
638 .build_request = ceph_x_build_request,
639 .handle_reply = ceph_x_handle_reply,
640 .create_authorizer = ceph_x_create_authorizer,
641 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
642 .destroy_authorizer = ceph_x_destroy_authorizer,
643 .invalidate_authorizer = ceph_x_invalidate_authorizer,
644 .reset = ceph_x_reset,
645 .destroy = ceph_x_destroy,
646};
647
648
649int ceph_x_init(struct ceph_auth_client *ac)
650{
651 struct ceph_x_info *xi;
652 int ret;
653
654 dout("ceph_x_init %p\n", ac);
655 ret = -ENOMEM;
656 xi = kzalloc(sizeof(*xi), GFP_NOFS);
657 if (!xi)
658 goto out;
659
660 ret = -EINVAL;
661 if (!ac->secret) {
662 pr_err("no secret set (for auth_x protocol)\n");
663 goto out_nomem;
664 }
665
666 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
667 if (ret)
668 goto out_nomem;
669
670 xi->starting = true;
671 xi->ticket_handlers = RB_ROOT;
672
673 ac->protocol = CEPH_AUTH_CEPHX;
674 ac->private = xi;
675 ac->ops = &ceph_x_ops;
676 return 0;
677
678out_nomem:
679 kfree(xi);
680out:
681 return ret;
682}
683
684
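
One detail of the deleted cephx code that rewards a worked example is ceph_x_encrypt_buflen(): the output budget is a u32 length prefix, the magic/version header, the payload, and 16 spare bytes, which I read as headroom for cipher block padding (that reading is my assumption; the code does not say). A standalone sizing sketch for the msg_b case built in ceph_x_build_authorizer():

/* Reproduce the on-wire budget ceph_x_encrypt_buflen() reserves. */
#include <stdio.h>
#include <stdint.h>

struct x_encrypt_header {	/* mirrors ceph_x_encrypt_header above */
	uint8_t struct_v;
	uint64_t magic;
} __attribute__ ((packed));

static int encrypt_buflen(int ilen)
{
	return sizeof(struct x_encrypt_header) + ilen + 16 +
	       sizeof(uint32_t);
}

int main(void)
{
	int msg_b_len = 9;	/* struct_v (1) + nonce (8), packed */

	printf("worst-case encrypted msg_b: %d bytes\n",
	       encrypt_buflen(msg_b_len));	/* 9 + 9 + 16 + 4 = 38 */
	return 0;
}
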
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
deleted file mode 100644
index ff6f8180e68..00000000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
deleted file mode 100644
index 671d30576c4..00000000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
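
To make the two-piece authorizer layout described above concrete, the following standalone sketch mirrors the wire structs (renamed copies, for illustration only) and prints the fixed-size budget that ceph_x_build_authorizer() starts from before appending the variable-length ticket blob and the encrypted part (b):

/* Part (a) travels in the clear with the ticket blob appended;
 * part (b) is encrypted with the session key. */
#include <stdio.h>
#include <stdint.h>

struct x_ticket_blob {
	uint8_t struct_v;
	uint64_t secret_id;
	uint32_t blob_len;
	char blob[];		/* blob_len bytes follow */
} __attribute__ ((packed));

struct x_authorize_a {
	uint8_t struct_v;
	uint64_t global_id;
	uint32_t service_id;
	struct x_ticket_blob ticket_blob;
} __attribute__ ((packed));

struct x_authorize_b {
	uint8_t struct_v;
	uint64_t nonce;
} __attribute__ ((packed));

int main(void)
{
	uint32_t blob_len = 64;	/* arbitrary example ticket blob */

	/* part (a) is 26 bytes packed, then the blob, then part (b) */
	printf("part a: %zu + %u bytes, part b (plaintext): %zu bytes\n",
	       sizeof(struct x_authorize_a), blob_len,
	       sizeof(struct x_authorize_b));
	return 0;
}
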
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
deleted file mode 100644
index c67535d70aa..00000000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,81 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{
68 size_t len;
69
70 ceph_decode_need(p, end, sizeof(u32), bad);
71 len = ceph_decode_32(p);
72 dout("decode_buffer len %d\n", (int)len);
73 ceph_decode_need(p, end, len, bad);
74 *b = ceph_buffer_new(len, GFP_NOFS);
75 if (!*b)
76 return -ENOMEM;
77 ceph_decode_copy(p, (*b)->vec.iov_base, len);
78 return 0;
79bad:
80 return -EINVAL;
81}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
deleted file mode 100644
index 58d19014068..00000000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
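
The ownership rule buffer.h encodes with kref is simply that the last ceph_buffer_put() frees the buffer. A userspace sketch of that discipline, with the kref simulated by a plain non-atomic counter, so this is illustration only (the kernel side additionally falls back to vmalloc for large buffers):

/* Last put frees, mirroring kref_put() -> ceph_buffer_release(). */
#include <stdio.h>
#include <stdlib.h>

struct buf {
	int refs;
	size_t alloc_len;
	void *data;
};

static struct buf *buf_new(size_t len)
{
	struct buf *b = malloc(sizeof(*b));

	if (!b)
		return NULL;
	b->data = malloc(len);
	if (!b->data) {
		free(b);
		return NULL;
	}
	b->refs = 1;		/* caller holds the first reference */
	b->alloc_len = len;
	return b;
}

static struct buf *buf_get(struct buf *b)
{
	b->refs++;
	return b;
}

static void buf_put(struct buf *b)
{
	if (--b->refs == 0) {	/* release on last reference */
		free(b->data);
		free(b);
	}
}

int main(void)
{
	struct buf *b = buf_new(4096);
	struct buf *alias = buf_get(b);		/* second owner */

	buf_put(b);				/* alias keeps it alive */
	printf("after first put: %d ref(s) left\n", alias->refs);
	buf_put(alias);				/* last put frees */
	return 0;
}
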
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b81be9a5648..98ab13e2b71 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
@@ -9,8 +9,9 @@
9#include <linux/writeback.h> 9#include <linux/writeback.h>
10 10
11#include "super.h" 11#include "super.h"
12#include "decode.h" 12#include "mds_client.h"
13#include "messenger.h" 13#include <linux/ceph/decode.h>
14#include <linux/ceph/messenger.h>
14 15
15/* 16/*
16 * Capability management 17 * Capability management
@@ -113,58 +114,41 @@ const char *ceph_cap_string(int caps)
113 return cap_str[i]; 114 return cap_str[i];
114} 115}
115 116
116/* 117void ceph_caps_init(struct ceph_mds_client *mdsc)
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by struct ceph_caps_reservations. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{ 118{
137 INIT_LIST_HEAD(&caps_list); 119 INIT_LIST_HEAD(&mdsc->caps_list);
138 spin_lock_init(&caps_list_lock); 120 spin_lock_init(&mdsc->caps_list_lock);
139} 121}
140 122
141void ceph_caps_finalize(void) 123void ceph_caps_finalize(struct ceph_mds_client *mdsc)
142{ 124{
143 struct ceph_cap *cap; 125 struct ceph_cap *cap;
144 126
145 spin_lock(&caps_list_lock); 127 spin_lock(&mdsc->caps_list_lock);
146 while (!list_empty(&caps_list)) { 128 while (!list_empty(&mdsc->caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 129 cap = list_first_entry(&mdsc->caps_list,
130 struct ceph_cap, caps_item);
148 list_del(&cap->caps_item); 131 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap); 132 kmem_cache_free(ceph_cap_cachep, cap);
150 } 133 }
151 caps_total_count = 0; 134 mdsc->caps_total_count = 0;
152 caps_avail_count = 0; 135 mdsc->caps_avail_count = 0;
153 caps_use_count = 0; 136 mdsc->caps_use_count = 0;
154 caps_reserve_count = 0; 137 mdsc->caps_reserve_count = 0;
155 caps_min_count = 0; 138 mdsc->caps_min_count = 0;
156 spin_unlock(&caps_list_lock); 139 spin_unlock(&mdsc->caps_list_lock);
157} 140}
158 141
159void ceph_adjust_min_caps(int delta) 142void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
160{ 143{
161 spin_lock(&caps_list_lock); 144 spin_lock(&mdsc->caps_list_lock);
162 caps_min_count += delta; 145 mdsc->caps_min_count += delta;
163 BUG_ON(caps_min_count < 0); 146 BUG_ON(mdsc->caps_min_count < 0);
164 spin_unlock(&caps_list_lock); 147 spin_unlock(&mdsc->caps_list_lock);
165} 148}
166 149
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) 150int ceph_reserve_caps(struct ceph_mds_client *mdsc,
151 struct ceph_cap_reservation *ctx, int need)
168{ 152{
169 int i; 153 int i;
170 struct ceph_cap *cap; 154 struct ceph_cap *cap;
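
The invariant every BUG_ON() in this caps.c diff asserts, caps_total_count == caps_use_count + caps_reserve_count + caps_avail_count, is easier to follow in isolation. A userspace sketch of the reserve/get/unreserve lifecycle under that invariant (the counts are invented; assert() stands in for BUG_ON()):

/* Pool accounting in miniature: reuse available caps, allocate the
 * shortfall into the reservation, consume from it, return the rest. */
#include <assert.h>
#include <stdio.h>

struct pool {
	int total, use, reserve, avail;
};

static void check(struct pool *p)
{
	assert(p->total == p->use + p->reserve + p->avail);
}

static void reserve_caps(struct pool *p, int need)
{
	int have = p->avail >= need ? need : p->avail;

	p->avail -= have;		/* reuse what is already free */
	p->reserve += have;
	p->total += need - have;	/* allocate the shortfall */
	p->reserve += need - have;
	check(p);
}

static void get_cap(struct pool *p)
{
	p->reserve--;			/* consume one reserved cap */
	p->use++;
	check(p);
}

static void unreserve_caps(struct pool *p, int count)
{
	p->reserve -= count;		/* unused reservation back to avail */
	p->avail += count;
	check(p);
}

int main(void)
{
	struct pool p = { .total = 2, .avail = 2 };

	reserve_caps(&p, 5);	/* 2 reused, 3 newly allocated */
	get_cap(&p);		/* one cap consumed by an MDS reply */
	unreserve_caps(&p, 4);	/* remainder released */
	printf("%d = %d used + %d resv + %d avail\n",
	       p.total, p.use, p.reserve, p.avail);
	return 0;
}
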
@@ -176,16 +160,17 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
176 dout("reserve caps ctx=%p need=%d\n", ctx, need); 160 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177 161
178 /* first reserve any caps that are already allocated */ 162 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock); 163 spin_lock(&mdsc->caps_list_lock);
180 if (caps_avail_count >= need) 164 if (mdsc->caps_avail_count >= need)
181 have = need; 165 have = need;
182 else 166 else
183 have = caps_avail_count; 167 have = mdsc->caps_avail_count;
184 caps_avail_count -= have; 168 mdsc->caps_avail_count -= have;
185 caps_reserve_count += have; 169 mdsc->caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 170 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
187 caps_avail_count); 171 mdsc->caps_reserve_count +
188 spin_unlock(&caps_list_lock); 172 mdsc->caps_avail_count);
173 spin_unlock(&mdsc->caps_list_lock);
189 174
190 for (i = have; i < need; i++) { 175 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 176 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
@@ -198,19 +183,20 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
198 } 183 }
199 BUG_ON(have + alloc != need); 184 BUG_ON(have + alloc != need);
200 185
201 spin_lock(&caps_list_lock); 186 spin_lock(&mdsc->caps_list_lock);
202 caps_total_count += alloc; 187 mdsc->caps_total_count += alloc;
203 caps_reserve_count += alloc; 188 mdsc->caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list); 189 list_splice(&newcaps, &mdsc->caps_list);
205 190
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 191 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
207 caps_avail_count); 192 mdsc->caps_reserve_count +
208 spin_unlock(&caps_list_lock); 193 mdsc->caps_avail_count);
194 spin_unlock(&mdsc->caps_list_lock);
209 195
210 ctx->count = need; 196 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count, 198 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
213 caps_avail_count); 199 mdsc->caps_reserve_count, mdsc->caps_avail_count);
214 return 0; 200 return 0;
215 201
216out_alloc_count: 202out_alloc_count:
@@ -220,26 +206,29 @@ out_alloc_count:
220 return ret; 206 return ret;
221} 207}
222 208
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) 209int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
210 struct ceph_cap_reservation *ctx)
224{ 211{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 212 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) { 213 if (ctx->count) {
227 spin_lock(&caps_list_lock); 214 spin_lock(&mdsc->caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count); 215 BUG_ON(mdsc->caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count; 216 mdsc->caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count; 217 mdsc->caps_avail_count += ctx->count;
231 ctx->count = 0; 218 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n", 219 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count, 220 mdsc->caps_total_count, mdsc->caps_use_count,
234 caps_avail_count); 221 mdsc->caps_reserve_count, mdsc->caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 222 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
236 caps_avail_count); 223 mdsc->caps_reserve_count +
237 spin_unlock(&caps_list_lock); 224 mdsc->caps_avail_count);
225 spin_unlock(&mdsc->caps_list_lock);
238 } 226 }
239 return 0; 227 return 0;
240} 228}
241 229
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) 230static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
231 struct ceph_cap_reservation *ctx)
243{ 232{
244 struct ceph_cap *cap = NULL; 233 struct ceph_cap *cap = NULL;
245 234
@@ -247,71 +236,74 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
247 if (!ctx) { 236 if (!ctx) {
248 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 237 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249 if (cap) { 238 if (cap) {
250 caps_use_count++; 239 mdsc->caps_use_count++;
251 caps_total_count++; 240 mdsc->caps_total_count++;
252 } 241 }
253 return cap; 242 return cap;
254 } 243 }
255 244
256 spin_lock(&caps_list_lock); 245 spin_lock(&mdsc->caps_list_lock);
257 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", 246 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
258 ctx, ctx->count, caps_total_count, caps_use_count, 247 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
259 caps_reserve_count, caps_avail_count); 248 mdsc->caps_reserve_count, mdsc->caps_avail_count);
260 BUG_ON(!ctx->count); 249 BUG_ON(!ctx->count);
261 BUG_ON(ctx->count > caps_reserve_count); 250 BUG_ON(ctx->count > mdsc->caps_reserve_count);
262 BUG_ON(list_empty(&caps_list)); 251 BUG_ON(list_empty(&mdsc->caps_list));
263 252
264 ctx->count--; 253 ctx->count--;
265 caps_reserve_count--; 254 mdsc->caps_reserve_count--;
266 caps_use_count++; 255 mdsc->caps_use_count++;
267 256
268 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 257 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
269 list_del(&cap->caps_item); 258 list_del(&cap->caps_item);
270 259
271 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 260 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
272 caps_avail_count); 261 mdsc->caps_reserve_count + mdsc->caps_avail_count);
273 spin_unlock(&caps_list_lock); 262 spin_unlock(&mdsc->caps_list_lock);
274 return cap; 263 return cap;
275} 264}
276 265
277void ceph_put_cap(struct ceph_cap *cap) 266void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
278{ 267{
279 spin_lock(&caps_list_lock); 268 spin_lock(&mdsc->caps_list_lock);
280 dout("put_cap %p %d = %d used + %d resv + %d avail\n", 269 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
281 cap, caps_total_count, caps_use_count, 270 cap, mdsc->caps_total_count, mdsc->caps_use_count,
282 caps_reserve_count, caps_avail_count); 271 mdsc->caps_reserve_count, mdsc->caps_avail_count);
283 caps_use_count--; 272 mdsc->caps_use_count--;
284 /* 273 /*
285 * Keep some preallocated caps around (ceph_min_count), to 274 * Keep some preallocated caps around (ceph_min_count), to
286 * avoid lots of free/alloc churn. 275 * avoid lots of free/alloc churn.
287 */ 276 */
288 if (caps_avail_count >= caps_reserve_count + caps_min_count) { 277 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
289 caps_total_count--; 278 mdsc->caps_min_count) {
279 mdsc->caps_total_count--;
290 kmem_cache_free(ceph_cap_cachep, cap); 280 kmem_cache_free(ceph_cap_cachep, cap);
291 } else { 281 } else {
292 caps_avail_count++; 282 mdsc->caps_avail_count++;
293 list_add(&cap->caps_item, &caps_list); 283 list_add(&cap->caps_item, &mdsc->caps_list);
294 } 284 }
295 285
296 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 286 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
297 caps_avail_count); 287 mdsc->caps_reserve_count + mdsc->caps_avail_count);
298 spin_unlock(&caps_list_lock); 288 spin_unlock(&mdsc->caps_list_lock);
299} 289}
300 290
301void ceph_reservation_status(struct ceph_client *client, 291void ceph_reservation_status(struct ceph_fs_client *fsc,
302 int *total, int *avail, int *used, int *reserved, 292 int *total, int *avail, int *used, int *reserved,
303 int *min) 293 int *min)
304{ 294{
295 struct ceph_mds_client *mdsc = fsc->mdsc;
296
305 if (total) 297 if (total)
306 *total = caps_total_count; 298 *total = mdsc->caps_total_count;
307 if (avail) 299 if (avail)
308 *avail = caps_avail_count; 300 *avail = mdsc->caps_avail_count;
309 if (used) 301 if (used)
310 *used = caps_use_count; 302 *used = mdsc->caps_use_count;
311 if (reserved) 303 if (reserved)
312 *reserved = caps_reserve_count; 304 *reserved = mdsc->caps_reserve_count;
313 if (min) 305 if (min)
314 *min = caps_min_count; 306 *min = mdsc->caps_min_count;
315} 307}
316 308
317/* 309/*
@@ -336,22 +328,29 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
 	return NULL;
 }
 
+struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+	struct ceph_cap *cap;
+
+	spin_lock(&ci->vfs_inode.i_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	spin_unlock(&ci->vfs_inode.i_lock);
+	return cap;
+}
+
 /*
- * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
- * -1.
+ * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
  */
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
 {
 	struct ceph_cap *cap;
 	int mds = -1;
 	struct rb_node *p;
 
-	/* prefer mds with WR|WRBUFFER|EXCL caps */
+	/* prefer mds with WR|BUFFER|EXCL caps */
 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 		cap = rb_entry(p, struct ceph_cap, ci_node);
 		mds = cap->mds;
-		if (mseq)
-			*mseq = cap->mseq;
 		if (cap->issued & (CEPH_CAP_FILE_WR |
 				   CEPH_CAP_FILE_BUFFER |
 				   CEPH_CAP_FILE_EXCL))
@@ -364,7 +363,7 @@ int ceph_get_cap_mds(struct inode *inode)
 {
 	int mds;
 	spin_lock(&inode->i_lock);
-	mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+	mds = __ceph_get_cap_mds(ceph_inode(inode));
 	spin_unlock(&inode->i_lock);
 	return mds;
 }
@@ -401,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
-	struct ceph_mount_args *ma = mdsc->client->mount_args;
+	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
 
 	ci->i_hold_caps_min = round_jiffies(jiffies +
 					    ma->caps_wanted_delay_min * HZ);
@@ -483,8 +482,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
 	 * Each time we receive FILE_CACHE anew, we increment
 	 * i_rdcache_gen.
 	 */
-	if ((issued & CEPH_CAP_FILE_CACHE) &&
-	    (had & CEPH_CAP_FILE_CACHE) == 0)
+	if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
 		ci->i_rdcache_gen++;
 
 	/*
@@ -517,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
 		 unsigned seq, unsigned mseq, u64 realmino, int flags,
 		 struct ceph_cap_reservation *caps_reservation)
 {
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_cap *new_cap = NULL;
 	struct ceph_cap *cap;
@@ -543,7 +542,7 @@ retry:
 		new_cap = NULL;
 	} else {
 		spin_unlock(&inode->i_lock);
-		new_cap = get_cap(caps_reservation);
+		new_cap = get_cap(mdsc, caps_reservation);
 		if (new_cap == NULL)
 			return -ENOMEM;
 		goto retry;
@@ -588,6 +587,7 @@ retry:
 		} else {
 			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
 			       realmino);
+			WARN_ON(!realm);
 		}
 	}
 
@@ -815,7 +815,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
 		used |= CEPH_CAP_PIN;
 	if (ci->i_rd_ref)
 		used |= CEPH_CAP_FILE_RD;
-	if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+	if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
 		used |= CEPH_CAP_FILE_CACHE;
 	if (ci->i_wr_ref)
 		used |= CEPH_CAP_FILE_WR;
@@ -831,7 +831,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 {
 	int want = 0;
 	int mode;
-	for (mode = 0; mode < 4; mode++)
+	for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
 		if (ci->i_nr_by_mode[mode])
 			want |= ceph_caps_for_mode(mode);
 	return want;
@@ -874,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	int removed = 0;
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -901,7 +901,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 		ci->i_auth_cap = NULL;
 
 	if (removed)
-		ceph_put_cap(cap);
+		ceph_put_cap(mdsc, cap);
 
 	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
 		struct ceph_snap_realm *realm = ci->i_snap_realm;
@@ -1083,6 +1083,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	gid_t gid;
 	struct ceph_mds_session *session;
 	u64 xattr_version = 0;
+	struct ceph_buffer *xattr_blob = NULL;
 	int delayed = 0;
 	u64 flush_tid = 0;
 	int i;
@@ -1143,6 +1144,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 		for (i = 0; i < CEPH_CAP_BITS; i++)
 			if (flushing & (1 << i))
 				ci->i_cap_flush_tid[i] = flush_tid;
+
+		follows = ci->i_head_snapc->seq;
+	} else {
+		follows = 0;
 	}
 
 	keep = cap->implemented;
@@ -1156,14 +1161,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	mtime = inode->i_mtime;
 	atime = inode->i_atime;
 	time_warp_seq = ci->i_time_warp_seq;
-	follows = ci->i_snap_realm->cached_context->seq;
 	uid = inode->i_uid;
 	gid = inode->i_gid;
 	mode = inode->i_mode;
 
-	if (dropping & CEPH_CAP_XATTR_EXCL) {
+	if (flushing & CEPH_CAP_XATTR_EXCL) {
 		__ceph_build_xattrs_blob(ci);
-		xattr_version = ci->i_xattrs.version + 1;
+		xattr_blob = ci->i_xattrs.blob;
+		xattr_version = ci->i_xattrs.version;
 	}
 
 	spin_unlock(&inode->i_lock);
@@ -1171,9 +1176,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
 		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
 		size, max_size, &mtime, &atime, time_warp_seq,
-		uid, gid, mode,
-		xattr_version,
-		(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+		uid, gid, mode, xattr_version, xattr_blob,
 		follows);
 	if (ret < 0) {
 		dout("error sending cap msg, must requeue %p\n", inode);
@@ -1193,16 +1196,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
+ * Unless @again is true, skip cap_snaps that were already sent to
+ * the MDS (i.e., during this session).
+ *
  * Called under i_lock.  Takes s_mutex as needed.
  */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
-			struct ceph_mds_session **psession)
+			struct ceph_mds_session **psession,
+			int again)
+		__releases(ci->vfs_inode->i_lock)
+		__acquires(ci->vfs_inode->i_lock)
 {
 	struct inode *inode = &ci->vfs_inode;
 	int mds;
 	struct ceph_cap_snap *capsnap;
 	u32 mseq;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
 						    session->s_mutex */
 	u64 next_follows = 0;  /* keep track of how far we've gotten through the
@@ -1223,7 +1232,7 @@ retry:
 		 * pages to be written out.
 		 */
 		if (capsnap->dirty_pages || capsnap->writing)
-			continue;
+			break;
 
 		/*
 		 * if cap writeback already occurred, we should have dropped
@@ -1232,7 +1241,20 @@ retry:
 		BUG_ON(capsnap->dirty == 0);
 
 		/* pick mds, take s_mutex */
-		mds = __ceph_get_cap_mds(ci, &mseq);
+		if (ci->i_auth_cap == NULL) {
+			dout("no auth cap (migrating?), doing nothing\n");
+			goto out;
+		}
+
+		/* only flush each capsnap once */
+		if (!again && !list_empty(&capsnap->flushing_item)) {
+			dout("already flushed %p, skipping\n", capsnap);
+			continue;
+		}
+
+		mds = ci->i_auth_cap->session->s_mds;
+		mseq = ci->i_auth_cap->mseq;
+
 		if (session && session->s_mds != mds) {
 			dout("oops, wrong session %p mutex\n", session);
 			mutex_unlock(&session->s_mutex);
@@ -1251,8 +1273,8 @@ retry:
 		}
 		/*
 		 * if session == NULL, we raced against a cap
-		 * deletion.  retry, and we'll get a better
-		 * @mds value next time.
+		 * deletion or migration.  retry, and we'll
+		 * get a better @mds value next time.
 		 */
 		spin_lock(&inode->i_lock);
 		goto retry;
@@ -1266,8 +1288,8 @@ retry:
 			      &session->s_cap_snaps_flushing);
 		spin_unlock(&inode->i_lock);
 
-		dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
-		     inode, capsnap, next_follows, capsnap->size);
+		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
+		     inode, capsnap, capsnap->follows, capsnap->flush_tid);
 		send_cap_msg(session, ceph_vino(inode).ino, 0,
 			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
 			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
@@ -1275,7 +1297,7 @@ retry:
 			     &capsnap->mtime, &capsnap->atime,
 			     capsnap->time_warp_seq,
 			     capsnap->uid, capsnap->gid, capsnap->mode,
-			     0, NULL,
+			     capsnap->xattr_version, capsnap->xattr_blob,
 			     capsnap->follows);
 
 		next_follows = capsnap->follows + 1;
@@ -1290,6 +1312,7 @@ retry:
 		list_del_init(&ci->i_snap_flush_item);
 	spin_unlock(&mdsc->snap_flush_lock);
 
+out:
 	if (psession)
 		*psession = session;
 	else if (session) {
@@ -1303,7 +1326,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 	struct inode *inode = &ci->vfs_inode;
 
 	spin_lock(&inode->i_lock);
-	__ceph_flush_snaps(ci, NULL, 0);
+	__ceph_flush_snaps(ci, NULL, 0);
 	spin_unlock(&inode->i_lock);
 }
 
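The new again argument gives __ceph_flush_snaps() two retransmit policies: normal callers pass 0 and flush each cap_snap at most once, while the session-recovery path (kick_flushing_capsnaps below) passes 1 to force a resend. A hedged stand-alone sketch of that skip-unless-forced pattern (names are illustrative, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    /* illustrative stand-in for a cap_snap with an "already queued" marker */
    struct snap {
        int id;
        bool on_flushing_list; /* like !list_empty(&capsnap->flushing_item) */
    };

    static void flush_snaps(struct snap *snaps, int n, bool again)
    {
        for (int i = 0; i < n; i++) {
            /* only flush each snap once, unless a resend is forced */
            if (!again && snaps[i].on_flushing_list) {
                printf("already flushed snap %d, skipping\n", snaps[i].id);
                continue;
            }
            printf("flushing snap %d\n", snaps[i].id);
            snaps[i].on_flushing_list = true;
        }
    }

    int main(void)
    {
        struct snap s[2] = { { 1, false }, { 2, true } };

        flush_snaps(s, 2, false); /* snap 2 is skipped */
        flush_snaps(s, 2, true);  /* both re-sent (session recovery) */
        return 0;
    }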
@@ -1314,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	int was = ci->i_dirty_caps;
 	int dirty = 0;
@@ -1324,7 +1347,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 	     ceph_cap_string(was | mask));
 	ci->i_dirty_caps |= mask;
 	if (was == 0) {
-		dout(" inode %p now dirty\n", &ci->vfs_inode);
+		if (!ci->i_head_snapc)
+			ci->i_head_snapc = ceph_get_snap_context(
+				ci->i_snap_realm->cached_context);
+		dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
+		     ci->i_head_snapc);
 		BUG_ON(!list_empty(&ci->i_dirty_item));
 		spin_lock(&mdsc->cap_dirty_lock);
 		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1352,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
 				struct ceph_mds_session *session)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;
 
@@ -1390,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
 /*
  * try to invalidate mapping pages without blocking.
  */
-static int mapping_is_empty(struct address_space *mapping)
-{
-	struct page *page = find_get_page(mapping, 0);
-
-	if (!page)
-		return 1;
-
-	put_page(page);
-	return 0;
-}
-
 static int try_nonblocking_invalidate(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1410,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
 	invalidate_mapping_pages(&inode->i_data, 0, -1);
 	spin_lock(&inode->i_lock);
 
-	if (mapping_is_empty(&inode->i_data) &&
+	if (inode->i_data.nrpages == 0 &&
 	    invalidating_gen == ci->i_rdcache_gen) {
 		/* success. */
 		dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1435,10 +1451,9 @@ static int try_nonblocking_invalidate(struct inode *inode)
  */
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
-	__releases(session->s_mutex)
 {
-	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
 	int file_wanted, used;
@@ -1463,7 +1478,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 
 	/* flush snaps first time around only */
 	if (!list_empty(&ci->i_cap_snaps))
-		__ceph_flush_snaps(ci, &session);
+		__ceph_flush_snaps(ci, &session, 0);
 	goto retry_locked;
 retry:
 	spin_lock(&inode->i_lock);
@@ -1508,13 +1523,15 @@ retry_locked:
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
 	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-	    ci->i_rdcache_gen &&                     /* may have cached pages */
+	    inode->i_data.nrpages &&                 /* have cached pages */
 	    (file_wanted == 0 ||                     /* no open files */
-	     (revoking & CEPH_CAP_FILE_CACHE)) &&    /* or revoking cache */
+	     (revoking & (CEPH_CAP_FILE_CACHE|
+			  CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
 	    !tried_invalidate) {
 		dout("check_caps trying to invalidate on %p\n", inode);
 		if (try_nonblocking_invalidate(inode) < 0) {
-			if (revoking & CEPH_CAP_FILE_CACHE) {
+			if (revoking & (CEPH_CAP_FILE_CACHE|
+					CEPH_CAP_FILE_LAZYIO)) {
 				dout("check_caps queuing invalidate\n");
 				queue_invalidate = 1;
 				ci->i_rdcache_revoking = ci->i_rdcache_gen;
@@ -1679,7 +1696,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 			  unsigned *flush_tid)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int unlock_session = session ? 0 : 1;
 	int flushing = 0;
@@ -1845,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 			   caps_are_flushed(inode, flush_tid));
 	} else {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(inode->i_sb)->mdsc;
+			ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&inode->i_lock);
 		if (__ceph_caps_dirty(ci))
@@ -1878,7 +1895,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 		if (cap && cap->session == session) {
 			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
 			     cap, capsnap);
-			__ceph_flush_snaps(ci, &session);
+			__ceph_flush_snaps(ci, &session, 1);
 		} else {
 			pr_err("%p auth cap %p not mds%d ???\n", inode,
 			       cap, session->s_mds);
@@ -2181,7 +2198,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 
 	if (ci->i_head_snapc == snapc) {
 		ci->i_wrbuffer_ref_head -= nr;
-		if (!ci->i_wrbuffer_ref_head) {
+		if (ci->i_wrbuffer_ref_head == 0 &&
+		    ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+			BUG_ON(!ci->i_head_snapc);
 			ceph_put_snap_context(ci->i_head_snapc);
 			ci->i_head_snapc = NULL;
 		}
@@ -2250,12 +2269,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 			     struct ceph_mds_session *session,
 			     struct ceph_cap *cap,
 			     struct ceph_buffer *xattr_buf)
 	__releases(inode->i_lock)
-	__releases(session->s_mutex)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
-	int seq = le32_to_cpu(grant->seq);
+	unsigned seq = le32_to_cpu(grant->seq);
+	unsigned issue_seq = le32_to_cpu(grant->issue_seq);
 	int newcaps = le32_to_cpu(grant->caps);
 	int issued, implemented, used, wanted, dirty;
 	u64 size = le64_to_cpu(grant->size);
@@ -2267,8 +2286,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	int revoked_rdcache = 0;
 	int queue_invalidate = 0;
 
-	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-	     inode, cap, mds, seq, ceph_cap_string(newcaps));
+	dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
+	     inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
 	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
 	     inode->i_size);
 
@@ -2278,6 +2297,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	 * will invalidate _after_ writeback.)
 	 */
 	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
 	    !ci->i_wrbuffer_ref) {
 		if (try_nonblocking_invalidate(inode) == 0) {
 			revoked_rdcache = 1;
@@ -2363,21 +2383,29 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	}
 
 	cap->seq = seq;
+	cap->issue_seq = issue_seq;
 
 	/* file layout may have changed */
 	ci->i_layout = grant->layout;
 
 	/* revocation, grant, or no-op? */
 	if (cap->issued & ~newcaps) {
-		dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
-		     ceph_cap_string(newcaps));
-		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
-			writeback = 1; /* will delay ack */
-		else if (dirty & ~newcaps)
-			check_caps = 1;  /* initiate writeback in check_caps */
-		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
-			 revoked_rdcache)
-			check_caps = 2;     /* send revoke ack in check_caps */
+		int revoking = cap->issued & ~newcaps;
+
+		dout("revocation: %s -> %s (revoking %s)\n",
+		     ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps),
+		     ceph_cap_string(revoking));
+		if (revoking & used & CEPH_CAP_FILE_BUFFER)
+			writeback = 1; /* initiate writeback; will delay ack */
+		else if (revoking == CEPH_CAP_FILE_CACHE &&
+			 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
+			 queue_invalidate)
+			; /* do nothing yet, invalidation will be queued */
+		else if (cap == ci->i_auth_cap)
+			check_caps = 1; /* check auth cap only */
+		else
+			check_caps = 2; /* check all caps */
 		cap->issued = newcaps;
 		cap->implemented |= newcaps;
 	} else if (cap->issued == newcaps) {
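The revocation test above is pure bitmask arithmetic: the bits being revoked are exactly those set in the currently issued caps but absent from the new grant. A quick illustration with made-up bit values (the real masks are built from the CEPH_CAP_G* generics in ceph_fs.h):

    #include <stdio.h>

    /* illustrative cap bits; the real values come from shifting the
     * CEPH_CAP_G* generics by CEPH_CAP_SFILE in ceph_fs.h */
    #define FILE_CACHE  0x1
    #define FILE_RD     0x2
    #define FILE_WR     0x4
    #define FILE_BUFFER 0x8

    int main(void)
    {
        int issued  = FILE_CACHE | FILE_RD | FILE_WR | FILE_BUFFER;
        int newcaps = FILE_CACHE | FILE_RD;      /* new MDS grant */

        int revoking = issued & ~newcaps;        /* bits being taken away */

        printf("revoking = 0x%x\n", revoking);   /* 0xc: WR|BUFFER */
        if (revoking & FILE_BUFFER)
            printf("must write back buffered data before acking\n");
        return 0;
    }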
@@ -2427,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
@@ -2467,6 +2495,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 		dout(" inode %p now clean\n", inode);
 		BUG_ON(!list_empty(&ci->i_dirty_item));
 		drop = 1;
+		if (ci->i_wrbuffer_ref_head == 0) {
+			BUG_ON(!ci->i_head_snapc);
+			ceph_put_snap_context(ci->i_head_snapc);
+			ci->i_head_snapc = NULL;
+		}
 	} else {
 		BUG_ON(list_empty(&ci->i_dirty_item));
 	}
@@ -2568,7 +2601,8 @@ static void handle_cap_trunc(struct inode *inode,
  * caller holds s_mutex
  */
 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
-			      struct ceph_mds_session *session)
+			      struct ceph_mds_session *session,
+			      int *open_target_sessions)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
@@ -2600,6 +2634,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 		ci->i_cap_exporting_mds = mds;
 		ci->i_cap_exporting_mseq = mseq;
 		ci->i_cap_exporting_issued = cap->issued;
+
+		/*
+		 * make sure we have open sessions with all possible
+		 * export targets, so that we get the matching IMPORT
+		 */
+		*open_target_sessions = 1;
 		}
 		__ceph_remove_cap(cap);
 	}
@@ -2663,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
 	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
 	struct ceph_cap *cap;
 	struct ceph_mds_caps *h;
@@ -2675,6 +2715,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	u64 size, max_size;
 	u64 tid;
 	void *snaptrace;
+	size_t snaptrace_len;
+	void *flock;
+	u32 flock_len;
+	int open_target_sessions = 0;
 
 	dout("handle_caps from mds%d\n", mds);
 
@@ -2683,7 +2727,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	if (msg->front.iov_len < sizeof(*h))
 		goto bad;
 	h = msg->front.iov_base;
-	snaptrace = h + 1;
 	op = le32_to_cpu(h->op);
 	vino.ino = le64_to_cpu(h->ino);
 	vino.snap = CEPH_NOSNAP;
@@ -2693,6 +2736,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	size = le64_to_cpu(h->size);
 	max_size = le64_to_cpu(h->max_size);
 
+	snaptrace = h + 1;
+	snaptrace_len = le32_to_cpu(h->snap_trace_len);
+
+	if (le16_to_cpu(msg->hdr.version) >= 2) {
+		void *p, *end;
+
+		p = snaptrace + snaptrace_len;
+		end = msg->front.iov_base + msg->front.iov_len;
+		ceph_decode_32_safe(&p, end, flock_len, bad);
+		flock = p;
+	} else {
+		flock = NULL;
+		flock_len = 0;
+	}
+
 	mutex_lock(&session->s_mutex);
 	session->s_seq++;
 	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
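The hdr.version check added above is the usual forward-compatibility pattern for wire decoding: older peers simply omit the trailing flock section, and every read is bounds-checked against the end of the buffer before it happens. A hedged user-space analogue of that bounds-checked decode (ceph_decode_32_safe itself is a kernel macro; this only imitates its behavior, and assumes a little-endian host for brevity):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* analogue of ceph_decode_32_safe: fail instead of reading past 'end' */
    static int decode_32_safe(const uint8_t **p, const uint8_t *end, uint32_t *v)
    {
        if (end - *p < (ptrdiff_t)sizeof(*v))
            return -1;              /* truncated message */
        memcpy(v, *p, sizeof(*v));  /* wire data is little-endian */
        *p += sizeof(*v);
        return 0;
    }

    int main(void)
    {
        uint8_t buf[] = { 0x02, 0x00, 0x00, 0x00 };  /* flock_len = 2 */
        const uint8_t *p = buf, *end = buf + sizeof(buf);
        uint32_t flock_len;

        if (decode_32_safe(&p, end, &flock_len) == 0)
            printf("flock_len = %u\n", flock_len);
        else
            printf("bad message\n");
        return 0;
    }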
@@ -2708,15 +2766,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		if (op == CEPH_CAP_OP_IMPORT)
 			__queue_cap_release(session, vino.ino, cap_id,
 					    mseq, seq);
-
-		/*
-		 * send any full release message to try to move things
-		 * along for the mds (who clearly thinks we still have this
-		 * cap).
-		 */
-		ceph_add_cap_releases(mdsc, session, -1);
-		ceph_send_cap_releases(mdsc, session);
-		goto done;
+		goto flush_cap_releases;
 	}
 
 	/* these will work even if we don't have a cap yet */
@@ -2726,12 +2776,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		goto done;
 
 	case CEPH_CAP_OP_EXPORT:
-		handle_cap_export(inode, h, session);
+		handle_cap_export(inode, h, session, &open_target_sessions);
 		goto done;
 
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
-				  snaptrace, le32_to_cpu(h->snap_trace_len));
+				  snaptrace, snaptrace_len);
 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
 				session);
 		goto done_unlocked;
@@ -2744,7 +2794,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		dout(" no cap on %p ino %llx.%llx from mds%d\n",
 		     inode, ceph_ino(inode), ceph_snap(inode), mds);
 		spin_unlock(&inode->i_lock);
-		goto done;
+		goto flush_cap_releases;
 	}
 
 	/* note that each of these drops i_lock for us */
@@ -2768,11 +2818,24 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		       ceph_cap_op_name(op));
 	}
 
+	goto done;
+
+flush_cap_releases:
+	/*
+	 * send any full release message to try to move things
+	 * along for the mds (who clearly thinks we still have this
+	 * cap).
+	 */
+	ceph_add_cap_releases(mdsc, session);
+	ceph_send_cap_releases(mdsc, session);
+
 done:
 	mutex_unlock(&session->s_mutex);
 done_unlocked:
 	if (inode)
 		iput(inode);
+	if (open_target_sessions)
+		ceph_mdsc_open_export_target_sessions(mdsc, session);
 	return;
 
 bad:
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
deleted file mode 100644
index 1818c230561..00000000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _FS_CEPH_DEBUG_H
-#define _FS_CEPH_DEBUG_H
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
-
-/*
- * wrap pr_debug to include a filename:lineno prefix on each line.
- * this incurs some overhead (kernel size and execution time) due to
- * the extra function call at each call site.
- */
-
-# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
-extern const char *ceph_file_part(const char *s, int len);
-# define dout(fmt, ...) \
-	pr_debug(" %12.12s:%-4d : " fmt, \
-		 ceph_file_part(__FILE__, sizeof(__FILE__)), \
-		 __LINE__, ##__VA_ARGS__)
-# else
-/* faux printk call just to see any compiler warnings. */
-# define dout(fmt, ...) do { \
-		if (0) \
-			printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
-	} while (0)
-# endif
-
-#else
-
-/*
- * or, just wrap pr_debug
- */
-# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
-
-#endif
-
-#endif
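This header is deleted from fs/ceph as part of the move to a shared tree (the new <linux/ceph/...> include path in ceph_frag.c below points the same way). The dout() macro it defined only wraps pr_debug; to see what the PRETTYDEBUG variant adds, here is a hedged user-space imitation of its filename:lineno prefixing, with printf standing in for pr_debug and a simplified file_part() that may differ from the kernel's ceph_file_part():

    #include <stdio.h>

    /* user-space analogue: return the trailing path component of __FILE__ */
    static const char *file_part(const char *s, int len)
    {
        const char *e = s + len - 1;   /* len includes the NUL (sizeof) */

        while (e > s && *(e - 1) != '/')
            e--;
        return e;
    }

    /* imitation of the CONFIG_CEPH_FS_PRETTYDEBUG dout() prefix */
    #define dout(fmt, ...) \
        printf(" %12.12s:%-4d : " fmt, \
               file_part(__FILE__, sizeof(__FILE__)), __LINE__, ##__VA_ARGS__)

    int main(void)
    {
        dout("put_cap %p\n", (void *)0);
        return 0;
    }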
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c409..bdce8b1fbd0 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
 /*
  * Ceph 'frag' type
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
 int ceph_frag_compare(__u32 a, __u32 b)
 {
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
deleted file mode 100644
index 793f50cb7c2..00000000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef _FS_CEPH_FRAG_H
-#define _FS_CEPH_FRAG_H
-
-/*
- * "Frags" are a way to describe a subset of a 32-bit number space,
- * using a mask and a value to match against that mask.  Any given frag
- * (subset of the number space) can be partitioned into 2^n sub-frags.
- *
- * Frags are encoded into a 32-bit word:
- *   8 upper bits = "bits"
- *  24 lower bits = "value"
- * (We could go to 5+27 bits, but who cares.)
- *
- * We use the _most_ significant bits of the 24 bit value.  This makes
- * values logically sort.
- *
- * Unfortunately, because the "bits" field is still in the high bits, we
- * can't sort encoded frags numerically.  However, it does allow you
- * to feed encoded frags as values into frag_contains_value.
- */
-static inline __u32 ceph_frag_make(__u32 b, __u32 v)
-{
-	return (b << 24) |
-		(v & (0xffffffu << (24-b)) & 0xffffffu);
-}
-static inline __u32 ceph_frag_bits(__u32 f)
-{
-	return f >> 24;
-}
-static inline __u32 ceph_frag_value(__u32 f)
-{
-	return f & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask(__u32 f)
-{
-	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask_shift(__u32 f)
-{
-	return 24 - ceph_frag_bits(f);
-}
-
-static inline int ceph_frag_contains_value(__u32 f, __u32 v)
-{
-	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
-{
-	/* is sub as specific as us, and contained by us? */
-	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
-	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-
-static inline __u32 ceph_frag_parent(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f) - 1,
-			 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
-}
-static inline int ceph_frag_is_left_child(__u32 f)
-{
-	return ceph_frag_bits(f) > 0 &&
-		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
-}
-static inline int ceph_frag_is_right_child(__u32 f)
-{
-	return ceph_frag_bits(f) > 0 &&
-		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
-}
-static inline __u32 ceph_frag_sibling(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f),
-		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
-}
-static inline __u32 ceph_frag_left_child(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
-}
-static inline __u32 ceph_frag_right_child(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f)+1,
-	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
-}
-static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
-{
-	int newbits = ceph_frag_bits(f) + by;
-	return ceph_frag_make(newbits,
-			 ceph_frag_value(f) | (i << (24 - newbits)));
-}
-static inline int ceph_frag_is_leftmost(__u32 f)
-{
-	return ceph_frag_value(f) == 0;
-}
-static inline int ceph_frag_is_rightmost(__u32 f)
-{
-	return ceph_frag_value(f) == ceph_frag_mask(f);
-}
-static inline __u32 ceph_frag_next(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f),
-			ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
-}
-
-/*
- * comparator to sort frags logically, as when traversing the
- * number space in ascending order...
- */
-int ceph_frag_compare(__u32 a, __u32 b);
-
-#endif
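The frag encoding described at the top of the deleted header is easy to check by hand: bits=1, value=0 covers the lower half of the 24-bit value space, and bits=1, value=0x800000 covers the upper half. A small demo using copies of the inline helpers above (with __u32 swapped for uint32_t so it builds in user space):

    #include <stdint.h>
    #include <stdio.h>

    /* copies of the header's helpers, __u32 -> uint32_t */
    static uint32_t frag_make(uint32_t b, uint32_t v)
    {
        return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
    }
    static uint32_t frag_bits(uint32_t f)  { return f >> 24; }
    static uint32_t frag_value(uint32_t f) { return f & 0xffffffu; }
    static uint32_t frag_mask(uint32_t f)
    {
        return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
    }
    static int frag_contains_value(uint32_t f, uint32_t v)
    {
        return (v & frag_mask(f)) == frag_value(f);
    }

    int main(void)
    {
        uint32_t left = frag_make(1, 0);          /* lower half */
        uint32_t right = frag_make(1, 0x800000);  /* upper half */

        printf("left  = 0x%08x\n", left);         /* 0x01000000 */
        printf("right = 0x%08x\n", right);        /* 0x01800000 */
        printf("0x400000 in left?  %d\n", frag_contains_value(left, 0x400000));
        printf("0x400000 in right? %d\n", frag_contains_value(right, 0x400000));
        return 0;
    }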
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
deleted file mode 100644
index 79d76bc4303..00000000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Some non-inline ceph helpers
- */
-#include "types.h"
-
-/*
- * return true if @layout appears to be valid
- */
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
-{
-	__u32 su = le32_to_cpu(layout->fl_stripe_unit);
-	__u32 sc = le32_to_cpu(layout->fl_stripe_count);
-	__u32 os = le32_to_cpu(layout->fl_object_size);
-
-	/* stripe unit, object size must be non-zero, 64k increment */
-	if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
-		return 0;
-	if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
-		return 0;
-	/* object size must be a multiple of stripe unit */
-	if (os < su || os % su)
-		return 0;
-	/* stripe count must be non-zero */
-	if (!sc)
-		return 0;
-	return 1;
-}
-
-
-int ceph_flags_to_mode(int flags)
-{
-#ifdef O_DIRECTORY  /* fixme */
-	if ((flags & O_DIRECTORY) == O_DIRECTORY)
-		return CEPH_FILE_MODE_PIN;
-#endif
-#ifdef O_LAZY
-	if (flags & O_LAZY)
-		return CEPH_FILE_MODE_LAZY;
-#endif
-	if ((flags & O_APPEND) == O_APPEND)
-		flags |= O_WRONLY;
-
-	flags &= O_ACCMODE;
-	if ((flags & O_RDWR) == O_RDWR)
-		return CEPH_FILE_MODE_RDWR;
-	if ((flags & O_WRONLY) == O_WRONLY)
-		return CEPH_FILE_MODE_WR;
-	return CEPH_FILE_MODE_RD;
-}
-
-int ceph_caps_for_mode(int mode)
-{
-	switch (mode) {
-	case CEPH_FILE_MODE_PIN:
-		return CEPH_CAP_PIN;
-	case CEPH_FILE_MODE_RD:
-		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
-			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
-	case CEPH_FILE_MODE_RDWR:
-		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
-			CEPH_CAP_FILE_EXCL |
-			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
-			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
-			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
-			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
-	case CEPH_FILE_MODE_WR:
-		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
-			CEPH_CAP_FILE_EXCL |
-			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
-			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
-			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
-	}
-	return 0;
-}
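ceph_flags_to_mode() collapses the open(2) flag space down to the small set of CEPH_FILE_MODE_* values, which ceph_caps_for_mode() then expands into a cap wish list. A hedged sketch of the same mapping with simplified stand-in constants (not the real CEPH_FILE_MODE_* values):

    #include <fcntl.h>
    #include <stdio.h>

    /* simplified stand-ins for the CEPH_FILE_MODE_* constants */
    enum { MODE_PIN, MODE_RD, MODE_WR, MODE_RDWR };

    static int flags_to_mode(int flags)
    {
        if ((flags & O_APPEND) == O_APPEND)
            flags |= O_WRONLY;      /* append implies write intent */

        flags &= O_ACCMODE;
        if ((flags & O_RDWR) == O_RDWR)
            return MODE_RDWR;
        if ((flags & O_WRONLY) == O_WRONLY)
            return MODE_WR;
        return MODE_RD;
    }

    int main(void)
    {
        printf("O_RDONLY          -> %d\n", flags_to_mode(O_RDONLY));
        printf("O_WRONLY|O_APPEND -> %d\n", flags_to_mode(O_WRONLY | O_APPEND));
        printf("O_RDWR            -> %d\n", flags_to_mode(O_RDWR));
        return 0;
    }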
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
deleted file mode 100644
index 2fa992eaf7d..00000000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,705 +0,0 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 20
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific messages types or high-level
34 * protocols change, bump the affected components. we keep rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_UID 1
57#define CEPH_FEATURE_NOSRCADDR 2
58#define CEPH_FEATURE_FLOCK 4
59
60#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
61#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
62#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
63#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
64#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
65#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
66#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
67#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
68
69
70/*
71 * ceph_file_layout - describe data layout for a file/inode
72 */
73struct ceph_file_layout {
74 /* file -> object mapping */
75 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
76 of page size. */
77 __le32 fl_stripe_count; /* over this many objects */
78 __le32 fl_object_size; /* until objects are this big, then move to
79 new objects */
80 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
81
82 /* pg -> disk layout */
83 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
84
85 /* object -> pg layout */
86 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
87 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
88} __attribute__ ((packed));
89
90#define CEPH_MIN_STRIPE_UNIT 65536
91
92int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
93
94
95/* crypto algorithms */
96#define CEPH_CRYPTO_NONE 0x0
97#define CEPH_CRYPTO_AES 0x1
98
99/* security/authentication protocols */
100#define CEPH_AUTH_UNKNOWN 0x0
101#define CEPH_AUTH_NONE 0x1
102#define CEPH_AUTH_CEPHX 0x2
103
104#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
105
106
107/*********************************************
108 * message layer
109 */
110
111/*
112 * message types
113 */
114
115/* misc */
116#define CEPH_MSG_SHUTDOWN 1
117#define CEPH_MSG_PING 2
118
119/* client <-> monitor */
120#define CEPH_MSG_MON_MAP 4
121#define CEPH_MSG_MON_GET_MAP 5
122#define CEPH_MSG_STATFS 13
123#define CEPH_MSG_STATFS_REPLY 14
124#define CEPH_MSG_MON_SUBSCRIBE 15
125#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
126#define CEPH_MSG_AUTH 17
127#define CEPH_MSG_AUTH_REPLY 18
128
129/* client <-> mds */
130#define CEPH_MSG_MDS_MAP 21
131
132#define CEPH_MSG_CLIENT_SESSION 22
133#define CEPH_MSG_CLIENT_RECONNECT 23
134
135#define CEPH_MSG_CLIENT_REQUEST 24
136#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
137#define CEPH_MSG_CLIENT_REPLY 26
138#define CEPH_MSG_CLIENT_CAPS 0x310
139#define CEPH_MSG_CLIENT_LEASE 0x311
140#define CEPH_MSG_CLIENT_SNAP 0x312
141#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
142
143/* pool ops */
144#define CEPH_MSG_POOLOP_REPLY 48
145#define CEPH_MSG_POOLOP 49
146
147
148/* osd */
149#define CEPH_MSG_OSD_MAP 41
150#define CEPH_MSG_OSD_OP 42
151#define CEPH_MSG_OSD_OPREPLY 43
152
153/* pool operations */
154enum {
155 POOL_OP_CREATE = 0x01,
156 POOL_OP_DELETE = 0x02,
157 POOL_OP_AUID_CHANGE = 0x03,
158 POOL_OP_CREATE_SNAP = 0x11,
159 POOL_OP_DELETE_SNAP = 0x12,
160 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
161 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
162};
163
164struct ceph_mon_request_header {
165 __le64 have_version;
166 __le16 session_mon;
167 __le64 session_mon_tid;
168} __attribute__ ((packed));
169
170struct ceph_mon_statfs {
171 struct ceph_mon_request_header monhdr;
172 struct ceph_fsid fsid;
173} __attribute__ ((packed));
174
175struct ceph_statfs {
176 __le64 kb, kb_used, kb_avail;
177 __le64 num_objects;
178} __attribute__ ((packed));
179
180struct ceph_mon_statfs_reply {
181 struct ceph_fsid fsid;
182 __le64 version;
183 struct ceph_statfs st;
184} __attribute__ ((packed));
185
186const char *ceph_pool_op_name(int op);
187
188struct ceph_mon_poolop {
189 struct ceph_mon_request_header monhdr;
190 struct ceph_fsid fsid;
191 __le32 pool;
192 __le32 op;
193 __le64 auid;
194 __le64 snapid;
195 __le32 name_len;
196} __attribute__ ((packed));
197
198struct ceph_mon_poolop_reply {
199 struct ceph_mon_request_header monhdr;
200 struct ceph_fsid fsid;
201 __le32 reply_code;
202 __le32 epoch;
203 char has_data;
204 char data[0];
205} __attribute__ ((packed));
206
207struct ceph_mon_unmanaged_snap {
208 __le64 snapid;
209} __attribute__ ((packed));
210
211struct ceph_osd_getmap {
212 struct ceph_mon_request_header monhdr;
213 struct ceph_fsid fsid;
214 __le32 start;
215} __attribute__ ((packed));
216
217struct ceph_mds_getmap {
218 struct ceph_mon_request_header monhdr;
219 struct ceph_fsid fsid;
220} __attribute__ ((packed));
221
222struct ceph_client_mount {
223 struct ceph_mon_request_header monhdr;
224} __attribute__ ((packed));
225
226struct ceph_mon_subscribe_item {
227 __le64 have_version; __le64 have;
228 __u8 onetime;
229} __attribute__ ((packed));
230
231struct ceph_mon_subscribe_ack {
232 __le32 duration; /* seconds */
233 struct ceph_fsid fsid;
234} __attribute__ ((packed));
235
236/*
237 * mds states
238 * > 0 -> in
239 * <= 0 -> out
240 */
241#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
242#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
243 empty log. */
244#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
245#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
246#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
247#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
248#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
249
250#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
251#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
252 operations (import, rename, etc.) */
253#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
254#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
255#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
256#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
257#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
258
259extern const char *ceph_mds_state_name(int s);
260
261
262/*
263 * metadata lock types.
264 * - these are bitmasks.. we can compose them
265 * - they also define the lock ordering by the MDS
266 * - a few of these are internal to the mds
267 */
268#define CEPH_LOCK_DVERSION 1
269#define CEPH_LOCK_DN 2
270#define CEPH_LOCK_ISNAP 16
271#define CEPH_LOCK_IVERSION 32 /* mds internal */
272#define CEPH_LOCK_IFILE 64
273#define CEPH_LOCK_IAUTH 128
274#define CEPH_LOCK_ILINK 256
275#define CEPH_LOCK_IDFT 512 /* dir frag tree */
276#define CEPH_LOCK_INEST 1024 /* mds internal */
277#define CEPH_LOCK_IXATTR 2048
278#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
279
280/* client_session ops */
281enum {
282 CEPH_SESSION_REQUEST_OPEN,
283 CEPH_SESSION_OPEN,
284 CEPH_SESSION_REQUEST_CLOSE,
285 CEPH_SESSION_CLOSE,
286 CEPH_SESSION_REQUEST_RENEWCAPS,
287 CEPH_SESSION_RENEWCAPS,
288 CEPH_SESSION_STALE,
289 CEPH_SESSION_RECALL_STATE,
290};
291
292extern const char *ceph_session_op_name(int op);
293
294struct ceph_mds_session_head {
295 __le32 op;
296 __le64 seq;
297 struct ceph_timespec stamp;
298 __le32 max_caps, max_leases;
299} __attribute__ ((packed));
300
301/* client_request */
302/*
303 * metadata ops.
304 * & 0x001000 -> write op
305 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
306 & & 0x100000 -> use weird ino/path trace
307 */
308#define CEPH_MDS_OP_WRITE 0x001000
309enum {
310 CEPH_MDS_OP_LOOKUP = 0x00100,
311 CEPH_MDS_OP_GETATTR = 0x00101,
312 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
313 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
314
315 CEPH_MDS_OP_SETXATTR = 0x01105,
316 CEPH_MDS_OP_RMXATTR = 0x01106,
317 CEPH_MDS_OP_SETLAYOUT = 0x01107,
318 CEPH_MDS_OP_SETATTR = 0x01108,
319
320 CEPH_MDS_OP_MKNOD = 0x01201,
321 CEPH_MDS_OP_LINK = 0x01202,
322 CEPH_MDS_OP_UNLINK = 0x01203,
323 CEPH_MDS_OP_RENAME = 0x01204,
324 CEPH_MDS_OP_MKDIR = 0x01220,
325 CEPH_MDS_OP_RMDIR = 0x01221,
326 CEPH_MDS_OP_SYMLINK = 0x01222,
327
328 CEPH_MDS_OP_CREATE = 0x01301,
329 CEPH_MDS_OP_OPEN = 0x00302,
330 CEPH_MDS_OP_READDIR = 0x00305,
331
332 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
333 CEPH_MDS_OP_MKSNAP = 0x01400,
334 CEPH_MDS_OP_RMSNAP = 0x01401,
335 CEPH_MDS_OP_LSSNAP = 0x00402,
336};
337
338extern const char *ceph_mds_op_name(int op);
339
340
341#define CEPH_SETATTR_MODE 1
342#define CEPH_SETATTR_UID 2
343#define CEPH_SETATTR_GID 4
344#define CEPH_SETATTR_MTIME 8
345#define CEPH_SETATTR_ATIME 16
346#define CEPH_SETATTR_SIZE 32
347#define CEPH_SETATTR_CTIME 64
348
349union ceph_mds_request_args {
350 struct {
351 __le32 mask; /* CEPH_CAP_* */
352 } __attribute__ ((packed)) getattr;
353 struct {
354 __le32 mode;
355 __le32 uid;
356 __le32 gid;
357 struct ceph_timespec mtime;
358 struct ceph_timespec atime;
359 __le64 size, old_size; /* old_size needed by truncate */
360 __le32 mask; /* CEPH_SETATTR_* */
361 } __attribute__ ((packed)) setattr;
362 struct {
363 __le32 frag; /* which dir fragment */
364 __le32 max_entries; /* how many dentries to grab */
365 __le32 max_bytes;
366 } __attribute__ ((packed)) readdir;
367 struct {
368 __le32 mode;
369 __le32 rdev;
370 } __attribute__ ((packed)) mknod;
371 struct {
372 __le32 mode;
373 } __attribute__ ((packed)) mkdir;
374 struct {
375 __le32 flags;
376 __le32 mode;
377 __le32 stripe_unit; /* layout for newly created file */
378 __le32 stripe_count; /* ... */
379 __le32 object_size;
380 __le32 file_replication;
381 __le32 preferred;
382 } __attribute__ ((packed)) open;
383 struct {
384 __le32 flags;
385 } __attribute__ ((packed)) setxattr;
386 struct {
387 struct ceph_file_layout layout;
388 } __attribute__ ((packed)) setlayout;
389} __attribute__ ((packed));
390
391#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
392#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
393
394struct ceph_mds_request_head {
395 __le64 oldest_client_tid;
396 __le32 mdsmap_epoch; /* on client */
397 __le32 flags; /* CEPH_MDS_FLAG_* */
398 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
399 __le16 num_releases; /* # include cap/lease release records */
400 __le32 op; /* mds op code */
401 __le32 caller_uid, caller_gid;
402 __le64 ino; /* use this ino for openc, mkdir, mknod,
403 etc. (if replaying) */
404 union ceph_mds_request_args args;
405} __attribute__ ((packed));
406
407/* cap/lease release record */
408struct ceph_mds_request_release {
409 __le64 ino, cap_id; /* ino and unique cap id */
410 __le32 caps, wanted; /* new issued, wanted */
411 __le32 seq, issue_seq, mseq;
412 __le32 dname_seq; /* if releasing a dentry lease, a */
413 __le32 dname_len; /* string follows. */
414} __attribute__ ((packed));
415
416/* client reply */
417struct ceph_mds_reply_head {
418 __le32 op;
419 __le32 result;
420 __le32 mdsmap_epoch;
421 __u8 safe; /* true if committed to disk */
422 __u8 is_dentry, is_target; /* true if dentry, target inode records
423 are included with reply */
424} __attribute__ ((packed));
425
426/* one for each node split */
427struct ceph_frag_tree_split {
428 __le32 frag; /* this frag splits... */
429 __le32 by; /* ...by this many bits */
430} __attribute__ ((packed));
431
432struct ceph_frag_tree_head {
433 __le32 nsplits; /* num ceph_frag_tree_split records */
434 struct ceph_frag_tree_split splits[];
435} __attribute__ ((packed));
436
437/* capability issue, for bundling with mds reply */
438struct ceph_mds_reply_cap {
439 __le32 caps, wanted; /* caps issued, wanted */
440 __le64 cap_id;
441 __le32 seq, mseq;
442 __le64 realm; /* snap realm */
443 __u8 flags; /* CEPH_CAP_FLAG_* */
444} __attribute__ ((packed));
445
446#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
447
448/* inode record, for bundling with mds reply */
449struct ceph_mds_reply_inode {
450 __le64 ino;
451 __le64 snapid;
452 __le32 rdev;
453 __le64 version; /* inode version */
454 __le64 xattr_version; /* version for xattr blob */
455 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
456 struct ceph_file_layout layout;
457 struct ceph_timespec ctime, mtime, atime;
458 __le32 time_warp_seq;
459 __le64 size, max_size, truncate_size;
460 __le32 truncate_seq;
461 __le32 mode, uid, gid;
462 __le32 nlink;
463 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
464 struct ceph_timespec rctime;
465 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
466} __attribute__ ((packed));
467/* followed by frag array, then symlink string, then xattr blob */
468
469/* reply_lease follows dname, and reply_inode */
470struct ceph_mds_reply_lease {
471 __le16 mask; /* lease type(s) */
472 __le32 duration_ms; /* lease duration */
473 __le32 seq;
474} __attribute__ ((packed));
475
476struct ceph_mds_reply_dirfrag {
477 __le32 frag; /* fragment */
478 __le32 auth; /* auth mds, if this is a delegation point */
479 __le32 ndist; /* number of mds' this is replicated on */
480 __le32 dist[];
481} __attribute__ ((packed));
482
483/* file access modes */
484#define CEPH_FILE_MODE_PIN 0
485#define CEPH_FILE_MODE_RD 1
486#define CEPH_FILE_MODE_WR 2
487#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
488#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
489#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
490
491int ceph_flags_to_mode(int flags);
492
493
494/* capability bits */
495#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
496
497/* generic cap bits */
498#define CEPH_CAP_GSHARED 1 /* client can reads */
499#define CEPH_CAP_GEXCL 2 /* client can read and update */
500#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
501#define CEPH_CAP_GRD 8 /* (file) client can read */
502#define CEPH_CAP_GWR 16 /* (file) client can write */
503#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
504#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
505#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
506
507/* per-lock shift */
508#define CEPH_CAP_SAUTH 2
509#define CEPH_CAP_SLINK 4
510#define CEPH_CAP_SXATTR 6
511#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
512
513#define CEPH_CAP_BITS 16
514
515/* composed values */
516#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
517#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
518#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
519#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
520#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
521#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
522#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
523#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
524#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
525#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
526#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
527#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
528#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
529#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
530#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
531
532/* cap masks (for getattr) */
533#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
534#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
535#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
536#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
537#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
538#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
539#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
540#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
541#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
542#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
543#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
544#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
545#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
546 CEPH_CAP_AUTH_SHARED | \
547 CEPH_CAP_LINK_SHARED | \
548 CEPH_CAP_FILE_SHARED | \
549 CEPH_CAP_XATTR_SHARED)
550
551#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
552 CEPH_CAP_LINK_SHARED | \
553 CEPH_CAP_XATTR_SHARED | \
554 CEPH_CAP_FILE_SHARED)
555#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
556 CEPH_CAP_FILE_CACHE)
557
558#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
559 CEPH_CAP_LINK_EXCL | \
560 CEPH_CAP_XATTR_EXCL | \
561 CEPH_CAP_FILE_EXCL)
562#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
563 CEPH_CAP_FILE_EXCL)
564#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
565#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
566 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
567
568#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
569 CEPH_LOCK_IXATTR)
570
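/*
 * Worked example of the composition above (arithmetic only, nothing
 * new): each lock gets a small field inside the cap word, with the
 * file lock getting the widest field at shift 8. So
 *
 *	CEPH_CAP_AUTH_SHARED = 1 << 2   = 0x0004
 *	CEPH_CAP_LINK_EXCL   = 2 << 4   = 0x0020
 *	CEPH_CAP_FILE_SHARED = 1 << 8   = 0x0100
 *	CEPH_CAP_FILE_LAZYIO = 128 << 8 = 0x8000
 *
 * and a client holding "pin + file shared/read/cache" carries
 * CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
 * CEPH_CAP_FILE_CACHE == 1 | 0x100 | 0x800 | 0x400 == 0xd01.
 */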
571int ceph_caps_for_mode(int mode);
572
573enum {
574 CEPH_CAP_OP_GRANT, /* mds->client grant */
575 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
576 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
577 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
578 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
579 CEPH_CAP_OP_UPDATE, /* client->mds update */
580 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
581 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
582 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
583 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
584 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
585 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
586 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
587};
588
589extern const char *ceph_cap_op_name(int op);
590
591/*
592 * caps message, used for capability callbacks, acks, requests, etc.
593 */
594struct ceph_mds_caps {
595 __le32 op; /* CEPH_CAP_OP_* */
596 __le64 ino, realm;
597 __le64 cap_id;
598 __le32 seq, issue_seq;
599 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
600 __le32 migrate_seq;
601 __le64 snap_follows;
602 __le32 snap_trace_len;
603
604 /* authlock */
605 __le32 uid, gid, mode;
606
607 /* linklock */
608 __le32 nlink;
609
610 /* xattrlock */
611 __le32 xattr_len;
612 __le64 xattr_version;
613
614 /* filelock */
615 __le64 size, max_size, truncate_size;
616 __le32 truncate_seq;
617 struct ceph_timespec mtime, atime, ctime;
618 struct ceph_file_layout layout;
619 __le32 time_warp_seq;
620} __attribute__ ((packed));
621
622/* cap release msg head */
623struct ceph_mds_cap_release {
624 __le32 num; /* number of cap_items that follow */
625} __attribute__ ((packed));
626
627struct ceph_mds_cap_item {
628 __le64 ino;
629 __le64 cap_id;
630 __le32 migrate_seq, seq;
631} __attribute__ ((packed));
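/*
 * Wire layout sketch for a release message (follows directly from the
 * two structs above): one head, then 'num' items back to back.
 *
 *	[ num = 2 ][ ino,cap_id,migrate_seq,seq ][ ino,cap_id,... ]
 */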
632
633#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
634#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
635#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
636#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
637
638extern const char *ceph_lease_op_name(int o);
639
640/* lease msg header */
641struct ceph_mds_lease {
642 __u8 action; /* CEPH_MDS_LEASE_* */
643 __le16 mask; /* which lease */
644 __le64 ino;
645 __le64 first, last; /* snap range */
646 __le32 seq;
647 __le32 duration_ms; /* duration of renewal */
648} __attribute__ ((packed));
649/* followed by a __le32+string for dname */
650
651/* client reconnect */
652struct ceph_mds_cap_reconnect {
653 __le64 cap_id;
654 __le32 wanted;
655 __le32 issued;
656 __le64 size;
657 struct ceph_timespec mtime, atime;
658 __le64 snaprealm;
659 __le64 pathbase; /* base ino for our path to this ino */
660} __attribute__ ((packed));
661/* followed by encoded string */
662
663struct ceph_mds_snaprealm_reconnect {
664 __le64 ino; /* snap realm base */
665 __le64 seq; /* snap seq for this snap realm */
666 __le64 parent; /* parent realm */
667} __attribute__ ((packed));
668
669/*
670 * snaps
671 */
672enum {
673 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
674 CEPH_SNAP_OP_CREATE,
675 CEPH_SNAP_OP_DESTROY,
676 CEPH_SNAP_OP_SPLIT,
677};
678
679extern const char *ceph_snap_op_name(int o);
680
681/* snap msg header */
682struct ceph_mds_snap_head {
683 __le32 op; /* CEPH_SNAP_OP_* */
684 __le64 split; /* ino to split off, if any */
685 __le32 num_split_inos; /* # inos belonging to new child realm */
686 __le32 num_split_realms; /* # child realms under new child realm */
687 __le32 trace_len; /* size of snap trace blob */
688} __attribute__ ((packed));
689/* followed by split ino list, then split realms, then the trace blob */
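/*
 * Decode sketch for that trailing layout (illustrative pointer
 * arithmetic only; the actual parsing lives in the client's snap
 * handling code):
 *
 *	struct ceph_mds_snap_head *h = p;
 *	__le64 *split_inos = (__le64 *)(h + 1);
 *	__le64 *split_realms = split_inos + le32_to_cpu(h->num_split_inos);
 *	void *trace = split_realms + le32_to_cpu(h->num_split_realms);
 */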
690
691/*
692 * encode info about a snaprealm, as viewed by a client
693 */
694struct ceph_mds_snap_realm {
695 __le64 ino; /* ino */
696 __le64 created; /* snap: when created */
697 __le64 parent; /* ino: parent realm */
698 __le64 parent_since; /* snap: same parent since */
699 __le64 seq; /* snap: version */
700 __le32 num_snaps;
701 __le32 num_prior_parent_snaps;
702} __attribute__ ((packed));
703/* followed by my snap list, then prior parent snap list */
704
705#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
deleted file mode 100644
index bd570015d14..00000000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
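Both hash functions take a (string, length) pair and are selected at runtime
through ceph_str_hash(). A minimal caller sketch (the dname variable is
hypothetical; the CEPH_STR_HASH_* selectors come from ceph_hash.h below):

	const char *dname = "mydir";
	unsigned h = ceph_str_hash(CEPH_STR_HASH_RJENKINS,
				   dname, strlen(dname));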
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
deleted file mode 100644
index 5ac470c433c..00000000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
deleted file mode 100644
index fabd302e577..00000000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
deleted file mode 100644
index dcd7e752370..00000000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers that identify the parent bucket of each
156 * device or bucket in the hierarchy. If an item appears more
157 * than once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
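To make the structures above concrete, here is a sketch of a hand-built
three-step replicated rule: take a root bucket, choose leaves, emit. This is
illustrative only (the mask fields and decoding are omitted, and the -1 root
id is a made-up example); real maps arrive pre-encoded from the monitors:

	struct crush_rule *rule = kmalloc(crush_rule_size(3), GFP_KERNEL);

	if (!rule)
		return -ENOMEM;
	rule->len = 3;
	rule->steps[0].op = CRUSH_RULE_TAKE;
	rule->steps[0].arg1 = -1;			/* root bucket id */
	rule->steps[1].op = CRUSH_RULE_CHOOSE_LEAF_FIRSTN;
	rule->steps[1].arg1 = CRUSH_CHOOSE_N;		/* numrep = result_max */
	rule->steps[1].arg2 = 1;			/* bucket type to descend to */
	rule->steps[2].op = CRUSH_RULE_EMIT;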
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
deleted file mode 100644
index 5873aed694b..00000000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
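The multi-argument variants let a caller fold several identifiers into one
stable draw. The usual pattern (mirrored by is_out() in mapper.c below) keeps
only the low 16 bits as a fixed-point fraction:

	/* stable pseudo-random draw in [0, 0xffff] for this (x, item) pair */
	__u32 draw = crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff;
	int is_in = (draw < weight);	/* weight: 16-bit fixed point, 0x10000 = 1.0 */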
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
deleted file mode 100644
index ff48e110e4b..00000000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
deleted file mode 100644
index a4eec133258..00000000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
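/*
 * Node-numbering example for the walk above: with num_nodes == 8 the
 * root is node 4 (num_nodes >> 1), its children are 2 and 6, and the
 * odd indices 1, 3, 5, 7 are terminal; a leaf n maps to
 * items[n >> 1], so leaf 5 selects items[2].
 */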
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
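/*
 * Intuition for the draw above: each item pulls a 16-bit
 * pseudo-random value scaled by its precomputed straws[i] length,
 * and the longest scaled straw wins. An item's chance of winning
 * grows with its straw factor while the other items' draws are
 * untouched, which is what gives straw buckets their optimal
 * add/remove stability.
 */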
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" of the cluster (failed, or
263 * fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
311
312 for (rep = outpos; rep < numrep; rep++) {
313 /* keep trying until we get a non-out, non-colliding item */
314 ftotal = 0;
315 skip_rep = 0;
316 do {
317 retry_descent = 0;
318 in = bucket; /* initial bucket */
319
320 /* choose through intervening buckets */
321 flocal = 0;
322 do {
323 collide = 0;
324 retry_bucket = 0;
325 r = rep;
326 if (in->alg == CRUSH_BUCKET_UNIFORM) {
327 /* be careful */
328 if (firstn || numrep >= in->size)
329 /* r' = r + f_total */
330 r += ftotal;
331 else if (in->size % numrep == 0)
332 /* r'=r+(n+1)*f_local */
333 r += (numrep+1) *
334 (flocal+ftotal);
335 else
336 /* r' = r + n*f_local */
337 r += numrep * (flocal+ftotal);
338 } else {
339 if (firstn)
340 /* r' = r + f_total */
341 r += ftotal;
342 else
343 /* r' = r + n*f_local */
344 r += numrep * (flocal+ftotal);
345 }
346
347 /* bucket choose */
348 if (in->size == 0) {
349 reject = 1;
350 goto reject;
351 }
352 if (flocal >= (in->size>>1) &&
353 flocal > orig_tries)
354 item = bucket_perm_choose(in, x, r);
355 else
356 item = crush_bucket_choose(in, x, r);
357 BUG_ON(item >= map->max_devices);
358
359 /* desired type? */
360 if (item < 0)
361 itemtype = map->buckets[-1-item]->type;
362 else
363 itemtype = 0;
364 dprintk(" item %d type %d\n", item, itemtype);
365
366 /* keep going? */
367 if (itemtype != type) {
368 BUG_ON(item >= 0 ||
369 (-1-item) >= map->max_buckets);
370 in = map->buckets[-1-item];
371 retry_bucket = 1;
372 continue;
373 }
374
375 /* collision? */
376 for (i = 0; i < outpos; i++) {
377 if (out[i] == item) {
378 collide = 1;
379 break;
380 }
381 }
382
383 reject = 0;
384 if (recurse_to_leaf) {
385 if (item < 0) {
386 if (crush_choose(map,
387 map->buckets[-1-item],
388 weight,
389 x, outpos+1, 0,
390 out2, outpos,
391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
402 /* out? */
403 if (itemtype == 0)
404 reject = is_out(map, weight,
405 item, x);
406 else
407 reject = 0;
408 }
409
410reject:
411 if (reject || collide) {
412 ftotal++;
413 flocal++;
414
415 if (collide && flocal < 3)
416 /* retry locally a few times */
417 retry_bucket = 1;
418 else if (flocal < in->size + orig_tries)
419 /* exhaustive bucket search */
420 retry_bucket = 1;
421 else if (ftotal < 20)
422 /* then retry descent */
423 retry_descent = 1;
424 else
425 /* else give up */
426 skip_rep = 1;
427 dprintk(" reject %d collide %d "
428 "ftotal %d flocal %d\n",
429 reject, collide, ftotal,
430 flocal);
431 }
432 } while (retry_bucket);
433 } while (retry_descent);
434
435 if (skip_rep) {
436 dprintk("skip rep\n");
437 continue;
438 }
439
440 dprintk("CHOOSE got %d\n", item);
441 out[outpos] = item;
442 outpos++;
443 }
444
445 dprintk("CHOOSE returns %d\n", outpos);
446 return outpos;
447}
448
449
450/**
451 * crush_do_rule - calculate a mapping with the given input and rule
452 * @map: the crush_map
453 * @ruleno: the rule id
454 * @x: hash input
455 * @result: pointer to result vector
456 * @result_max: maximum result size
457 * @force: force initial replica choice; -1 for none
458 */
459int crush_do_rule(struct crush_map *map,
460 int ruleno, int x, int *result, int result_max,
461 int force, __u32 *weight)
462{
463 int result_len;
464 int force_context[CRUSH_MAX_DEPTH];
465 int force_pos = -1;
466 int a[CRUSH_MAX_SET];
467 int b[CRUSH_MAX_SET];
468 int c[CRUSH_MAX_SET];
469 int recurse_to_leaf;
470 int *w;
471 int wsize = 0;
472 int *o;
473 int osize;
474 int *tmp;
475 struct crush_rule *rule;
476 int step;
477 int i, j;
478 int numrep;
479 int firstn;
480 int rc = -1;
481
482 BUG_ON(ruleno >= map->max_rules);
483
484 rule = map->rules[ruleno];
485 result_len = 0;
486 w = a;
487 o = b;
488
489 /*
490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0) {
495 if (force >= map->max_devices ||
496 map->device_parents[force] == 0) {
497 /*dprintk("CRUSH: forcefed device dne\n");*/
498 rc = -1; /* force fed device dne */
499 goto out;
500 }
501 if (!is_out(map, weight, force, x)) {
502 while (1) {
503 force_context[++force_pos] = force;
504 if (force >= 0)
505 force = map->device_parents[force];
506 else
507 force = map->bucket_parents[-1-force];
508 if (force == 0)
509 break;
510 }
511 }
512 }
513
514 for (step = 0; step < rule->len; step++) {
515 firstn = 0;
516 switch (rule->steps[step].op) {
517 case CRUSH_RULE_TAKE:
518 w[0] = rule->steps[step].arg1;
519 if (force_pos >= 0) {
520 BUG_ON(force_context[force_pos] != w[0]);
521 force_pos--;
522 }
523 wsize = 1;
524 break;
525
526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
527 case CRUSH_RULE_CHOOSE_FIRSTN:
528 firstn = 1;
529 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
530 case CRUSH_RULE_CHOOSE_INDEP:
531 BUG_ON(wsize == 0);
532
533 recurse_to_leaf =
534 rule->steps[step].op ==
535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
536 rule->steps[step].op ==
537 CRUSH_RULE_CHOOSE_LEAF_INDEP;
538
539 /* reset output */
540 osize = 0;
541
542 for (i = 0; i < wsize; i++) {
543 /*
544 * see CRUSH_N, CRUSH_N_MINUS macros.
545 * basically, numrep <= 0 means relative to
546 * the provided result_max
547 */
548 numrep = rule->steps[step].arg1;
549 if (numrep <= 0) {
550 numrep += result_max;
551 if (numrep <= 0)
552 continue;
553 }
554 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
569 osize += crush_choose(map,
570 map->buckets[-1-w[i]],
571 weight,
572 x, numrep,
573 rule->steps[step].arg2,
574 o+osize, j,
575 firstn,
576 recurse_to_leaf, c+osize);
577 }
578
579 if (recurse_to_leaf)
580 /* copy final _leaf_ values to output set */
581 memcpy(o, c, osize*sizeof(*o));
582
583 /* swap t and w arrays */
584 tmp = o;
585 o = w;
586 w = tmp;
587 wsize = osize;
588 break;
589
590
591 case CRUSH_RULE_EMIT:
592 for (i = 0; i < wsize && result_len < result_max; i++) {
593 result[result_len] = w[i];
594 result_len++;
595 }
596 wsize = 0;
597 break;
598
599 default:
600 BUG_ON(1);
601 }
602 }
603 rc = result_len;
604
605out:
606 return rc;
607}
608
609
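Tying the pieces together: a caller hashes an object name to an input value,
looks up a rule, and runs it to get the device list. A condensed sketch
(error handling elided; oid, ruleset, type, size, and weight are hypothetical
inputs supplied by the caller):

	int osds[CRUSH_MAX_SET];
	int ruleno, n;
	__u32 x = ceph_str_hash(CEPH_STR_HASH_RJENKINS, oid, strlen(oid));

	ruleno = crush_find_rule(map, ruleset, type, size);
	if (ruleno < 0)
		return -ENOENT;
	n = crush_do_rule(map, ruleno, x, osds, size, -1 /* no force */, weight);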
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
deleted file mode 100644
index 98e90046fd9..00000000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
deleted file mode 100644
index f704b3b6242..00000000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,409 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78const u8 *aes_iv = "cephsageyudagreg";
79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
81 const void *src, size_t src_len)
82{
83 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
85 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
86 int ret;
87 void *iv;
88 int ivsize;
89 size_t zero_padding = (0x10 - (src_len & 0x0f));
90 char pad[16];
91
92 if (IS_ERR(tfm))
93 return PTR_ERR(tfm);
94
95 memset(pad, zero_padding, zero_padding);
96
97 *dst_len = src_len + zero_padding;
98
99 crypto_blkcipher_setkey((void *)tfm, key, key_len);
100 sg_init_table(sg_in, 2);
101 sg_set_buf(&sg_in[0], src, src_len);
102 sg_set_buf(&sg_in[1], pad, zero_padding);
103 sg_init_table(sg_out, 1);
104 sg_set_buf(sg_out, dst, *dst_len);
105 iv = crypto_blkcipher_crt(tfm)->iv;
106 ivsize = crypto_blkcipher_ivsize(tfm);
107
108 memcpy(iv, aes_iv, ivsize);
109 /*
110 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
111 key, key_len, 1);
112 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
113 src, src_len, 1);
114 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
115 pad, zero_padding, 1);
116 */
117 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
118 src_len + zero_padding);
119 crypto_free_blkcipher(tfm);
120 if (ret < 0)
121 pr_err("ceph_aes_encrypt failed %d\n", ret);
122 /*
123 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
124 dst, *dst_len, 1);
125 */
126 return 0;
127}
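/*
 * Padding example for the scheme above (PKCS#7-style, always at
 * least one pad byte): src_len == 10 gives zero_padding == 6, so six
 * 0x06 bytes are appended; a 16-byte-aligned src_len gets a full
 * extra block of sixteen 0x10 bytes. The decrypt side can therefore
 * always recover the original length from the value of the last
 * plaintext byte.
 */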
128
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
130 const void *src1, size_t src1_len,
131 const void *src2, size_t src2_len)
132{
133 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
135 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
136 int ret;
137 void *iv;
138 int ivsize;
139 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
140 char pad[16];
141
142 if (IS_ERR(tfm))
143 return PTR_ERR(tfm);
144
145 memset(pad, zero_padding, zero_padding);
146
147 *dst_len = src1_len + src2_len + zero_padding;
148
149 crypto_blkcipher_setkey((void *)tfm, key, key_len);
150 sg_init_table(sg_in, 3);
151 sg_set_buf(&sg_in[0], src1, src1_len);
152 sg_set_buf(&sg_in[1], src2, src2_len);
153 sg_set_buf(&sg_in[2], pad, zero_padding);
154 sg_init_table(sg_out, 1);
155 sg_set_buf(sg_out, dst, *dst_len);
156 iv = crypto_blkcipher_crt(tfm)->iv;
157 ivsize = crypto_blkcipher_ivsize(tfm);
158
159 memcpy(iv, aes_iv, ivsize);
160 /*
161 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
162 key, key_len, 1);
163 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
164 src1, src1_len, 1);
165 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
166 src2, src2_len, 1);
167 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
168 pad, zero_padding, 1);
169 */
170 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
171 src1_len + src2_len + zero_padding);
172 crypto_free_blkcipher(tfm);
173 if (ret < 0)
174 pr_err("ceph_aes_encrypt2 failed %d\n", ret);
175 /*
176 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
177 dst, *dst_len, 1);
178 */
179 return 0;
180}
181
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
183 const void *src, size_t src_len)
184{
185 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
187 struct blkcipher_desc desc = { .tfm = tfm };
188 char pad[16];
189 void *iv;
190 int ivsize;
191 int ret;
192 int last_byte;
193
194 if (IS_ERR(tfm))
195 return PTR_ERR(tfm);
196
197 crypto_blkcipher_setkey((void *)tfm, key, key_len);
198 sg_init_table(sg_in, 1);
199 sg_init_table(sg_out, 2);
200 sg_set_buf(sg_in, src, src_len);
201 sg_set_buf(&sg_out[0], dst, *dst_len);
202 sg_set_buf(&sg_out[1], pad, sizeof(pad));
203
204 iv = crypto_blkcipher_crt(tfm)->iv;
205 ivsize = crypto_blkcipher_ivsize(tfm);
206
207 memcpy(iv, aes_iv, ivsize);
208
209 /*
210 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
211 key, key_len, 1);
212 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
213 src, src_len, 1);
214 */
215
216 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
217 crypto_free_blkcipher(tfm);
218 if (ret < 0) {
219 pr_err("ceph_aes_decrypt failed %d\n", ret);
220 return ret;
221 }
222
223 if (src_len <= *dst_len)
224 last_byte = ((char *)dst)[src_len - 1];
225 else
226 last_byte = pad[src_len - *dst_len - 1];
227 if (last_byte <= 16 && src_len >= last_byte) {
228 *dst_len = src_len - last_byte;
229 } else {
230 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
231 last_byte, (int)src_len);
232 return -EPERM; /* bad padding */
233 }
234 /*
235 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
236 dst, *dst_len, 1);
237 */
238 return 0;
239}
240
241int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len)
245{
246 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
248 struct blkcipher_desc desc = { .tfm = tfm };
249 char pad[16];
250 void *iv;
251 int ivsize;
252 int ret;
253 int last_byte;
254
255 if (IS_ERR(tfm))
256 return PTR_ERR(tfm);
257
258 sg_init_table(sg_in, 1);
259 sg_set_buf(sg_in, src, src_len);
260 sg_init_table(sg_out, 3);
261 sg_set_buf(&sg_out[0], dst1, *dst1_len);
262 sg_set_buf(&sg_out[1], dst2, *dst2_len);
263 sg_set_buf(&sg_out[2], pad, sizeof(pad));
264
265 crypto_blkcipher_setkey((void *)tfm, key, key_len);
266 iv = crypto_blkcipher_crt(tfm)->iv;
267 ivsize = crypto_blkcipher_ivsize(tfm);
268
269 memcpy(iv, aes_iv, ivsize);
270
271 /*
272 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
273 key, key_len, 1);
274 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
275 src, src_len, 1);
276 */
277
278 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
279 crypto_free_blkcipher(tfm);
280 if (ret < 0) {
281 pr_err("ceph_aes_decrypt2 failed %d\n", ret);
282 return ret;
283 }
284
285 if (src_len <= *dst1_len)
286 last_byte = ((char *)dst1)[src_len - 1];
287 else if (src_len <= *dst1_len + *dst2_len)
288 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
289 else
290 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
291 if (last_byte <= 16 && src_len >= last_byte) {
292 src_len -= last_byte;
293 } else {
294 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
295 last_byte, (int)src_len);
296 return -EPERM; /* bad padding */
297 }
298
299 if (src_len < *dst1_len) {
300 *dst1_len = src_len;
301 *dst2_len = 0;
302 } else {
303 *dst2_len = src_len - *dst1_len;
304 }
305 /*
306 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
307 dst1, *dst1_len, 1);
308 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
309 dst2, *dst2_len, 1);
310 */
311
312 return 0;
313}
314
315
316int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
317 const void *src, size_t src_len)
318{
319 switch (secret->type) {
320 case CEPH_CRYPTO_NONE:
321 if (*dst_len < src_len)
322 return -ERANGE;
323 memcpy(dst, src, src_len);
324 *dst_len = src_len;
325 return 0;
326
327 case CEPH_CRYPTO_AES:
328 return ceph_aes_decrypt(secret->key, secret->len, dst,
329 dst_len, src, src_len);
330
331 default:
332 return -EINVAL;
333 }
334}
335
336int ceph_decrypt2(struct ceph_crypto_key *secret,
337 void *dst1, size_t *dst1_len,
338 void *dst2, size_t *dst2_len,
339 const void *src, size_t src_len)
340{
341 size_t t;
342
343 switch (secret->type) {
344 case CEPH_CRYPTO_NONE:
345 if (*dst1_len + *dst2_len < src_len)
346 return -ERANGE;
347 t = min(*dst1_len, src_len);
348 memcpy(dst1, src, t);
349 *dst1_len = t;
350 src += t;
351 src_len -= t;
352 if (src_len) {
353 t = min(*dst2_len, src_len);
354 memcpy(dst2, src, t);
355 *dst2_len = t;
356 }
357 return 0;
358
359 case CEPH_CRYPTO_AES:
360 return ceph_aes_decrypt2(secret->key, secret->len,
361 dst1, dst1_len, dst2, dst2_len,
362 src, src_len);
363
364 default:
365 return -EINVAL;
366 }
367}
368
369int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
370 const void *src, size_t src_len)
371{
372 switch (secret->type) {
373 case CEPH_CRYPTO_NONE:
374 if (*dst_len < src_len)
375 return -ERANGE;
376 memcpy(dst, src, src_len);
377 *dst_len = src_len;
378 return 0;
379
380 case CEPH_CRYPTO_AES:
381 return ceph_aes_encrypt(secret->key, secret->len, dst,
382 dst_len, src, src_len);
383
384 default:
385 return -EINVAL;
386 }
387}
388
389int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
390 const void *src1, size_t src1_len,
391 const void *src2, size_t src2_len)
392{
393 switch (secret->type) {
394 case CEPH_CRYPTO_NONE:
395 if (*dst_len < src1_len + src2_len)
396 return -ERANGE;
397 memcpy(dst, src1, src1_len);
398 memcpy(dst + src1_len, src2, src2_len);
399 *dst_len = src1_len + src2_len;
400 return 0;
401
402 case CEPH_CRYPTO_AES:
403 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
404 src1, src1_len, src2, src2_len);
405
406 default:
407 return -EINVAL;
408 }
409}
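As a sanity check of the dispatch above, a round trip through a
CEPH_CRYPTO_NONE key is just a bounded copy each way. A hypothetical test
sketch (not from this tree):

	struct ceph_crypto_key key = { .type = CEPH_CRYPTO_NONE };
	char enc[16], dec[16];
	size_t enc_len = sizeof(enc), dec_len = sizeof(dec);

	BUG_ON(ceph_encrypt(&key, enc, &enc_len, "hello", 5));
	BUG_ON(ceph_decrypt(&key, dec, &dec_len, enc, enc_len));
	BUG_ON(dec_len != 5 || memcmp(dec, "hello", 5));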
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
deleted file mode 100644
index 40b502e6bd8..00000000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f2f5332ddbb..7ae1b3d55b5 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/device.h>
 #include <linux/slab.h>
@@ -7,143 +7,49 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
 #include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
 
 #ifdef CONFIG_DEBUG_FS
 
-/*
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client* - an instance of the ceph client
- * .../osdmap - current osdmap
- * .../mdsmap - current mdsmap
- * .../monmap - current monmap
- * .../osdc - active osd requests
- * .../mdsc - active mds requests
- * .../monc - mon client state
- * .../dentry_lru - dump contents of dentry lru
- * .../caps - expose cap (reservation) stats
- * .../bdi - symlink to ../../bdi/something
- */
-
-static struct dentry *ceph_debugfs_dir;
-
-static int monmap_show(struct seq_file *s, void *p)
-{
- int i;
- struct ceph_client *client = s->private;
-
- if (client->monc.monmap == NULL)
- return 0;
-
- seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
- for (i = 0; i < client->monc.monmap->num_mon; i++) {
- struct ceph_entity_inst *inst =
- &client->monc.monmap->mon_inst[i];
-
- seq_printf(s, "\t%s%lld\t%s\n",
- ENTITY_NAME(inst->name),
- pr_addr(&inst->addr.in_addr));
- }
- return 0;
-}
+#include "mds_client.h"
 
 static int mdsmap_show(struct seq_file *s, void *p)
 {
 int i;
- struct ceph_client *client = s->private;
+ struct ceph_fs_client *fsc = s->private;
 
- if (client->mdsc.mdsmap == NULL)
+ if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
 return 0;
- seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
- seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+ seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
 seq_printf(s, "session_timeout %d\n",
- client->mdsc.mdsmap->m_session_timeout);
+ fsc->mdsc->mdsmap->m_session_timeout);
 seq_printf(s, "session_autoclose %d\n",
- client->mdsc.mdsmap->m_session_autoclose);
- for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+ fsc->mdsc->mdsmap->m_session_autoclose);
+ for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
 struct ceph_entity_addr *addr =
- &client->mdsc.mdsmap->m_info[i].addr;
- int state = client->mdsc.mdsmap->m_info[i].state;
+ &fsc->mdsc->mdsmap->m_info[i].addr;
+ int state = fsc->mdsc->mdsmap->m_info[i].state;
 
- seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+ seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+ ceph_pr_addr(&addr->in_addr),
 ceph_mds_state_name(state));
 }
 return 0;
 }
 
-static int osdmap_show(struct seq_file *s, void *p)
-{
- int i;
- struct ceph_client *client = s->private;
- struct rb_node *n;
-
- if (client->osdc.osdmap == NULL)
- return 0;
- seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
- seq_printf(s, "flags%s%s\n",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
- " NEARFULL" : "",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
- " FULL" : "");
- for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
- struct ceph_pg_pool_info *pool =
- rb_entry(n, struct ceph_pg_pool_info, node);
- seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
- pool->id, pool->v.pg_num, pool->pg_num_mask,
- pool->v.lpg_num, pool->lpg_num_mask);
- }
- for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
- struct ceph_entity_addr *addr =
- &client->osdc.osdmap->osd_addr[i];
- int state = client->osdc.osdmap->osd_state[i];
- char sb[64];
-
- seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
- i, pr_addr(&addr->in_addr),
- ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
- ceph_osdmap_state_str(sb, sizeof(sb), state));
- }
- return 0;
-}
-
-static int monc_show(struct seq_file *s, void *p)
-{
- struct ceph_client *client = s->private;
- struct ceph_mon_generic_request *req;
- struct ceph_mon_client *monc = &client->monc;
- struct rb_node *rp;
-
- mutex_lock(&monc->mutex);
-
- if (monc->have_mdsmap)
- seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
- if (monc->have_osdmap)
- seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
- if (monc->want_next_osdmap)
- seq_printf(s, "want next osdmap\n");
-
- for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
- __u16 op;
- req = rb_entry(rp, struct ceph_mon_generic_request, node);
- op = le16_to_cpu(req->request->hdr.type);
- if (op == CEPH_MSG_STATFS)
- seq_printf(s, "%lld statfs\n", req->tid);
- else
- seq_printf(s, "%lld unknown\n", req->tid);
- }
-
- mutex_unlock(&monc->mutex);
- return 0;
-}
-
+/*
+ * mdsc debugfs
+ */
 static int mdsc_show(struct seq_file *s, void *p)
 {
- struct ceph_client *client = s->private;
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
 struct ceph_mds_request *req;
 struct rb_node *rp;
 int pathlen;
@@ -171,6 +77,8 @@ static int mdsc_show(struct seq_file *s, void *p)
171 } else if (req->r_dentry) { 77 } else if (req->r_dentry) {
172 path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 78 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
173 &pathbase, 0); 79 &pathbase, 0);
80 if (IS_ERR(path))
81 path = NULL;
174 spin_lock(&req->r_dentry->d_lock); 82 spin_lock(&req->r_dentry->d_lock);
175 seq_printf(s, " #%llx/%.*s (%s)", 83 seq_printf(s, " #%llx/%.*s (%s)",
176 ceph_ino(req->r_dentry->d_parent->d_inode), 84 ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -187,6 +95,8 @@ static int mdsc_show(struct seq_file *s, void *p)
187 if (req->r_old_dentry) { 95 if (req->r_old_dentry) {
188 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, 96 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
189 &pathbase, 0); 97 &pathbase, 0);
98 if (IS_ERR(path))
99 path = NULL;
190 spin_lock(&req->r_old_dentry->d_lock); 100 spin_lock(&req->r_old_dentry->d_lock);
191 seq_printf(s, " #%llx/%.*s (%s)", 101 seq_printf(s, " #%llx/%.*s (%s)",
192 ceph_ino(req->r_old_dentry->d_parent->d_inode), 102 ceph_ino(req->r_old_dentry->d_parent->d_inode),
@@ -210,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
210 return 0; 120 return 0;
211} 121}
212 122
213static int osdc_show(struct seq_file *s, void *pp)
214{
215 struct ceph_client *client = s->private;
216 struct ceph_osd_client *osdc = &client->osdc;
217 struct rb_node *p;
218
219 mutex_lock(&osdc->request_mutex);
220 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
221 struct ceph_osd_request *req;
222 struct ceph_osd_request_head *head;
223 struct ceph_osd_op *op;
224 int num_ops;
225 int opcode, olen;
226 int i;
227
228 req = rb_entry(p, struct ceph_osd_request, r_node);
229
230 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
231 req->r_osd ? req->r_osd->o_osd : -1,
232 le32_to_cpu(req->r_pgid.pool),
233 le16_to_cpu(req->r_pgid.ps));
234
235 head = req->r_request->front.iov_base;
236 op = (void *)(head + 1);
237
238 num_ops = le16_to_cpu(head->num_ops);
239 olen = le32_to_cpu(head->object_len);
240 seq_printf(s, "%.*s", olen,
241 (const char *)(head->ops + num_ops));
242
243 if (req->r_reassert_version.epoch)
244 seq_printf(s, "\t%u'%llu",
245 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
246 le64_to_cpu(req->r_reassert_version.version));
247 else
248 seq_printf(s, "\t");
249
250 for (i = 0; i < num_ops; i++) {
251 opcode = le16_to_cpu(op->op);
252 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
253 op++;
254 }
255
256 seq_printf(s, "\n");
257 }
258 mutex_unlock(&osdc->request_mutex);
259 return 0;
260}
261
262static int caps_show(struct seq_file *s, void *p) 123static int caps_show(struct seq_file *s, void *p)
263{ 124{
264 struct ceph_client *client = s->private; 125 struct ceph_fs_client *fsc = s->private;
265 int total, avail, used, reserved, min; 126 int total, avail, used, reserved, min;
266 127
267 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); 128 ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
268 seq_printf(s, "total\t\t%d\n" 129 seq_printf(s, "total\t\t%d\n"
269 "avail\t\t%d\n" 130 "avail\t\t%d\n"
270 "used\t\t%d\n" 131 "used\t\t%d\n"
@@ -276,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
276 137
277static int dentry_lru_show(struct seq_file *s, void *ptr) 138static int dentry_lru_show(struct seq_file *s, void *ptr)
278{ 139{
279 struct ceph_client *client = s->private; 140 struct ceph_fs_client *fsc = s->private;
280 struct ceph_mds_client *mdsc = &client->mdsc; 141 struct ceph_mds_client *mdsc = fsc->mdsc;
281 struct ceph_dentry_info *di; 142 struct ceph_dentry_info *di;
282 143
283 spin_lock(&mdsc->dentry_lru_lock); 144 spin_lock(&mdsc->dentry_lru_lock);
@@ -291,199 +152,125 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
291 return 0; 152 return 0;
292} 153}
293 154
294#define DEFINE_SHOW_FUNC(name) \ 155CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
295static int name##_open(struct inode *inode, struct file *file) \ 156CEPH_DEFINE_SHOW_FUNC(mdsc_show)
296{ \ 157CEPH_DEFINE_SHOW_FUNC(caps_show)
297 struct seq_file *sf; \ 158CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
298 int ret; \ 159
299 \
300 ret = single_open(file, name, NULL); \
301 sf = file->private_data; \
302 sf->private = inode->i_private; \
303 return ret; \
304} \
305 \
306static const struct file_operations name##_fops = { \
307 .open = name##_open, \
308 .read = seq_read, \
309 .llseek = seq_lseek, \
310 .release = single_release, \
311};
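For reference, expanding this macro for one of the show functions makes the generated boilerplate explicit; a sketch of the preprocessor output for mdsc_show (illustrative only, not part of the patch):

    static int mdsc_show_open(struct inode *inode, struct file *file)
    {
            struct seq_file *sf;
            int ret;

            ret = single_open(file, mdsc_show, NULL);
            sf = file->private_data;
            sf->private = inode->i_private;
            return ret;
    }

    static const struct file_operations mdsc_show_fops = {
            .open    = mdsc_show_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };

Note that single_open() can also take the private pointer as its third argument, which would make the manual sf->private assignment unnecessary.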
312
313DEFINE_SHOW_FUNC(monmap_show)
314DEFINE_SHOW_FUNC(mdsmap_show)
315DEFINE_SHOW_FUNC(osdmap_show)
316DEFINE_SHOW_FUNC(monc_show)
317DEFINE_SHOW_FUNC(mdsc_show)
318DEFINE_SHOW_FUNC(osdc_show)
319DEFINE_SHOW_FUNC(dentry_lru_show)
320DEFINE_SHOW_FUNC(caps_show)
321 160
161/*
162 * debugfs
163 */
322static int congestion_kb_set(void *data, u64 val) 164static int congestion_kb_set(void *data, u64 val)
323{ 165{
324 struct ceph_client *client = (struct ceph_client *)data; 166 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
325
326 if (client)
327 client->mount_args->congestion_kb = (int)val;
328 167
168 fsc->mount_options->congestion_kb = (int)val;
329 return 0; 169 return 0;
330} 170}
331 171
332static int congestion_kb_get(void *data, u64 *val) 172static int congestion_kb_get(void *data, u64 *val)
333{ 173{
334 struct ceph_client *client = (struct ceph_client *)data; 174 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
335
336 if (client)
337 *val = (u64)client->mount_args->congestion_kb;
338 175
176 *val = (u64)fsc->mount_options->congestion_kb;
339 return 0; 177 return 0;
340} 178}
341 179
342
343DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, 180DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
344 congestion_kb_set, "%llu\n"); 181 congestion_kb_set, "%llu\n");
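DEFINE_SIMPLE_ATTRIBUTE() generates the file_operations that connect reads and writes of the debugfs file to the get/set pair above; roughly (illustrative, not part of the patch):

    /* echo 8192 > .../writeback_congestion_kb
     *   -> simple_attr_write() parses the decimal string
     *   -> congestion_kb_set(fsc, 8192)
     *
     * cat .../writeback_congestion_kb
     *   -> congestion_kb_get(fsc, &val)
     *   -> value formatted with "%llu\n" by simple_attr_read()
     */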
345 182
346int __init ceph_debugfs_init(void)
347{
348 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
349 if (!ceph_debugfs_dir)
350 return -ENOMEM;
351 return 0;
352}
353 183
354void ceph_debugfs_cleanup(void) 184void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
355{ 185{
356 debugfs_remove(ceph_debugfs_dir); 186 dout("ceph_fs_debugfs_cleanup\n");
187 debugfs_remove(fsc->debugfs_bdi);
188 debugfs_remove(fsc->debugfs_congestion_kb);
189 debugfs_remove(fsc->debugfs_mdsmap);
190 debugfs_remove(fsc->debugfs_caps);
191 debugfs_remove(fsc->debugfs_mdsc);
192 debugfs_remove(fsc->debugfs_dentry_lru);
357} 193}
358 194
359int ceph_debugfs_client_init(struct ceph_client *client) 195int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
360{ 196{
361 int ret = 0; 197 char name[100];
362 char name[80]; 198 int err = -ENOMEM;
363 199
364 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", 200 dout("ceph_fs_debugfs_init\n");
365 PR_FSID(&client->fsid), client->monc.auth->global_id); 201 fsc->debugfs_congestion_kb =
366 202 debugfs_create_file("writeback_congestion_kb",
367 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 203 0600,
368 if (!client->debugfs_dir) 204 fsc->client->debugfs_dir,
205 fsc,
206 &congestion_kb_fops);
207 if (!fsc->debugfs_congestion_kb)
369 goto out; 208 goto out;
370 209
371 client->monc.debugfs_file = debugfs_create_file("monc", 210 dout("a\n");
372 0600,
373 client->debugfs_dir,
374 client,
375 &monc_show_fops);
376 if (!client->monc.debugfs_file)
377 goto out;
378
379 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
380 0600,
381 client->debugfs_dir,
382 client,
383 &mdsc_show_fops);
384 if (!client->mdsc.debugfs_file)
385 goto out;
386 211
387 client->osdc.debugfs_file = debugfs_create_file("osdc", 212 snprintf(name, sizeof(name), "../../bdi/%s",
388 0600, 213 dev_name(fsc->backing_dev_info.dev));
389 client->debugfs_dir, 214 fsc->debugfs_bdi =
390 client, 215 debugfs_create_symlink("bdi",
391 &osdc_show_fops); 216 fsc->client->debugfs_dir,
392 if (!client->osdc.debugfs_file) 217 name);
218 if (!fsc->debugfs_bdi)
393 goto out; 219 goto out;
394 220
395 client->debugfs_monmap = debugfs_create_file("monmap", 221 dout("b\n");
222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
396 0600, 223 0600,
397 client->debugfs_dir, 224 fsc->client->debugfs_dir,
398 client, 225 fsc,
399 &monmap_show_fops);
400 if (!client->debugfs_monmap)
401 goto out;
402
403 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
404 0600,
405 client->debugfs_dir,
406 client,
407 &mdsmap_show_fops); 226 &mdsmap_show_fops);
408 if (!client->debugfs_mdsmap) 227 if (!fsc->debugfs_mdsmap)
409 goto out;
410
411 client->debugfs_osdmap = debugfs_create_file("osdmap",
412 0600,
413 client->debugfs_dir,
414 client,
415 &osdmap_show_fops);
416 if (!client->debugfs_osdmap)
417 goto out; 228 goto out;
418 229
419 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 230 dout("ca\n");
420 0600, 231 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
421 client->debugfs_dir, 232 0600,
422 client, 233 fsc->client->debugfs_dir,
423 &dentry_lru_show_fops); 234 fsc,
424 if (!client->debugfs_dentry_lru) 235 &mdsc_show_fops);
236 if (!fsc->debugfs_mdsc)
425 goto out; 237 goto out;
426 238
427 client->debugfs_caps = debugfs_create_file("caps", 239 dout("da\n");
240 fsc->debugfs_caps = debugfs_create_file("caps",
428 0400, 241 0400,
429 client->debugfs_dir, 242 fsc->client->debugfs_dir,
430 client, 243 fsc,
431 &caps_show_fops); 244 &caps_show_fops);
432 if (!client->debugfs_caps) 245 if (!fsc->debugfs_caps)
433 goto out; 246 goto out;
434 247
435 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 248 dout("ea\n");
436 0600, 249 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
437 client->debugfs_dir, 250 0600,
438 client, 251 fsc->client->debugfs_dir,
439 &congestion_kb_fops); 252 fsc,
440 if (!client->debugfs_congestion_kb) 253 &dentry_lru_show_fops);
254 if (!fsc->debugfs_dentry_lru)
441 goto out; 255 goto out;
442 256
443 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
444 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
445 name);
446
447 return 0; 257 return 0;
448 258
449out: 259out:
450 ceph_debugfs_client_cleanup(client); 260 ceph_fs_debugfs_cleanup(fsc);
451 return ret; 261 return err;
452} 262}
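Note that the out: error path calls ceph_fs_debugfs_cleanup() before every entry has been created; this is safe because debugfs_remove() ignores a NULL pointer, assuming the ceph_fs_client was zero-allocated. A sketch of what the cleanup does after a partial failure (illustrative, not part of the patch):

    /* If debugfs_create_file("mdsc", ...) failed, fsc->debugfs_caps and
     * fsc->debugfs_dentry_lru are still NULL, so the cleanup path is
     * effectively:
     *
     *     debugfs_remove(fsc->debugfs_mdsmap);       removes the file
     *     debugfs_remove(fsc->debugfs_caps);         NULL -> no-op
     *     debugfs_remove(fsc->debugfs_dentry_lru);   NULL -> no-op
     */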
453 263
454void ceph_debugfs_client_cleanup(struct ceph_client *client)
455{
456 debugfs_remove(client->debugfs_bdi);
457 debugfs_remove(client->debugfs_caps);
458 debugfs_remove(client->debugfs_dentry_lru);
459 debugfs_remove(client->debugfs_osdmap);
460 debugfs_remove(client->debugfs_mdsmap);
461 debugfs_remove(client->debugfs_monmap);
462 debugfs_remove(client->osdc.debugfs_file);
463 debugfs_remove(client->mdsc.debugfs_file);
464 debugfs_remove(client->monc.debugfs_file);
465 debugfs_remove(client->debugfs_congestion_kb);
466 debugfs_remove(client->debugfs_dir);
467}
468
469#else // CONFIG_DEBUG_FS
470
471int __init ceph_debugfs_init(void)
472{
473 return 0;
474}
475 264
476void ceph_debugfs_cleanup(void) 265#else /* CONFIG_DEBUG_FS */
477{
478}
479 266
480int ceph_debugfs_client_init(struct ceph_client *client) 267int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
481{ 268{
482 return 0; 269 return 0;
483} 270}
484 271
485void ceph_debugfs_client_cleanup(struct ceph_client *client) 272void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
486{ 273{
487} 274}
488 275
489#endif // CONFIG_DEBUG_FS 276#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
deleted file mode 100644
index 65b3e022eaf..00000000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,194 +0,0 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
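Each helper reads a little-endian value at the cursor and advances it, so a caller can decode a packed structure field by field. A minimal sketch (hypothetical wire structure, not part of the patch):

    /* Decode a hypothetical header { __le32 len; __le64 ino; }.
     * Assumes the caller already checked that 12 bytes are available. */
    static inline void decode_hdr_sketch(void **p, u32 *len, u64 *ino)
    {
            *len = ceph_decode_32(p);       /* cursor += 4 */
            *ino = ceph_decode_64(p);       /* cursor += 8 */
    }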
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
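The _safe variants bounds-check against end before decoding and jump to a caller-supplied error label on a short buffer. A usage sketch (hypothetical caller, not part of the patch):

    static int parse_u32_sketch(void *p, void *end, u32 *out)
    {
            ceph_decode_32_safe(&p, end, *out, bad); /* goto bad if < 4 bytes left */
            return 0;
    bad:
            return -EINVAL;         /* truncated input */
    }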
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
107 WARN_ON(a->in_addr.ss_family == 512);
108}
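The magic constant 512 in the WARN_ON deserves a note; an illustration (not part of the patch):

    /* AF_INET is 2.  On a little-endian host, a family value that was
     * already in host order and gets byte-swapped again becomes
     *     swab16(2) == 0x0200 == 512
     * so seeing 512 after ceph_decode_addr() strongly suggests the
     * address was converted twice (or encoded without conversion). */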
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
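Both encoders BUG on overflow rather than returning an error, so callers must size the buffer up front. A minimal sketch (hypothetical caller, not part of the patch):

    static void encode_name_sketch(void *buf, size_t buflen)
    {
            void *p = buf;
            void *end = buf + buflen;       /* last byte + 1 */

            /* writes a le32 length followed by the bytes;
             * the BUG_ON fires unless buflen >= 4 + 6 */
            ceph_encode_string(&p, end, "pool-a", 6);
    }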
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f94ed3c7f6a..e0a2dc6fcaf 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/spinlock.h> 3#include <linux/spinlock.h>
4#include <linux/fs_struct.h> 4#include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8 8
9#include "super.h" 9#include "super.h"
10#include "mds_client.h"
10 11
11/* 12/*
12 * Directory operations: readdir, lookup, create, link, unlink, 13 * Directory operations: readdir, lookup, create, link, unlink,
@@ -27,7 +28,7 @@
27 28
28const struct inode_operations ceph_dir_iops; 29const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops; 30const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops; 31const struct dentry_operations ceph_dentry_ops;
31 32
32/* 33/*
33 * Initialize ceph dentry state. 34 * Initialize ceph dentry state.
@@ -46,7 +47,7 @@ int ceph_init_dentry(struct dentry *dentry)
46 else 47 else
47 dentry->d_op = &ceph_snap_dentry_ops; 48 dentry->d_op = &ceph_snap_dentry_ops;
48 49
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); 50 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
50 if (!di) 51 if (!di)
51 return -ENOMEM; /* oh well */ 52 return -ENOMEM; /* oh well */
52 53
@@ -95,7 +96,6 @@ static unsigned fpos_off(loff_t p)
95static int __dcache_readdir(struct file *filp, 96static int __dcache_readdir(struct file *filp,
96 void *dirent, filldir_t filldir) 97 void *dirent, filldir_t filldir)
97{ 98{
98 struct inode *inode = filp->f_dentry->d_inode;
99 struct ceph_file_info *fi = filp->private_data; 99 struct ceph_file_info *fi = filp->private_data;
100 struct dentry *parent = filp->f_dentry; 100 struct dentry *parent = filp->f_dentry;
101 struct inode *dir = parent->d_inode; 101 struct inode *dir = parent->d_inode;
@@ -151,7 +151,6 @@ more:
151 151
152 atomic_inc(&dentry->d_count); 152 atomic_inc(&dentry->d_count);
153 spin_unlock(&dcache_lock); 153 spin_unlock(&dcache_lock);
154 spin_unlock(&inode->i_lock);
155 154
156 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 155 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
157 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 156 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -169,35 +168,30 @@ more:
169 } else { 168 } else {
170 dput(last); 169 dput(last);
171 } 170 }
172 last = NULL;
173 } 171 }
174
175 spin_lock(&inode->i_lock);
176 spin_lock(&dcache_lock);
177
178 last = dentry; 172 last = dentry;
179 173
180 if (err < 0) 174 if (err < 0)
181 goto out_unlock; 175 goto out;
182 176
183 p = p->prev;
184 filp->f_pos++; 177 filp->f_pos++;
185 178
186 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ 179 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
187 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) 180 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
188 goto more; 181 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
189 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 182 err = -EAGAIN;
190 err = -EAGAIN; 183 goto out;
184 }
185
186 spin_lock(&dcache_lock);
187 p = p->prev; /* advance to next dentry */
188 goto more;
191 189
192out_unlock: 190out_unlock:
193 spin_unlock(&dcache_lock); 191 spin_unlock(&dcache_lock);
194 192out:
195 if (last) { 193 if (last)
196 spin_unlock(&inode->i_lock);
197 dput(last); 194 dput(last);
198 spin_lock(&inode->i_lock);
199 }
200
201 return err; 195 return err;
202} 196}
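The restructured __dcache_readdir() never calls filldir() under a spinlock: i_lock is now dropped by the caller before entry, and dcache_lock is released around each filldir() call. An outline of the resulting lock discipline (illustrative, not part of the patch):

    /* ceph_readdir():
     *     spin_lock(&inode->i_lock);       check I_COMPLETE + shared caps
     *     spin_unlock(&inode->i_lock);     drop before walking the dcache
     *     __dcache_readdir():
     *         spin_lock(&dcache_lock);     find next child on d_subdirs
     *         take a reference on dentry;
     *         spin_unlock(&dcache_lock);   drop before calling out
     *         filldir(...);                may fault / copy to userspace
     *         re-check CEPH_I_COMPLETE;    lost -> -EAGAIN, fall back to MDS
     */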
203 197
@@ -225,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
225 struct ceph_file_info *fi = filp->private_data; 219 struct ceph_file_info *fi = filp->private_data;
226 struct inode *inode = filp->f_dentry->d_inode; 220 struct inode *inode = filp->f_dentry->d_inode;
227 struct ceph_inode_info *ci = ceph_inode(inode); 221 struct ceph_inode_info *ci = ceph_inode(inode);
228 struct ceph_client *client = ceph_inode_to_client(inode); 222 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
229 struct ceph_mds_client *mdsc = &client->mdsc; 223 struct ceph_mds_client *mdsc = fsc->mdsc;
230 unsigned frag = fpos_frag(filp->f_pos); 224 unsigned frag = fpos_frag(filp->f_pos);
231 int off = fpos_off(filp->f_pos); 225 int off = fpos_off(filp->f_pos);
232 int err; 226 int err;
233 u32 ftype; 227 u32 ftype;
234 struct ceph_mds_reply_info_parsed *rinfo; 228 struct ceph_mds_reply_info_parsed *rinfo;
235 const int max_entries = client->mount_args->max_readdir; 229 const int max_entries = fsc->mount_options->max_readdir;
236 const int max_bytes = client->mount_args->max_readdir_bytes; 230 const int max_bytes = fsc->mount_options->max_readdir_bytes;
237 231
238 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 232 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
239 if (fi->at_end) 233 if (fi->at_end)
@@ -265,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
265 /* can we use the dcache? */ 259 /* can we use the dcache? */
266 spin_lock(&inode->i_lock); 260 spin_lock(&inode->i_lock);
267 if ((filp->f_pos == 2 || fi->dentry) && 261 if ((filp->f_pos == 2 || fi->dentry) &&
268 !ceph_test_opt(client, NOASYNCREADDIR) && 262 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
269 ceph_snap(inode) != CEPH_SNAPDIR && 263 ceph_snap(inode) != CEPH_SNAPDIR &&
270 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
271 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 spin_unlock(&inode->i_lock);
272 err = __dcache_readdir(filp, dirent, filldir); 267 err = __dcache_readdir(filp, dirent, filldir);
273 if (err != -EAGAIN) { 268 if (err != -EAGAIN)
274 spin_unlock(&inode->i_lock);
275 return err; 269 return err;
276 } 270 } else {
271 spin_unlock(&inode->i_lock);
277 } 272 }
278 spin_unlock(&inode->i_lock);
279 if (fi->dentry) { 273 if (fi->dentry) {
280 err = note_last_dentry(fi, fi->dentry->d_name.name, 274 err = note_last_dentry(fi, fi->dentry->d_name.name,
281 fi->dentry->d_name.len); 275 fi->dentry->d_name.len);
@@ -485,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
485struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 479struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
486 struct dentry *dentry, int err) 480 struct dentry *dentry, int err)
487{ 481{
488 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 482 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
489 struct inode *parent = dentry->d_parent->d_inode; 483 struct inode *parent = dentry->d_parent->d_inode;
490 484
491 /* .snap dir? */ 485 /* .snap dir? */
492 if (err == -ENOENT && 486 if (err == -ENOENT &&
493 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
494 strcmp(dentry->d_name.name, 487 strcmp(dentry->d_name.name,
495 client->mount_args->snapdir_name) == 0) { 488 fsc->mount_options->snapdir_name) == 0) {
496 struct inode *inode = ceph_get_snapdir(parent); 489 struct inode *inode = ceph_get_snapdir(parent);
497 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
498 dentry, dentry->d_name.len, dentry->d_name.name, inode); 491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -537,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
537static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
538 struct nameidata *nd) 531 struct nameidata *nd)
539{ 532{
540 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 533 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
541 struct ceph_mds_client *mdsc = &client->mdsc; 534 struct ceph_mds_client *mdsc = fsc->mdsc;
542 struct ceph_mds_request *req; 535 struct ceph_mds_request *req;
543 int op; 536 int op;
544 int err; 537 int err;
@@ -570,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
570 spin_lock(&dir->i_lock); 563 spin_lock(&dir->i_lock);
571 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
572 if (strncmp(dentry->d_name.name, 565 if (strncmp(dentry->d_name.name,
573 client->mount_args->snapdir_name, 566 fsc->mount_options->snapdir_name,
574 dentry->d_name.len) && 567 dentry->d_name.len) &&
575 !is_root_ceph_dentry(dir, dentry) && 568 !is_root_ceph_dentry(dir, dentry) &&
576 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -627,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
627static int ceph_mknod(struct inode *dir, struct dentry *dentry, 620static int ceph_mknod(struct inode *dir, struct dentry *dentry,
628 int mode, dev_t rdev) 621 int mode, dev_t rdev)
629{ 622{
630 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 623 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
631 struct ceph_mds_client *mdsc = &client->mdsc; 624 struct ceph_mds_client *mdsc = fsc->mdsc;
632 struct ceph_mds_request *req; 625 struct ceph_mds_request *req;
633 int err; 626 int err;
634 627
@@ -683,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
683static int ceph_symlink(struct inode *dir, struct dentry *dentry, 676static int ceph_symlink(struct inode *dir, struct dentry *dentry,
684 const char *dest) 677 const char *dest)
685{ 678{
686 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 679 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
687 struct ceph_mds_client *mdsc = &client->mdsc; 680 struct ceph_mds_client *mdsc = fsc->mdsc;
688 struct ceph_mds_request *req; 681 struct ceph_mds_request *req;
689 int err; 682 int err;
690 683
@@ -714,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
714 707
715static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) 708static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
716{ 709{
717 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 710 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
718 struct ceph_mds_client *mdsc = &client->mdsc; 711 struct ceph_mds_client *mdsc = fsc->mdsc;
719 struct ceph_mds_request *req; 712 struct ceph_mds_request *req;
720 int err = -EROFS; 713 int err = -EROFS;
721 int op; 714 int op;
@@ -756,8 +749,8 @@ out:
756static int ceph_link(struct dentry *old_dentry, struct inode *dir, 749static int ceph_link(struct dentry *old_dentry, struct inode *dir,
757 struct dentry *dentry) 750 struct dentry *dentry)
758{ 751{
759 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 752 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
760 struct ceph_mds_client *mdsc = &client->mdsc; 753 struct ceph_mds_client *mdsc = fsc->mdsc;
761 struct ceph_mds_request *req; 754 struct ceph_mds_request *req;
762 int err; 755 int err;
763 756
@@ -811,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
811 */ 804 */
812static int ceph_unlink(struct inode *dir, struct dentry *dentry) 805static int ceph_unlink(struct inode *dir, struct dentry *dentry)
813{ 806{
814 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 807 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
815 struct ceph_mds_client *mdsc = &client->mdsc; 808 struct ceph_mds_client *mdsc = fsc->mdsc;
816 struct inode *inode = dentry->d_inode; 809 struct inode *inode = dentry->d_inode;
817 struct ceph_mds_request *req; 810 struct ceph_mds_request *req;
818 int err = -EROFS; 811 int err = -EROFS;
@@ -852,8 +845,8 @@ out:
852static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 845static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
853 struct inode *new_dir, struct dentry *new_dentry) 846 struct inode *new_dir, struct dentry *new_dentry)
854{ 847{
855 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); 848 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
856 struct ceph_mds_client *mdsc = &client->mdsc; 849 struct ceph_mds_client *mdsc = fsc->mdsc;
857 struct ceph_mds_request *req; 850 struct ceph_mds_request *req;
858 int err; 851 int err;
859 852
@@ -1019,11 +1012,15 @@ out_touch:
1019static void ceph_dentry_release(struct dentry *dentry) 1012static void ceph_dentry_release(struct dentry *dentry)
1020{ 1013{
1021 struct ceph_dentry_info *di = ceph_dentry(dentry); 1014 struct ceph_dentry_info *di = ceph_dentry(dentry);
1022 struct inode *parent_inode = dentry->d_parent->d_inode; 1015 struct inode *parent_inode = NULL;
1023 u64 snapid = ceph_snap(parent_inode); 1016 u64 snapid = CEPH_NOSNAP;
1024 1017
1018 if (!IS_ROOT(dentry)) {
1019 parent_inode = dentry->d_parent->d_inode;
1020 if (parent_inode)
1021 snapid = ceph_snap(parent_inode);
1022 }
1025 dout("dentry_release %p parent %p\n", dentry, parent_inode); 1023 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1026
1027 if (parent_inode && snapid != CEPH_SNAPDIR) { 1024 if (parent_inode && snapid != CEPH_SNAPDIR) {
1028 struct ceph_inode_info *ci = ceph_inode(parent_inode); 1025 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1029 1026
@@ -1070,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1070 struct ceph_inode_info *ci = ceph_inode(inode); 1067 struct ceph_inode_info *ci = ceph_inode(inode);
1071 int left; 1068 int left;
1072 1069
1073 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1070 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1074 return -EISDIR; 1071 return -EISDIR;
1075 1072
1076 if (!cf->dir_info) { 1073 if (!cf->dir_info) {
@@ -1171,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1171 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1168 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1172 dn->d_name.len, dn->d_name.name); 1169 dn->d_name.len, dn->d_name.name);
1173 if (di) { 1170 if (di) {
1174 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1171 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1175 spin_lock(&mdsc->dentry_lru_lock); 1172 spin_lock(&mdsc->dentry_lru_lock);
1176 list_add_tail(&di->lru, &mdsc->dentry_lru); 1173 list_add_tail(&di->lru, &mdsc->dentry_lru);
1177 mdsc->num_dentry++; 1174 mdsc->num_dentry++;
@@ -1187,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1187 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1184 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1188 dn->d_name.len, dn->d_name.name, di->offset); 1185 dn->d_name.len, dn->d_name.name, di->offset);
1189 if (di) { 1186 if (di) {
1190 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1187 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1191 spin_lock(&mdsc->dentry_lru_lock); 1188 spin_lock(&mdsc->dentry_lru_lock);
1192 list_move_tail(&di->lru, &mdsc->dentry_lru); 1189 list_move_tail(&di->lru, &mdsc->dentry_lru);
1193 spin_unlock(&mdsc->dentry_lru_lock); 1190 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1202,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1202 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1199 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1203 dn->d_name.len, dn->d_name.name); 1200 dn->d_name.len, dn->d_name.name);
1204 if (di) { 1201 if (di) {
1205 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1202 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1206 spin_lock(&mdsc->dentry_lru_lock); 1203 spin_lock(&mdsc->dentry_lru_lock);
1207 list_del_init(&di->lru); 1204 list_del_init(&di->lru);
1208 mdsc->num_dentry--; 1205 mdsc->num_dentry--;
@@ -1239,16 +1236,16 @@ const struct inode_operations ceph_dir_iops = {
1239 .create = ceph_create, 1236 .create = ceph_create,
1240}; 1237};
1241 1238
1242struct dentry_operations ceph_dentry_ops = { 1239const struct dentry_operations ceph_dentry_ops = {
1243 .d_revalidate = ceph_d_revalidate, 1240 .d_revalidate = ceph_d_revalidate,
1244 .d_release = ceph_dentry_release, 1241 .d_release = ceph_dentry_release,
1245}; 1242};
1246 1243
1247struct dentry_operations ceph_snapdir_dentry_ops = { 1244const struct dentry_operations ceph_snapdir_dentry_ops = {
1248 .d_revalidate = ceph_snapdir_d_revalidate, 1245 .d_revalidate = ceph_snapdir_d_revalidate,
1249 .d_release = ceph_dentry_release, 1246 .d_release = ceph_dentry_release,
1250}; 1247};
1251 1248
1252struct dentry_operations ceph_snap_dentry_ops = { 1249const struct dentry_operations ceph_snap_dentry_ops = {
1253 .d_release = ceph_dentry_release, 1250 .d_release = ceph_dentry_release,
1254}; 1251};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 4480cb1c63e..2297d942699 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/exportfs.h> 3#include <linux/exportfs.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <asm/unaligned.h> 5#include <asm/unaligned.h>
6 6
7#include "super.h" 7#include "super.h"
8#include "mds_client.h"
8 9
9/* 10/*
10 * NFS export support 11 * NFS export support
@@ -42,32 +43,37 @@ struct ceph_nfs_confh {
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, 43static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable) 44 int connectable)
44{ 45{
46 int type;
45 struct ceph_nfs_fh *fh = (void *)rawfh; 47 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh; 48 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent; 49 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode; 50 struct inode *inode = dentry->d_inode;
49 int type; 51 int connected_handle_length = sizeof(*cfh)/4;
52 int handle_length = sizeof(*fh)/4;
50 53
51 /* don't re-export snaps */ 54 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP) 55 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL; 56 return -EINVAL;
54 57
55 if (*max_len >= sizeof(*cfh)) { 58 if (*max_len >= connected_handle_length) {
56 dout("encode_fh %p connectable\n", dentry); 59 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode); 60 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode); 61 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash; 62 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh); 63 *max_len = connected_handle_length;
61 type = 2; 64 type = 2;
62 } else if (*max_len > sizeof(*fh)) { 65 } else if (*max_len >= handle_length) {
63 if (connectable) 66 if (connectable) {
64 return -ENOSPC; 67 *max_len = connected_handle_length;
68 return 255;
69 }
65 dout("encode_fh %p\n", dentry); 70 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode); 71 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh); 72 *max_len = handle_length;
68 type = 1; 73 type = 1;
69 } else { 74 } else {
70 return -ENOSPC; 75 *max_len = handle_length;
76 return 255;
71 } 77 }
72 return type; 78 return type;
73} 79}
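The rewritten error handling follows the exportfs convention: when the caller's buffer is too small, report the required size in *max_len (counted in 32-bit words) and return 255 instead of -ENOSPC. An illustration, with sizes assuming the ceph_nfs_fh/ceph_nfs_confh layouts earlier in this file (not part of the patch):

    /* struct ceph_nfs_fh    { u64 ino; }                       -> 2 words
     * struct ceph_nfs_confh { u64 ino, parent_ino; u32 hash; } -> 5 words
     *
     * e.g.  *max_len == 5, connectable   -> type 2, *max_len = 5
     *       *max_len == 2, !connectable  -> type 1, *max_len = 2
     *       *max_len == 2, connectable   -> 255,    *max_len = 5
     *       *max_len  < 2                -> 255,    *max_len = 2
     */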
@@ -115,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 121static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 122 struct ceph_nfs_confh *cfh)
117{ 123{
118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; 124 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 125 struct inode *inode;
120 struct dentry *dentry; 126 struct dentry *dentry;
121 struct ceph_vino vino; 127 struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7c08698fad3..e77c28cf369 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/file.h> 6#include <linux/file.h>
@@ -38,8 +39,8 @@
38static struct ceph_mds_request * 39static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode) 40prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{ 41{
41 struct ceph_client *client = ceph_sb_to_client(sb); 42 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc; 43 struct ceph_mds_client *mdsc = fsc->mdsc;
43 struct ceph_mds_request *req; 44 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS; 45 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 46 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
117int ceph_open(struct inode *inode, struct file *file) 118int ceph_open(struct inode *inode, struct file *file)
118{ 119{
119 struct ceph_inode_info *ci = ceph_inode(inode); 120 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 121 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc; 122 struct ceph_mds_client *mdsc = fsc->mdsc;
122 struct ceph_mds_request *req; 123 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data; 124 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 125 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode, 217 struct nameidata *nd, int mode,
217 int locked_dir) 218 int locked_dir)
218{ 219{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 220 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc; 221 struct ceph_mds_client *mdsc = fsc->mdsc;
221 struct file *file = nd->intent.open.file; 222 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); 223 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req; 224 struct ceph_mds_request *req;
@@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file)
270} 271}
271 272
272/* 273/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
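These two helpers pair up around a direct I/O: get_user_pages() pins the user buffer, and each page reference is dropped once the I/O is done. A usage sketch (hypothetical caller, assuming the helpers above; not part of the patch):

    static ssize_t direct_rw_sketch(const char __user *data, loff_t off,
                                    size_t len)
    {
            int num_pages = calc_pages_for(off, len);
            struct page **pages;

            pages = get_direct_page_vector(data, num_pages, off, len);
            if (IS_ERR(pages))
                    return PTR_ERR(pages);

            /* ... issue the OSD read/write against pages ... */

            put_page_vector(pages, num_pages);   /* unpin via put_page() */
            return len;
    }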
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector new pages
319 */
320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
367 * copy user data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over 274 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.) 275 * objects we stripe over. (That's not atomic, but good enough for now.)
432 * 276 *
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
438 struct page **pages, int num_pages, 282 struct page **pages, int num_pages,
439 int *checkeof) 283 int *checkeof)
440{ 284{
441 struct ceph_client *client = ceph_inode_to_client(inode); 285 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode); 286 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len; 287 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 288 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
459 303
460more: 304more:
461 this_len = left; 305 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), 306 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len, 307 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq, 308 ci->i_truncate_seq,
465 ci->i_truncate_size, 309 ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
477 321
478 if (read < pos - off) { 322 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos); 323 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read, 324 ceph_zero_page_vector_range(page_off + read,
481 pos - off - read, pages); 325 pos - off - read, pages);
482 } 326 }
483 pos += ret; 327 pos += ret;
484 read = pos - off; 328 read = pos - off;
@@ -495,8 +339,8 @@ more:
495 /* was original extent fully inside i_size? */ 339 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) { 340 if (pos + left <= inode->i_size) {
497 dout("zero tail\n"); 341 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read, 342 ceph_zero_page_vector_range(page_off + read, len - read,
499 pages); 343 pages);
500 read = len; 344 read = len;
501 goto out; 345 goto out;
502 } 346 }
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 375 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532 376
533 if (file->f_flags & O_DIRECT) { 377 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len); 378 pages = ceph_get_direct_page_vector(data, num_pages, off, len);
535 379
536 /* 380 /*
537 * flush any page cache pages in this range. this 381 * flush any page cache pages in this range. this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 396 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553 397
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 398 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret); 399 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0) 400 if (ret >= 0)
557 *poff = off + ret; 401 *poff = off + ret;
558 402
559done: 403done:
560 if (file->f_flags & O_DIRECT) 404 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages); 405 ceph_put_page_vector(pages, num_pages);
562 else 406 else
563 ceph_release_page_vector(pages, num_pages); 407 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret); 408 dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
594{ 438{
595 struct inode *inode = file->f_dentry->d_inode; 439 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode); 440 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode); 441 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req; 442 struct ceph_osd_request *req;
599 struct page **pages; 443 struct page **pages;
600 int num_pages; 444 int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
642 */ 486 */
643more: 487more:
644 len = left; 488 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, 489 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len, 490 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags, 491 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context, 492 ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
655 num_pages = calc_pages_for(pos, len); 499 num_pages = calc_pages_for(pos, len);
656 500
657 if (file->f_flags & O_DIRECT) { 501 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len); 502 pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) { 503 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages); 504 ret = PTR_ERR(pages);
661 goto out; 505 goto out;
@@ -665,7 +509,7 @@ more:
665 * throw out any page cache pages in this range. this 509 * throw out any page cache pages in this range. this
666 * may block. 510 * may block.
667 */ 511 */
668 truncate_inode_pages_range(inode->i_mapping, pos, 512 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 513 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 514 } else {
671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 515 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
@@ -673,7 +517,7 @@ more:
673 ret = PTR_ERR(pages); 517 ret = PTR_ERR(pages);
674 goto out; 518 goto out;
675 } 519 }
676 ret = copy_user_to_page_vector(pages, data, pos, len); 520 ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
677 if (ret < 0) { 521 if (ret < 0) {
678 ceph_release_page_vector(pages, num_pages); 522 ceph_release_page_vector(pages, num_pages);
679 goto out; 523 goto out;
@@ -689,7 +533,7 @@ more:
689 req->r_num_pages = num_pages; 533 req->r_num_pages = num_pages;
690 req->r_inode = inode; 534 req->r_inode = inode;
691 535
692 ret = ceph_osdc_start_request(&client->osdc, req, false); 536 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
693 if (!ret) { 537 if (!ret) {
694 if (req->r_safe_callback) { 538 if (req->r_safe_callback) {
695 /* 539 /*
@@ -697,15 +541,15 @@ more:
697 * start_request so that a tid has been assigned. 541 * start_request so that a tid has been assigned.
698 */ 542 */
699 spin_lock(&ci->i_unsafe_lock); 543 spin_lock(&ci->i_unsafe_lock);
700 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); 544 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
701 spin_unlock(&ci->i_unsafe_lock); 545 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 546 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 } 547 }
704 ret = ceph_osdc_wait_request(&client->osdc, req); 548 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
705 } 549 }
706 550
707 if (file->f_flags & O_DIRECT) 551 if (file->f_flags & O_DIRECT)
708 put_page_vector(pages, num_pages); 552 ceph_put_page_vector(pages, num_pages);
709 else if (file->f_flags & O_SYNC) 553 else if (file->f_flags & O_SYNC)
710 ceph_release_page_vector(pages, num_pages); 554 ceph_release_page_vector(pages, num_pages);
711 555
@@ -740,28 +584,32 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
740 unsigned long nr_segs, loff_t pos) 584 unsigned long nr_segs, loff_t pos)
741{ 585{
742 struct file *filp = iocb->ki_filp; 586 struct file *filp = iocb->ki_filp;
587 struct ceph_file_info *fi = filp->private_data;
743 loff_t *ppos = &iocb->ki_pos; 588 loff_t *ppos = &iocb->ki_pos;
744 size_t len = iov->iov_len; 589 size_t len = iov->iov_len;
745 struct inode *inode = filp->f_dentry->d_inode; 590 struct inode *inode = filp->f_dentry->d_inode;
746 struct ceph_inode_info *ci = ceph_inode(inode); 591 struct ceph_inode_info *ci = ceph_inode(inode);
747 void *base = iov->iov_base; 592 void __user *base = iov->iov_base;
748 ssize_t ret; 593 ssize_t ret;
749 int got = 0; 594 int want, got = 0;
750 int checkeof = 0, read = 0; 595 int checkeof = 0, read = 0;
751 596
752 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 597 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
753 inode, ceph_vinop(inode), pos, (unsigned)len, inode); 598 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
754again: 599again:
755 __ceph_do_pending_vmtruncate(inode); 600 __ceph_do_pending_vmtruncate(inode);
756 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, 601 if (fi->fmode & CEPH_FILE_MODE_LAZY)
757 &got, -1); 602 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
603 else
604 want = CEPH_CAP_FILE_CACHE;
605 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
758 if (ret < 0) 606 if (ret < 0)
759 goto out; 607 goto out;
760 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 608 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
761 inode, ceph_vinop(inode), pos, (unsigned)len, 609 inode, ceph_vinop(inode), pos, (unsigned)len,
762 ceph_cap_string(got)); 610 ceph_cap_string(got));
763 611
764 if ((got & CEPH_CAP_FILE_CACHE) == 0 || 612 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
765 (iocb->ki_filp->f_flags & O_DIRECT) || 613 (iocb->ki_filp->f_flags & O_DIRECT) ||
766 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) 614 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
767 /* hmm, this isn't really async... */ 615 /* hmm, this isn't really async... */
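CEPH_FILE_MODE_LAZY widens the cap mask, letting the cached-read path run when the MDS has granted LAZYIO even though coherent FILE_CACHE is not held; the write side below makes the symmetric change with FILE_BUFFER. A summary of the negotiation (illustrative, not part of the patch):

    /* read:  want = CACHE            (normal open)
     *        want = CACHE | LAZYIO   (file flagged for lazy I/O)
     * write: want = BUFFER           (normal open)
     *        want = BUFFER | LAZYIO  (file flagged for lazy I/O)
     *
     * If none of the wanted caps come back, or O_DIRECT/MS_SYNCHRONOUS
     * is set, fall through to the synchronous path. */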
@@ -807,11 +655,13 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
807 unsigned long nr_segs, loff_t pos) 655 unsigned long nr_segs, loff_t pos)
808{ 656{
809 struct file *file = iocb->ki_filp; 657 struct file *file = iocb->ki_filp;
658 struct ceph_file_info *fi = file->private_data;
810 struct inode *inode = file->f_dentry->d_inode; 659 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 660 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 661 struct ceph_osd_client *osdc =
662 &ceph_sb_to_client(inode->i_sb)->client->osdc;
813 loff_t endoff = pos + iov->iov_len; 663 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 664 int want, got = 0;
815 int ret, err; 665 int ret, err;
816 666
817 if (ceph_snap(inode) != CEPH_NOSNAP) 667 if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -824,8 +674,11 @@ retry_snap:
824 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", 674 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
825 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 675 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
826 inode->i_size); 676 inode->i_size);
827 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 677 if (fi->fmode & CEPH_FILE_MODE_LAZY)
828 &got, endoff); 678 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
679 else
680 want = CEPH_CAP_FILE_BUFFER;
681 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
829 if (ret < 0) 682 if (ret < 0)
830 goto out; 683 goto out;
831 684
@@ -833,7 +686,7 @@ retry_snap:
833 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 686 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
834 ceph_cap_string(got)); 687 ceph_cap_string(got));
835 688
836 if ((got & CEPH_CAP_FILE_BUFFER) == 0 || 689 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
837 (iocb->ki_filp->f_flags & O_DIRECT) || 690 (iocb->ki_filp->f_flags & O_DIRECT) ||
838 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { 691 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
839 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, 692 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
@@ -930,6 +783,8 @@ const struct file_operations ceph_file_fops = {
930 .aio_write = ceph_aio_write, 783 .aio_write = ceph_aio_write,
931 .mmap = ceph_mmap, 784 .mmap = ceph_mmap,
932 .fsync = ceph_fsync, 785 .fsync = ceph_fsync,
786 .lock = ceph_lock,
787 .flock = ceph_flock,
933 .splice_read = generic_file_splice_read, 788 .splice_read = generic_file_splice_read,
934 .splice_write = generic_file_splice_write, 789 .splice_write = generic_file_splice_write,
935 .unlocked_ioctl = ceph_ioctl, 790 .unlocked_ioctl = ceph_ioctl,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 389f9dbd994..1d6a45b5a04 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -13,7 +13,8 @@
13#include <linux/pagevec.h> 13#include <linux/pagevec.h>
14 14
15#include "super.h" 15#include "super.h"
16#include "decode.h" 16#include "mds_client.h"
17#include <linux/ceph/decode.h>
17 18
18/* 19/*
19 * Ceph inode operations 20 * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 385 */
385 if (ci->i_snap_realm) { 386 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 387 struct ceph_mds_client *mdsc =
387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 388 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 389 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 390
390 dout(" dropping residual ref to snap realm %p\n", realm); 391 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -442,8 +443,9 @@ int ceph_fill_file_size(struct inode *inode, int issued,
442 * the file is either opened or mmaped 443 * the file is either opened or mmaped
443 */ 444 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| 445 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| 446 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) || 447 CEPH_CAP_FILE_EXCL|
448 CEPH_CAP_FILE_LAZYIO)) ||
447 mapping_mapped(inode->i_mapping) || 449 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) { 450 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++; 451 ci->i_truncate_pending++;
@@ -676,6 +678,7 @@ static int fill_inode(struct inode *inode,
676 if (ci->i_files == 0 && ci->i_subdirs == 0 && 678 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
677 ceph_snap(inode) == CEPH_NOSNAP && 679 ceph_snap(inode) == CEPH_NOSNAP &&
678 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && 680 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
681 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
679 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 682 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
680 dout(" marking %p complete (empty)\n", inode); 683 dout(" marking %p complete (empty)\n", inode);
681 ci->i_ceph_flags |= CEPH_I_COMPLETE; 684 ci->i_ceph_flags |= CEPH_I_COMPLETE;
@@ -683,7 +686,7 @@ static int fill_inode(struct inode *inode,
683 } 686 }
684 687
685 /* it may be better to set st_size in getattr instead? */ 688 /* it may be better to set st_size in getattr instead? */
686 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) 689 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
687 inode->i_size = ci->i_rbytes; 690 inode->i_size = ci->i_rbytes;
688 break; 691 break;
689 default: 692 default:
@@ -843,7 +846,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
843 * the caller) if we fail. 846 * the caller) if we fail.
844 */ 847 */
845static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, 848static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
846 bool *prehash) 849 bool *prehash, bool set_offset)
847{ 850{
848 struct dentry *realdn; 851 struct dentry *realdn;
849 852
@@ -875,7 +878,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
875 } 878 }
876 if ((!prehash || *prehash) && d_unhashed(dn)) 879 if ((!prehash || *prehash) && d_unhashed(dn))
877 d_rehash(dn); 880 d_rehash(dn);
878 ceph_set_dentry_offset(dn); 881 if (set_offset)
882 ceph_set_dentry_offset(dn);
879out: 883out:
880 return dn; 884 return dn;
881} 885}
@@ -898,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
898 struct inode *in = NULL; 902 struct inode *in = NULL;
899 struct ceph_mds_reply_inode *ininfo; 903 struct ceph_mds_reply_inode *ininfo;
900 struct ceph_vino vino; 904 struct ceph_vino vino;
901 struct ceph_client *client = ceph_sb_to_client(sb); 905 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
902 int i = 0; 906 int i = 0;
903 int err = 0; 907 int err = 0;
904 908
@@ -962,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
962 */ 966 */
963 if (rinfo->head->is_dentry && !req->r_aborted && 967 if (rinfo->head->is_dentry && !req->r_aborted &&
964 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 968 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
965 client->mount_args->snapdir_name, 969 fsc->mount_options->snapdir_name,
966 req->r_dentry->d_name.len))) { 970 req->r_dentry->d_name.len))) {
967 /* 971 /*
968 * lookup link rename : null -> possibly existing inode 972 * lookup link rename : null -> possibly existing inode
@@ -1060,7 +1064,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1060 d_delete(dn); 1064 d_delete(dn);
1061 goto done; 1065 goto done;
1062 } 1066 }
1063 dn = splice_dentry(dn, in, &have_lease); 1067 dn = splice_dentry(dn, in, &have_lease, true);
1064 if (IS_ERR(dn)) { 1068 if (IS_ERR(dn)) {
1065 err = PTR_ERR(dn); 1069 err = PTR_ERR(dn);
1066 goto done; 1070 goto done;
@@ -1103,7 +1107,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1103 goto done; 1107 goto done;
1104 } 1108 }
1105 dout(" linking snapped dir %p to dn %p\n", in, dn); 1109 dout(" linking snapped dir %p to dn %p\n", in, dn);
1106 dn = splice_dentry(dn, in, NULL); 1110 dn = splice_dentry(dn, in, NULL, true);
1107 if (IS_ERR(dn)) { 1111 if (IS_ERR(dn)) {
1108 err = PTR_ERR(dn); 1112 err = PTR_ERR(dn);
1109 goto done; 1113 goto done;
@@ -1228,14 +1232,14 @@ retry_lookup:
1228 in = dn->d_inode; 1232 in = dn->d_inode;
1229 } else { 1233 } else {
1230 in = ceph_get_inode(parent->d_sb, vino); 1234 in = ceph_get_inode(parent->d_sb, vino);
1231 if (in == NULL) { 1235 if (IS_ERR(in)) {
1232 dout("new_inode badness\n"); 1236 dout("new_inode badness\n");
1233 d_delete(dn); 1237 d_delete(dn);
1234 dput(dn); 1238 dput(dn);
1235 err = -ENOMEM; 1239 err = PTR_ERR(in);
1236 goto out; 1240 goto out;
1237 } 1241 }
1238 dn = splice_dentry(dn, in, NULL); 1242 dn = splice_dentry(dn, in, NULL, false);
1239 if (IS_ERR(dn)) 1243 if (IS_ERR(dn))
1240 dn = NULL; 1244 dn = NULL;
1241 } 1245 }
@@ -1530,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1530 struct inode *parent_inode = dentry->d_parent->d_inode; 1534 struct inode *parent_inode = dentry->d_parent->d_inode;
1531 const unsigned int ia_valid = attr->ia_valid; 1535 const unsigned int ia_valid = attr->ia_valid;
1532 struct ceph_mds_request *req; 1536 struct ceph_mds_request *req;
1533 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; 1537 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
1534 int issued; 1538 int issued;
1535 int release = 0, dirtied = 0; 1539 int release = 0, dirtied = 0;
1536 int mask = 0; 1540 int mask = 0;
@@ -1725,8 +1729,8 @@ out:
1725 */ 1729 */
1726int ceph_do_getattr(struct inode *inode, int mask) 1730int ceph_do_getattr(struct inode *inode, int mask)
1727{ 1731{
1728 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 1732 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
1729 struct ceph_mds_client *mdsc = &client->mdsc; 1733 struct ceph_mds_client *mdsc = fsc->mdsc;
1730 struct ceph_mds_request *req; 1734 struct ceph_mds_request *req;
1731 int err; 1735 int err;
1732 1736
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index d085f07756b..8888c9ba68d 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
1#include <linux/in.h> 1#include <linux/in.h>
2 2
3#include "ioctl.h"
4#include "super.h" 3#include "super.h"
5#include "ceph_debug.h" 4#include "mds_client.h"
5#include <linux/ceph/ceph_debug.h>
6
7#include "ioctl.h"
6 8
7 9
8/* 10/*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{ 39{
38 struct inode *inode = file->f_dentry->d_inode; 40 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 41 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req; 43 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l; 44 struct ceph_ioctl_layout l;
43 int err, i; 45 int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
90} 92}
91 93
92/* 94/*
 95 * Set a layout policy on a directory inode. All items created in the
 96 * tree rooted at this inode will inherit this layout on creation
 97 * (it does not apply retroactively), unless a subdirectory
 98 * sets its own layout policy.
 99 */
100static long ceph_ioctl_set_layout_policy(struct file *file, void __user *arg)
101{
102 struct inode *inode = file->f_dentry->d_inode;
103 struct ceph_mds_request *req;
104 struct ceph_ioctl_layout l;
105 int err, i;
106 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
107
108 /* copy and validate */
109 if (copy_from_user(&l, arg, sizeof(l)))
110 return -EFAULT;
111
112 if ((l.object_size & ~PAGE_MASK) ||
113 (l.stripe_unit & ~PAGE_MASK) ||
114 !l.stripe_unit ||
115 (l.object_size &&
116 (unsigned)l.object_size % (unsigned)l.stripe_unit))
117 return -EINVAL;
118
119 /* make sure it's a valid data pool */
120 if (l.data_pool > 0) {
121 mutex_lock(&mdsc->mutex);
122 err = -EINVAL;
123 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
124 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
125 err = 0;
126 break;
127 }
128 mutex_unlock(&mdsc->mutex);
129 if (err)
130 return err;
131 }
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
134 USE_AUTH_MDS);
135
136 if (IS_ERR(req))
137 return PTR_ERR(req);
138 req->r_inode = igrab(inode);
139
140 req->r_args.setlayout.layout.fl_stripe_unit =
141 cpu_to_le32(l.stripe_unit);
142 req->r_args.setlayout.layout.fl_stripe_count =
143 cpu_to_le32(l.stripe_count);
144 req->r_args.setlayout.layout.fl_object_size =
145 cpu_to_le32(l.object_size);
146 req->r_args.setlayout.layout.fl_pg_pool =
147 cpu_to_le32(l.data_pool);
148 req->r_args.setlayout.layout.fl_pg_preferred =
149 cpu_to_le32(l.preferred_osd);
150
151 err = ceph_mdsc_do_request(mdsc, inode, req);
152 ceph_mdsc_put_request(req);
153 return err;
154}
155
156/*
93 * Return object name, size/offset information, and location (OSD 157 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset. 158 * number, network address) for a given file offset.
95 */ 159 */
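
For illustration, here is a minimal user-space sketch of driving the new CEPH_IOC_SET_LAYOUT_POLICY ioctl on a directory of a mounted ceph filesystem. The mount path and layout values are assumptions; they merely satisfy the validation above (page-aligned stripe_unit/object_size, object_size a multiple of stripe_unit, data_pool <= 0 to skip the pool check):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* struct ceph_ioctl_layout, CEPH_IOC_SET_LAYOUT_POLICY */

int main(void)
{
	struct ceph_ioctl_layout l;
	int fd = open("/mnt/ceph/some/dir", O_RDONLY);	/* hypothetical mount */

	if (fd < 0)
		return 1;
	memset(&l, 0, sizeof(l));
	l.stripe_unit   = 4194304;	/* 4 MB, page-aligned */
	l.stripe_count  = 1;
	l.object_size   = 4194304;	/* a multiple of stripe_unit */
	l.data_pool     = 0;		/* <= 0 skips the pool validity check */
	l.preferred_osd = -1;		/* no preferred osd */
	if (ioctl(fd, CEPH_IOC_SET_LAYOUT_POLICY, &l) < 0)
		perror("CEPH_IOC_SET_LAYOUT_POLICY");
	close(fd);
	return 0;
}
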
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 162 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 163 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 164 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 165 struct ceph_osd_client *osdc =
166 &ceph_sb_to_client(inode->i_sb)->client->osdc;
102 u64 len = 1, olen; 167 u64 len = 1, olen;
103 u64 tmp; 168 u64 tmp;
104 struct ceph_object_layout ol; 169 struct ceph_object_layout ol;
@@ -143,6 +208,27 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
143 return 0; 208 return 0;
144} 209}
145 210
211static long ceph_ioctl_lazyio(struct file *file)
212{
213 struct ceph_file_info *fi = file->private_data;
214 struct inode *inode = file->f_dentry->d_inode;
215 struct ceph_inode_info *ci = ceph_inode(inode);
216
217 if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
218 spin_lock(&inode->i_lock);
219 ci->i_nr_by_mode[fi->fmode]--;
220 fi->fmode |= CEPH_FILE_MODE_LAZY;
221 ci->i_nr_by_mode[fi->fmode]++;
222 spin_unlock(&inode->i_lock);
223 dout("ioctl_layzio: file %p marked lazy\n", file);
224
225 ceph_check_caps(ci, 0, NULL);
226 } else {
227 dout("ioctl_layzio: file %p already lazy\n", file);
228 }
229 return 0;
230}
231
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 232long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{ 233{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); 234 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -153,8 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
153 case CEPH_IOC_SET_LAYOUT: 239 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg); 240 return ceph_ioctl_set_layout(file, (void __user *)arg);
155 241
242 case CEPH_IOC_SET_LAYOUT_POLICY:
243 return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
244
156 case CEPH_IOC_GET_DATALOC: 245 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 246 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
247
248 case CEPH_IOC_LAZYIO:
249 return ceph_ioctl_lazyio(file);
158 } 250 }
251
159 return -ENOTTY; 252 return -ENOTTY;
160} 253}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 25e4f1a9d05..a6ce54e94eb 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
4#include <linux/ioctl.h> 4#include <linux/ioctl.h>
5#include <linux/types.h> 5#include <linux/types.h>
6 6
7#define CEPH_IOCTL_MAGIC 0x97 7#define CEPH_IOCTL_MAGIC 0x98
8 8
9/* just use u64 to align sanely on all archs */ 9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout { 10struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
17 struct ceph_ioctl_layout) 17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ 18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout) 19 struct ceph_ioctl_layout)
20#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
21 struct ceph_ioctl_layout)
20 22
21/* 23/*
22 * Extract identity, address of the OSD and object storing a given 24 * Extract identity, address of the OSD and object storing a given
@@ -37,4 +39,6 @@ struct ceph_ioctl_dataloc {
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ 39#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc) 40 struct ceph_ioctl_dataloc)
39 41
42#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
43
40#endif 44#endif
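
Every command above embeds CEPH_IOCTL_MAGIC via _IO()/_IOW()/_IOWR(), so bumping the magic from 0x97 to 0x98 renumbers them all; binaries built against the old header fall through the switch in ceph_ioctl() and get -ENOTTY rather than a misinterpreted command. A small sketch decoding the packed fields with the standard <linux/ioctl.h> helpers:

#include <stdio.h>
#include <linux/ioctl.h>
#include "ioctl.h"

int main(void)
{
	/* each ioctl number packs direction, size, magic ("type"), and
	 * a per-command nr */
	printf("LAZYIO:            type 0x%x nr %u\n",
	       _IOC_TYPE(CEPH_IOC_LAZYIO), _IOC_NR(CEPH_IOC_LAZYIO));
	printf("SET_LAYOUT_POLICY: type 0x%x nr %u size %u\n",
	       _IOC_TYPE(CEPH_IOC_SET_LAYOUT_POLICY),
	       _IOC_NR(CEPH_IOC_SET_LAYOUT_POLICY),
	       _IOC_SIZE(CEPH_IOC_SET_LAYOUT_POLICY));
	return 0;
}
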
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
new file mode 100644
index 00000000000..40abde93c34
--- /dev/null
+++ b/fs/ceph/locks.c
@@ -0,0 +1,273 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/file.h>
4#include <linux/namei.h>
5
6#include "super.h"
7#include "mds_client.h"
8#include <linux/ceph/pagelist.h>
9
10/**
11 * Implement fcntl and flock locking functions.
12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns,
15 int cmd, u64 start, u64 length, u8 wait)
16{
17 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc =
19 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req;
21 int err;
22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req))
25 return PTR_ERR(req);
26 req->r_inode = igrab(inode);
27
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd);
31
32 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid);
 35 /* This should be adjusted, but it is not clear whether pid
 36 namespaces are assigned stable id numbers */
37 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns);
39 req->r_args.filelock_change.start = cpu_to_le64(start);
40 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait;
42
43 err = ceph_mdsc_do_request(mdsc, inode, req);
44 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err);
48 return err;
49}
50
51/**
52 * Attempt to set an fcntl lock.
 53 * For now, this is simply passed through to the MDS; smarter local handling may come later.
54 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{
57 u64 length;
58 u8 lock_cmd;
59 int err;
60 u8 wait = 0;
61 u16 op = CEPH_MDS_OP_SETFILELOCK;
62
63 fl->fl_nspid = get_pid(task_tgid(current));
64 dout("ceph_lock, fl_pid:%d", fl->fl_pid);
65
 66 /* set the wait bit as appropriate, then translate the command into the form Ceph expects */
67 if (F_SETLKW == cmd)
68 wait = 1;
69 if (F_GETLK == cmd)
70 op = CEPH_MDS_OP_GETFILELOCK;
71
72 if (F_RDLCK == fl->fl_type)
73 lock_cmd = CEPH_LOCK_SHARED;
74 else if (F_WRLCK == fl->fl_type)
75 lock_cmd = CEPH_LOCK_EXCL;
76 else
77 lock_cmd = CEPH_LOCK_UNLOCK;
78
79 if (LLONG_MAX == fl->fl_end)
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
87 lock_cmd, fl->fl_start,
88 length, wait);
89 if (!err) {
90 dout("mds locked, locking locally");
91 err = posix_lock_file(file, fl, NULL);
92 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
93 /* undo! This should only happen if the kernel detects
94 * local deadlock. */
95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
96 (u64)fl->fl_pid,
97 (u64)(unsigned long)fl->fl_nspid,
98 CEPH_LOCK_UNLOCK, fl->fl_start,
99 length, 0);
100 dout("got %d on posix_lock_file, undid lock", err);
101 }
102 } else {
103 dout("mds returned error code %d", err);
104 }
105 return err;
106}
107
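
User space reaches ceph_lock() through ordinary fcntl() byte-range locking. A hedged sketch (the mount path is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct flock fl = {
		.l_type   = F_WRLCK,	/* maps to CEPH_LOCK_EXCL above */
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,		/* to EOF: fl_end becomes LLONG_MAX,
					 * sent as length 0 on the wire */
	};
	int fd = open("/mnt/ceph/somefile", O_RDWR);	/* hypothetical path */

	if (fd < 0)
		return 1;
	if (fcntl(fd, F_SETLKW, &fl) < 0)	/* F_SETLKW -> wait = 1 */
		perror("F_SETLKW");
	fl.l_type = F_UNLCK;			/* -> CEPH_LOCK_UNLOCK */
	fcntl(fd, F_SETLK, &fl);
	close(fd);
	return 0;
}
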
108int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
109{
110 u64 length;
111 u8 lock_cmd;
112 int err;
113 u8 wait = 1;
114
115 fl->fl_nspid = get_pid(task_tgid(current));
116 dout("ceph_flock, fl_pid:%d", fl->fl_pid);
117
118 /* set wait bit, then clear it out of cmd*/
119 if (cmd & LOCK_NB)
120 wait = 0;
121 cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
122 /* set command sequence that Ceph wants to see:
123 shared lock, exclusive lock, or unlock */
124 if (LOCK_SH == cmd)
125 lock_cmd = CEPH_LOCK_SHARED;
126 else if (LOCK_EX == cmd)
127 lock_cmd = CEPH_LOCK_EXCL;
128 else
129 lock_cmd = CEPH_LOCK_UNLOCK;
130 /* mds requires start and length rather than start and end */
131 if (LLONG_MAX == fl->fl_end)
132 length = 0;
133 else
134 length = fl->fl_end - fl->fl_start + 1;
135
136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
137 file, (u64)fl->fl_pid,
138 (u64)(unsigned long)fl->fl_nspid,
139 lock_cmd, fl->fl_start,
140 length, wait);
141 if (!err) {
142 err = flock_lock_file_wait(file, fl);
143 if (err) {
144 ceph_lock_message(CEPH_LOCK_FLOCK,
145 CEPH_MDS_OP_SETFILELOCK,
146 file, (u64)fl->fl_pid,
147 (u64)(unsigned long)fl->fl_nspid,
148 CEPH_LOCK_UNLOCK, fl->fl_start,
149 length, 0);
150 dout("got %d on flock_lock_file_wait, undid lock", err);
151 }
152 } else {
153 dout("mds error code %d", err);
154 }
155 return err;
156}
157
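
ceph_flock() is reached the same way via flock(2); LOCK_NB is what clears the wait bit. An equally minimal sketch:

#include <stdio.h>
#include <sys/file.h>

/* Try a non-blocking exclusive flock on an open ceph fd:
 * LOCK_EX maps to CEPH_LOCK_EXCL, LOCK_NB makes wait = 0. */
static int try_exclusive_flock(int fd)
{
	if (flock(fd, LOCK_EX | LOCK_NB) < 0) {
		perror("flock");
		return -1;
	}
	return 0;
}
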
158/**
 159 * Must be called with lock_flocks() already held. Fills in the passed
 160 * counter variables, so the caller can prepare pagelist metadata before
 161 * calling ceph_encode_locks.
162 */
163void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
164{
165 struct file_lock *lock;
166
167 *fcntl_count = 0;
168 *flock_count = 0;
169
170 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
171 if (lock->fl_flags & FL_POSIX)
172 ++(*fcntl_count);
173 else if (lock->fl_flags & FL_FLOCK)
174 ++(*flock_count);
175 }
176 dout("counted %d flock locks and %d fcntl locks",
177 *flock_count, *fcntl_count);
178}
179
180/**
181 * Encode the flock and fcntl locks for the given inode into the pagelist.
182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
183 * sequential flock locks.
184 * Must be called with lock_flocks() already held.
185 * If we encounter more of a specific lock type than expected,
 186 * we return -ENOSPC.
187 */
188int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
189 int num_fcntl_locks, int num_flock_locks)
190{
191 struct file_lock *lock;
192 struct ceph_filelock cephlock;
193 int err = 0;
194 int seen_fcntl = 0;
195 int seen_flock = 0;
196
197 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
198 num_fcntl_locks);
199 err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
200 if (err)
201 goto fail;
202 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
203 if (lock->fl_flags & FL_POSIX) {
204 ++seen_fcntl;
205 if (seen_fcntl > num_fcntl_locks) {
206 err = -ENOSPC;
207 goto fail;
208 }
209 err = lock_to_ceph_filelock(lock, &cephlock);
210 if (err)
211 goto fail;
212 err = ceph_pagelist_append(pagelist, &cephlock,
213 sizeof(struct ceph_filelock));
214 }
215 if (err)
216 goto fail;
217 }
218
219 err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
220 if (err)
221 goto fail;
222 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
223 if (lock->fl_flags & FL_FLOCK) {
224 ++seen_flock;
225 if (seen_flock > num_flock_locks) {
226 err = -ENOSPC;
227 goto fail;
228 }
229 err = lock_to_ceph_filelock(lock, &cephlock);
230 if (err)
231 goto fail;
232 err = ceph_pagelist_append(pagelist, &cephlock,
233 sizeof(struct ceph_filelock));
234 }
235 if (err)
236 goto fail;
237 }
238fail:
239 return err;
240}
241
242/*
243 * Given a pointer to a lock, convert it to a ceph filelock
244 */
245int lock_to_ceph_filelock(struct file_lock *lock,
246 struct ceph_filelock *cephlock)
247{
248 int err = 0;
249
250 cephlock->start = cpu_to_le64(lock->fl_start);
251 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
252 cephlock->client = cpu_to_le64(0);
253 cephlock->pid = cpu_to_le64(lock->fl_pid);
254 cephlock->pid_namespace =
255 cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
256
257 switch (lock->fl_type) {
258 case F_RDLCK:
259 cephlock->type = CEPH_LOCK_SHARED;
260 break;
261 case F_WRLCK:
262 cephlock->type = CEPH_LOCK_EXCL;
263 break;
264 case F_UNLCK:
265 cephlock->type = CEPH_LOCK_UNLOCK;
266 break;
267 default:
268 dout("Have unknown lock type %d", lock->fl_type);
269 err = -EINVAL;
270 }
271
272 return err;
273}
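
Taken together, ceph_count_locks(), ceph_encode_locks() and lock_to_ceph_filelock() produce a payload with the framing sketched below; the size helper mirrors the flock_len computation used by the reconnect path in mds_client.c (struct ceph_filelock is assumed to come from ceph_fs.h):

#include <linux/types.h>
#include <linux/ceph/ceph_fs.h>	/* struct ceph_filelock */

/* Framing produced by ceph_encode_locks():
 *
 *	u32			num_fcntl_locks
 *	struct ceph_filelock	fcntl_locks[num_fcntl_locks]
 *	u32			num_flock_locks
 *	struct ceph_filelock	flock_locks[num_flock_locks]
 */
static inline size_t ceph_locks_payload_len(int num_fcntl_locks,
					    int num_flock_locks)
{
	return 2 * sizeof(u32) +
	       (num_fcntl_locks + num_flock_locks) *
	       sizeof(struct ceph_filelock);
}
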
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index dd440bd438a..3142b15940c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,16 +1,21 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h>
3#include <linux/wait.h> 4#include <linux/wait.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9#include <linux/smp_lock.h>
6 10
7#include "mds_client.h"
8#include "mon_client.h"
9#include "super.h" 11#include "super.h"
10#include "messenger.h" 12#include "mds_client.h"
11#include "decode.h" 13
12#include "auth.h" 14#include <linux/ceph/messenger.h>
13#include "pagelist.h" 15#include <linux/ceph/decode.h>
16#include <linux/ceph/pagelist.h>
17#include <linux/ceph/auth.h>
18#include <linux/ceph/debugfs.h>
14 19
15/* 20/*
16 * A cluster of MDS (metadata server) daemons is responsible for 21 * A cluster of MDS (metadata server) daemons is responsible for
@@ -37,6 +42,11 @@
37 * are no longer valid. 42 * are no longer valid.
38 */ 43 */
39 44
45struct ceph_reconnect_state {
46 struct ceph_pagelist *pagelist;
47 bool flock;
48};
49
40static void __wake_requests(struct ceph_mds_client *mdsc, 50static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 51 struct list_head *head);
42 52
@@ -280,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
280 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 290 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
281 if (atomic_dec_and_test(&s->s_ref)) { 291 if (atomic_dec_and_test(&s->s_ref)) {
282 if (s->s_authorizer) 292 if (s->s_authorizer)
283 s->s_mdsc->client->monc.auth->ops->destroy_authorizer( 293 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
284 s->s_mdsc->client->monc.auth, s->s_authorizer); 294 s->s_mdsc->fsc->client->monc.auth,
295 s->s_authorizer);
285 kfree(s); 296 kfree(s);
286 } 297 }
287} 298}
@@ -338,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
338 s->s_seq = 0; 349 s->s_seq = 0;
339 mutex_init(&s->s_mutex); 350 mutex_init(&s->s_mutex);
340 351
341 ceph_con_init(mdsc->client->msgr, &s->s_con); 352 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
342 s->s_con.private = s; 353 s->s_con.private = s;
343 s->s_con.ops = &mds_con_ops; 354 s->s_con.ops = &mds_con_ops;
344 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; 355 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -449,7 +460,7 @@ void ceph_mdsc_release_request(struct kref *kref)
449 kfree(req->r_path1); 460 kfree(req->r_path1);
450 kfree(req->r_path2); 461 kfree(req->r_path2);
451 put_request_session(req); 462 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation); 463 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
453 kfree(req); 464 kfree(req);
454} 465}
455 466
@@ -512,7 +523,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
512{ 523{
513 req->r_tid = ++mdsc->last_tid; 524 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps) 525 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); 526 ceph_reserve_caps(mdsc, &req->r_caps_reservation,
527 req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid); 528 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req); 529 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req); 530 __insert_request(mdsc, req);
@@ -553,6 +565,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
553 * 565 *
554 * Called under mdsc->mutex. 566 * Called under mdsc->mutex.
555 */ 567 */
 568static struct dentry *get_nonsnap_parent(struct dentry *dentry)
569{
570 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
571 dentry = dentry->d_parent;
572 return dentry;
573}
574
556static int __choose_mds(struct ceph_mds_client *mdsc, 575static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req) 576 struct ceph_mds_request *req)
558{ 577{
@@ -583,14 +602,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
583 if (req->r_inode) { 602 if (req->r_inode) {
584 inode = req->r_inode; 603 inode = req->r_inode;
585 } else if (req->r_dentry) { 604 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) { 605 struct inode *dir = req->r_dentry->d_parent->d_inode;
606
607 if (dir->i_sb != mdsc->fsc->sb) {
608 /* not this fs! */
609 inode = req->r_dentry->d_inode;
610 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
611 /* direct snapped/virtual snapdir requests
612 * based on parent dir inode */
613 struct dentry *dn =
614 get_nonsnap_parent(req->r_dentry->d_parent);
615 inode = dn->d_inode;
616 dout("__choose_mds using nonsnap parent %p\n", inode);
617 } else if (req->r_dentry->d_inode) {
618 /* dentry target */
587 inode = req->r_dentry->d_inode; 619 inode = req->r_dentry->d_inode;
588 } else { 620 } else {
589 inode = req->r_dentry->d_parent->d_inode; 621 /* dir + name */
622 inode = dir;
590 hash = req->r_dentry->d_name.hash; 623 hash = req->r_dentry->d_name.hash;
591 is_hash = true; 624 is_hash = true;
592 } 625 }
593 } 626 }
627
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, 628 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode); 629 (int)hash, mode);
596 if (!inode) 630 if (!inode)
@@ -704,6 +738,51 @@ static int __open_session(struct ceph_mds_client *mdsc,
704} 738}
705 739
706/* 740/*
741 * open sessions for any export targets for the given mds
742 *
743 * called under mdsc->mutex
744 */
745static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
746 struct ceph_mds_session *session)
747{
748 struct ceph_mds_info *mi;
749 struct ceph_mds_session *ts;
750 int i, mds = session->s_mds;
751 int target;
752
753 if (mds >= mdsc->mdsmap->m_max_mds)
754 return;
755 mi = &mdsc->mdsmap->m_info[mds];
756 dout("open_export_target_sessions for mds%d (%d targets)\n",
757 session->s_mds, mi->num_export_targets);
758
759 for (i = 0; i < mi->num_export_targets; i++) {
760 target = mi->export_targets[i];
761 ts = __ceph_lookup_mds_session(mdsc, target);
762 if (!ts) {
763 ts = register_session(mdsc, target);
764 if (IS_ERR(ts))
765 return;
766 }
 767 if (ts->s_state == CEPH_MDS_SESSION_NEW ||
 768 ts->s_state == CEPH_MDS_SESSION_CLOSING)
 769 __open_session(mdsc, ts);
 770 else
 771 dout(" mds%d target mds%d %p is %s\n", session->s_mds,
 772 target, ts, session_state_name(ts->s_state));
773 ceph_put_mds_session(ts);
774 }
775}
776
777void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
778 struct ceph_mds_session *session)
779{
780 mutex_lock(&mdsc->mutex);
781 __open_export_target_sessions(mdsc, session);
782 mutex_unlock(&mdsc->mutex);
783}
784
785/*
707 * session caps 786 * session caps
708 */ 787 */
709 788
@@ -764,7 +843,7 @@ static int iterate_session_caps(struct ceph_mds_session *session,
764 last_inode = NULL; 843 last_inode = NULL;
765 } 844 }
766 if (old_cap) { 845 if (old_cap) {
767 ceph_put_cap(old_cap); 846 ceph_put_cap(session->s_mdsc, old_cap);
768 old_cap = NULL; 847 old_cap = NULL;
769 } 848 }
770 849
@@ -793,7 +872,7 @@ out:
793 if (last_inode) 872 if (last_inode)
794 iput(last_inode); 873 iput(last_inode);
795 if (old_cap) 874 if (old_cap)
796 ceph_put_cap(old_cap); 875 ceph_put_cap(session->s_mdsc, old_cap);
797 876
798 return ret; 877 return ret;
799} 878}
@@ -810,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
810 __ceph_remove_cap(cap); 889 __ceph_remove_cap(cap);
811 if (!__ceph_is_any_real_caps(ci)) { 890 if (!__ceph_is_any_real_caps(ci)) {
812 struct ceph_mds_client *mdsc = 891 struct ceph_mds_client *mdsc =
813 &ceph_sb_to_client(inode->i_sb)->mdsc; 892 ceph_sb_to_client(inode->i_sb)->mdsc;
814 893
815 spin_lock(&mdsc->cap_dirty_lock); 894 spin_lock(&mdsc->cap_dirty_lock);
816 if (!list_empty(&ci->i_dirty_item)) { 895 if (!list_empty(&ci->i_dirty_item)) {
@@ -1067,15 +1146,16 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1067 * Called under s_mutex. 1146 * Called under s_mutex.
1068 */ 1147 */
1069int ceph_add_cap_releases(struct ceph_mds_client *mdsc, 1148int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1070 struct ceph_mds_session *session, 1149 struct ceph_mds_session *session)
1071 int extra)
1072{ 1150{
1073 struct ceph_msg *msg; 1151 struct ceph_msg *msg, *partial = NULL;
1074 struct ceph_mds_cap_release *head; 1152 struct ceph_mds_cap_release *head;
1075 int err = -ENOMEM; 1153 int err = -ENOMEM;
1154 int extra = mdsc->fsc->mount_options->cap_release_safety;
1155 int num;
1076 1156
1077 if (extra < 0) 1157 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
1078 extra = mdsc->client->mount_args->cap_release_safety; 1158 extra);
1079 1159
1080 spin_lock(&session->s_cap_lock); 1160 spin_lock(&session->s_cap_lock);
1081 1161
@@ -1084,9 +1164,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1084 struct ceph_msg, 1164 struct ceph_msg,
1085 list_head); 1165 list_head);
1086 head = msg->front.iov_base; 1166 head = msg->front.iov_base;
1087 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); 1167 num = le32_to_cpu(head->num);
1168 if (num) {
1169 dout(" partial %p with (%d/%d)\n", msg, num,
1170 (int)CEPH_CAPS_PER_RELEASE);
1171 extra += CEPH_CAPS_PER_RELEASE - num;
1172 partial = msg;
1173 }
1088 } 1174 }
1089
1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1175 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1091 spin_unlock(&session->s_cap_lock); 1176 spin_unlock(&session->s_cap_lock);
1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1177 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
@@ -1103,19 +1188,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1103 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; 1188 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1104 } 1189 }
1105 1190
1106 if (!list_empty(&session->s_cap_releases)) { 1191 if (partial) {
1107 msg = list_first_entry(&session->s_cap_releases, 1192 head = partial->front.iov_base;
1108 struct ceph_msg, 1193 num = le32_to_cpu(head->num);
1109 list_head); 1194 dout(" queueing partial %p with %d/%d\n", partial, num,
1110 head = msg->front.iov_base; 1195 (int)CEPH_CAPS_PER_RELEASE);
1111 if (head->num) { 1196 list_move_tail(&partial->list_head,
1112 dout(" queueing non-full %p (%d)\n", msg, 1197 &session->s_cap_releases_done);
1113 le32_to_cpu(head->num)); 1198 session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
1114 list_move_tail(&msg->list_head,
1115 &session->s_cap_releases_done);
1116 session->s_num_cap_releases -=
1117 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1118 }
1119 } 1199 }
1120 err = 0; 1200 err = 0;
1121 spin_unlock(&session->s_cap_lock); 1201 spin_unlock(&session->s_cap_lock);
@@ -1250,6 +1330,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1250 return ERR_PTR(-ENOMEM); 1330 return ERR_PTR(-ENOMEM);
1251 1331
1252 mutex_init(&req->r_fill_mutex); 1332 mutex_init(&req->r_fill_mutex);
1333 req->r_mdsc = mdsc;
1253 req->r_started = jiffies; 1334 req->r_started = jiffies;
1254 req->r_resend_mds = -1; 1335 req->r_resend_mds = -1;
1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1336 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1580,6 +1661,15 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1580 1661
1581 req->r_mds = mds; 1662 req->r_mds = mds;
1582 req->r_attempts++; 1663 req->r_attempts++;
1664 if (req->r_inode) {
1665 struct ceph_cap *cap =
1666 ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
1667
1668 if (cap)
1669 req->r_sent_on_mseq = cap->mseq;
1670 else
1671 req->r_sent_on_mseq = -1;
1672 }
1583 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, 1673 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1584 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1674 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1585 1675
@@ -1914,21 +2004,40 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1914 result = le32_to_cpu(head->result); 2004 result = le32_to_cpu(head->result);
1915 2005
1916 /* 2006 /*
 1917 * Tolerate 2 consecutive ESTALEs from the same mds. 2007 * Handle an ESTALE:
 1918 * FIXME: we should be looking at the cap migrate_seq. 2008 * if we're not talking to the authority, resend to it;
 2009 * if the authority has changed while we weren't looking,
 2010 * resend to the new authority;
 2011 * otherwise we have no choice but to return the ESTALE.
1919 */ 2012 */
1920 if (result == -ESTALE) { 2013 if (result == -ESTALE) {
1921 req->r_direct_mode = USE_AUTH_MDS; 2014 dout("got ESTALE on request %llu", req->r_tid);
1922 req->r_num_stale++; 2015 if (!req->r_inode) {
1923 if (req->r_num_stale <= 2) { 2016 /* do nothing; not an authority problem */
2017 } else if (req->r_direct_mode != USE_AUTH_MDS) {
2018 dout("not using auth, setting for that now");
2019 req->r_direct_mode = USE_AUTH_MDS;
1924 __do_request(mdsc, req); 2020 __do_request(mdsc, req);
1925 mutex_unlock(&mdsc->mutex); 2021 mutex_unlock(&mdsc->mutex);
1926 goto out; 2022 goto out;
2023 } else {
2024 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2025 struct ceph_cap *cap =
 2026 ceph_get_cap_for_mds(ci, req->r_mds);
2027
2028 dout("already using auth");
2029 if ((!cap || cap != ci->i_auth_cap) ||
2030 (cap->mseq != req->r_sent_on_mseq)) {
2031 dout("but cap changed, so resending");
2032 __do_request(mdsc, req);
2033 mutex_unlock(&mdsc->mutex);
2034 goto out;
2035 }
1927 } 2036 }
1928 } else { 2037 dout("have to return ESTALE on request %llu", req->r_tid);
1929 req->r_num_stale = 0;
1930 } 2038 }
1931 2039
2040
1932 if (head->safe) { 2041 if (head->safe) {
1933 req->r_got_safe = true; 2042 req->r_got_safe = true;
1934 __unregister_request(mdsc, req); 2043 __unregister_request(mdsc, req);
@@ -1981,11 +2090,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1981 2090
1982 /* insert trace into our cache */ 2091 /* insert trace into our cache */
1983 mutex_lock(&req->r_fill_mutex); 2092 mutex_lock(&req->r_fill_mutex);
1984 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 2093 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
1985 if (err == 0) { 2094 if (err == 0) {
1986 if (result == 0 && rinfo->dir_nr) 2095 if (result == 0 && rinfo->dir_nr)
1987 ceph_readdir_prepopulate(req, req->r_session); 2096 ceph_readdir_prepopulate(req, req->r_session);
1988 ceph_unreserve_caps(&req->r_caps_reservation); 2097 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
1989 } 2098 }
1990 mutex_unlock(&req->r_fill_mutex); 2099 mutex_unlock(&req->r_fill_mutex);
1991 2100
@@ -2005,7 +2114,7 @@ out_err:
2005 } 2114 }
2006 mutex_unlock(&mdsc->mutex); 2115 mutex_unlock(&mdsc->mutex);
2007 2116
2008 ceph_add_cap_releases(mdsc, req->r_session, -1); 2117 ceph_add_cap_releases(mdsc, req->r_session);
2009 mutex_unlock(&session->s_mutex); 2118 mutex_unlock(&session->s_mutex);
2010 2119
2011 /* kick calling process */ 2120 /* kick calling process */
@@ -2126,7 +2235,7 @@ static void handle_session(struct ceph_mds_session *session,
2126 pr_info("mds%d reconnect denied\n", session->s_mds); 2235 pr_info("mds%d reconnect denied\n", session->s_mds);
2127 remove_session_caps(session); 2236 remove_session_caps(session);
2128 wake = 1; /* for good measure */ 2237 wake = 1; /* for good measure */
2129 complete_all(&mdsc->session_close_waiters); 2238 wake_up_all(&mdsc->session_close_wq);
2130 kick_requests(mdsc, mds); 2239 kick_requests(mdsc, mds);
2131 break; 2240 break;
2132 2241
@@ -2193,9 +2302,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2193static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, 2302static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2194 void *arg) 2303 void *arg)
2195{ 2304{
2196 struct ceph_mds_cap_reconnect rec; 2305 union {
2306 struct ceph_mds_cap_reconnect v2;
2307 struct ceph_mds_cap_reconnect_v1 v1;
2308 } rec;
2309 size_t reclen;
2197 struct ceph_inode_info *ci; 2310 struct ceph_inode_info *ci;
2198 struct ceph_pagelist *pagelist = arg; 2311 struct ceph_reconnect_state *recon_state = arg;
2312 struct ceph_pagelist *pagelist = recon_state->pagelist;
2199 char *path; 2313 char *path;
2200 int pathlen, err; 2314 int pathlen, err;
2201 u64 pathbase; 2315 u64 pathbase;
@@ -2215,7 +2329,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2215 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); 2329 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2216 if (IS_ERR(path)) { 2330 if (IS_ERR(path)) {
2217 err = PTR_ERR(path); 2331 err = PTR_ERR(path);
2218 BUG_ON(err); 2332 goto out_dput;
2219 } 2333 }
2220 } else { 2334 } else {
2221 path = NULL; 2335 path = NULL;
@@ -2223,25 +2337,71 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2223 } 2337 }
2224 err = ceph_pagelist_encode_string(pagelist, path, pathlen); 2338 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2225 if (err) 2339 if (err)
2226 goto out; 2340 goto out_free;
2227 2341
2228 spin_lock(&inode->i_lock); 2342 spin_lock(&inode->i_lock);
2229 cap->seq = 0; /* reset cap seq */ 2343 cap->seq = 0; /* reset cap seq */
2230 cap->issue_seq = 0; /* and issue_seq */ 2344 cap->issue_seq = 0; /* and issue_seq */
2231 rec.cap_id = cpu_to_le64(cap->cap_id); 2345
2232 rec.pathbase = cpu_to_le64(pathbase); 2346 if (recon_state->flock) {
2233 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 2347 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
2234 rec.issued = cpu_to_le32(cap->issued); 2348 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2235 rec.size = cpu_to_le64(inode->i_size); 2349 rec.v2.issued = cpu_to_le32(cap->issued);
2236 ceph_encode_timespec(&rec.mtime, &inode->i_mtime); 2350 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2237 ceph_encode_timespec(&rec.atime, &inode->i_atime); 2351 rec.v2.pathbase = cpu_to_le64(pathbase);
2238 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2352 rec.v2.flock_len = 0;
2353 reclen = sizeof(rec.v2);
2354 } else {
2355 rec.v1.cap_id = cpu_to_le64(cap->cap_id);
2356 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2357 rec.v1.issued = cpu_to_le32(cap->issued);
2358 rec.v1.size = cpu_to_le64(inode->i_size);
2359 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
2360 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
2361 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2362 rec.v1.pathbase = cpu_to_le64(pathbase);
2363 reclen = sizeof(rec.v1);
2364 }
2239 spin_unlock(&inode->i_lock); 2365 spin_unlock(&inode->i_lock);
2240 2366
2241 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); 2367 if (recon_state->flock) {
2368 int num_fcntl_locks, num_flock_locks;
2369 struct ceph_pagelist_cursor trunc_point;
2370
2371 ceph_pagelist_set_cursor(pagelist, &trunc_point);
2372 do {
2373 lock_flocks();
2374 ceph_count_locks(inode, &num_fcntl_locks,
2375 &num_flock_locks);
2376 rec.v2.flock_len = (2*sizeof(u32) +
2377 (num_fcntl_locks+num_flock_locks) *
2378 sizeof(struct ceph_filelock));
2379 unlock_flocks();
2380
2381 /* pre-alloc pagelist */
2382 ceph_pagelist_truncate(pagelist, &trunc_point);
2383 err = ceph_pagelist_append(pagelist, &rec, reclen);
2384 if (!err)
2385 err = ceph_pagelist_reserve(pagelist,
2386 rec.v2.flock_len);
2387
2388 /* encode locks */
2389 if (!err) {
2390 lock_flocks();
2391 err = ceph_encode_locks(inode,
2392 pagelist,
2393 num_fcntl_locks,
2394 num_flock_locks);
2395 unlock_flocks();
2396 }
2397 } while (err == -ENOSPC);
2398 } else {
2399 err = ceph_pagelist_append(pagelist, &rec, reclen);
2400 }
2242 2401
2243out: 2402out_free:
2244 kfree(path); 2403 kfree(path);
2404out_dput:
2245 dput(dentry); 2405 dput(dentry);
2246 return err; 2406 return err;
2247} 2407}
@@ -2267,6 +2427,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2267 int mds = session->s_mds; 2427 int mds = session->s_mds;
2268 int err = -ENOMEM; 2428 int err = -ENOMEM;
2269 struct ceph_pagelist *pagelist; 2429 struct ceph_pagelist *pagelist;
2430 struct ceph_reconnect_state recon_state;
2270 2431
2271 pr_info("mds%d reconnect start\n", mds); 2432 pr_info("mds%d reconnect start\n", mds);
2272 2433
@@ -2301,7 +2462,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2301 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2462 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2302 if (err) 2463 if (err)
2303 goto fail; 2464 goto fail;
2304 err = iterate_session_caps(session, encode_caps_cb, pagelist); 2465
2466 recon_state.pagelist = pagelist;
2467 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2468 err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2305 if (err < 0) 2469 if (err < 0)
2306 goto fail; 2470 goto fail;
2307 2471
@@ -2326,6 +2490,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2326 } 2490 }
2327 2491
2328 reply->pagelist = pagelist; 2492 reply->pagelist = pagelist;
2493 if (recon_state.flock)
2494 reply->hdr.version = cpu_to_le16(2);
2329 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2495 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2330 reply->nr_pages = calc_pages_for(0, pagelist->length); 2496 reply->nr_pages = calc_pages_for(0, pagelist->length);
2331 ceph_con_send(&session->s_con, reply); 2497 ceph_con_send(&session->s_con, reply);
@@ -2376,9 +2542,11 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2376 oldstate = ceph_mdsmap_get_state(oldmap, i); 2542 oldstate = ceph_mdsmap_get_state(oldmap, i);
2377 newstate = ceph_mdsmap_get_state(newmap, i); 2543 newstate = ceph_mdsmap_get_state(newmap, i);
2378 2544
2379 dout("check_new_map mds%d state %s -> %s (session %s)\n", 2545 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
2380 i, ceph_mds_state_name(oldstate), 2546 i, ceph_mds_state_name(oldstate),
2547 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2381 ceph_mds_state_name(newstate), 2548 ceph_mds_state_name(newstate),
2549 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2382 session_state_name(s->s_state)); 2550 session_state_name(s->s_state));
2383 2551
2384 if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2552 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2428,6 +2596,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2428 wake_up_session_caps(s, 1); 2596 wake_up_session_caps(s, 1);
2429 } 2597 }
2430 } 2598 }
2599
2600 for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
2601 s = mdsc->sessions[i];
2602 if (!s)
2603 continue;
2604 if (!ceph_mdsmap_is_laggy(newmap, i))
2605 continue;
2606 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2607 s->s_state == CEPH_MDS_SESSION_HUNG ||
2608 s->s_state == CEPH_MDS_SESSION_CLOSING) {
2609 dout(" connecting to export targets of laggy mds%d\n",
2610 i);
2611 __open_export_target_sessions(mdsc, s);
2612 }
2613 }
2431} 2614}
2432 2615
2433 2616
@@ -2451,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2451 struct ceph_mds_session *session, 2634 struct ceph_mds_session *session,
2452 struct ceph_msg *msg) 2635 struct ceph_msg *msg)
2453{ 2636{
2454 struct super_block *sb = mdsc->client->sb; 2637 struct super_block *sb = mdsc->fsc->sb;
2455 struct inode *inode; 2638 struct inode *inode;
2456 struct ceph_inode_info *ci; 2639 struct ceph_inode_info *ci;
2457 struct dentry *parent, *dentry; 2640 struct dentry *parent, *dentry;
@@ -2715,7 +2898,7 @@ static void delayed_work(struct work_struct *work)
2715 send_renew_caps(mdsc, s); 2898 send_renew_caps(mdsc, s);
2716 else 2899 else
2717 ceph_con_keepalive(&s->s_con); 2900 ceph_con_keepalive(&s->s_con);
2718 ceph_add_cap_releases(mdsc, s, -1); 2901 ceph_add_cap_releases(mdsc, s);
2719 if (s->s_state == CEPH_MDS_SESSION_OPEN || 2902 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2720 s->s_state == CEPH_MDS_SESSION_HUNG) 2903 s->s_state == CEPH_MDS_SESSION_HUNG)
2721 ceph_send_cap_releases(mdsc, s); 2904 ceph_send_cap_releases(mdsc, s);
@@ -2729,17 +2912,23 @@ static void delayed_work(struct work_struct *work)
2729 schedule_delayed(mdsc); 2912 schedule_delayed(mdsc);
2730} 2913}
2731 2914
2915int ceph_mdsc_init(struct ceph_fs_client *fsc)
2732 2916
2733int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2734{ 2917{
2735 mdsc->client = client; 2918 struct ceph_mds_client *mdsc;
2919
2920 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
2921 if (!mdsc)
2922 return -ENOMEM;
2923 mdsc->fsc = fsc;
2924 fsc->mdsc = mdsc;
2736 mutex_init(&mdsc->mutex); 2925 mutex_init(&mdsc->mutex);
2737 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2926 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2738 if (mdsc->mdsmap == NULL) 2927 if (mdsc->mdsmap == NULL)
2739 return -ENOMEM; 2928 return -ENOMEM;
2740 2929
2741 init_completion(&mdsc->safe_umount_waiters); 2930 init_completion(&mdsc->safe_umount_waiters);
2742 init_completion(&mdsc->session_close_waiters); 2931 init_waitqueue_head(&mdsc->session_close_wq);
2743 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2932 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2744 mdsc->sessions = NULL; 2933 mdsc->sessions = NULL;
2745 mdsc->max_sessions = 0; 2934 mdsc->max_sessions = 0;
@@ -2764,6 +2953,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2764 spin_lock_init(&mdsc->dentry_lru_lock); 2953 spin_lock_init(&mdsc->dentry_lru_lock);
2765 INIT_LIST_HEAD(&mdsc->dentry_lru); 2954 INIT_LIST_HEAD(&mdsc->dentry_lru);
2766 2955
2956 ceph_caps_init(mdsc);
2957 ceph_adjust_min_caps(mdsc, fsc->min_caps);
2958
2767 return 0; 2959 return 0;
2768} 2960}
2769 2961
@@ -2774,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2774static void wait_requests(struct ceph_mds_client *mdsc) 2966static void wait_requests(struct ceph_mds_client *mdsc)
2775{ 2967{
2776 struct ceph_mds_request *req; 2968 struct ceph_mds_request *req;
2777 struct ceph_client *client = mdsc->client; 2969 struct ceph_fs_client *fsc = mdsc->fsc;
2778 2970
2779 mutex_lock(&mdsc->mutex); 2971 mutex_lock(&mdsc->mutex);
2780 if (__get_oldest_req(mdsc)) { 2972 if (__get_oldest_req(mdsc)) {
@@ -2782,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
2782 2974
2783 dout("wait_requests waiting for requests\n"); 2975 dout("wait_requests waiting for requests\n");
2784 wait_for_completion_timeout(&mdsc->safe_umount_waiters, 2976 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2785 client->mount_args->mount_timeout * HZ); 2977 fsc->client->options->mount_timeout * HZ);
2786 2978
2787 /* tear down remaining requests */ 2979 /* tear down remaining requests */
2788 mutex_lock(&mdsc->mutex); 2980 mutex_lock(&mdsc->mutex);
@@ -2865,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2865{ 3057{
2866 u64 want_tid, want_flush; 3058 u64 want_tid, want_flush;
2867 3059
2868 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3060 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
2869 return; 3061 return;
2870 3062
2871 dout("sync\n"); 3063 dout("sync\n");
@@ -2881,6 +3073,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2881 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3073 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2882} 3074}
2883 3075
3076/*
3077 * true if all sessions are closed, or we force unmount
3078 */
 3079static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3080{
3081 int i, n = 0;
3082
3083 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3084 return true;
3085
3086 mutex_lock(&mdsc->mutex);
3087 for (i = 0; i < mdsc->max_sessions; i++)
3088 if (mdsc->sessions[i])
3089 n++;
3090 mutex_unlock(&mdsc->mutex);
3091 return n == 0;
3092}
2884 3093
2885/* 3094/*
2886 * called after sb is ro. 3095 * called after sb is ro.
@@ -2889,45 +3098,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2889{ 3098{
2890 struct ceph_mds_session *session; 3099 struct ceph_mds_session *session;
2891 int i; 3100 int i;
2892 int n; 3101 struct ceph_fs_client *fsc = mdsc->fsc;
2893 struct ceph_client *client = mdsc->client; 3102 unsigned long timeout = fsc->client->options->mount_timeout * HZ;
2894 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2895 3103
2896 dout("close_sessions\n"); 3104 dout("close_sessions\n");
2897 3105
2898 mutex_lock(&mdsc->mutex);
2899
2900 /* close sessions */ 3106 /* close sessions */
2901 started = jiffies; 3107 mutex_lock(&mdsc->mutex);
2902 while (time_before(jiffies, started + timeout)) { 3108 for (i = 0; i < mdsc->max_sessions; i++) {
2903 dout("closing sessions\n"); 3109 session = __ceph_lookup_mds_session(mdsc, i);
2904 n = 0; 3110 if (!session)
2905 for (i = 0; i < mdsc->max_sessions; i++) { 3111 continue;
2906 session = __ceph_lookup_mds_session(mdsc, i);
2907 if (!session)
2908 continue;
2909 mutex_unlock(&mdsc->mutex);
2910 mutex_lock(&session->s_mutex);
2911 __close_session(mdsc, session);
2912 mutex_unlock(&session->s_mutex);
2913 ceph_put_mds_session(session);
2914 mutex_lock(&mdsc->mutex);
2915 n++;
2916 }
2917 if (n == 0)
2918 break;
2919
2920 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2921 break;
2922
2923 dout("waiting for sessions to close\n");
2924 mutex_unlock(&mdsc->mutex); 3112 mutex_unlock(&mdsc->mutex);
2925 wait_for_completion_timeout(&mdsc->session_close_waiters, 3113 mutex_lock(&session->s_mutex);
2926 timeout); 3114 __close_session(mdsc, session);
3115 mutex_unlock(&session->s_mutex);
3116 ceph_put_mds_session(session);
2927 mutex_lock(&mdsc->mutex); 3117 mutex_lock(&mdsc->mutex);
2928 } 3118 }
3119 mutex_unlock(&mdsc->mutex);
3120
3121 dout("waiting for sessions to close\n");
3122 wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
3123 timeout);
2929 3124
2930 /* tear down remaining sessions */ 3125 /* tear down remaining sessions */
3126 mutex_lock(&mdsc->mutex);
2931 for (i = 0; i < mdsc->max_sessions; i++) { 3127 for (i = 0; i < mdsc->max_sessions; i++) {
2932 if (mdsc->sessions[i]) { 3128 if (mdsc->sessions[i]) {
2933 session = get_session(mdsc->sessions[i]); 3129 session = get_session(mdsc->sessions[i]);
@@ -2940,9 +3136,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2940 mutex_lock(&mdsc->mutex); 3136 mutex_lock(&mdsc->mutex);
2941 } 3137 }
2942 } 3138 }
2943
2944 WARN_ON(!list_empty(&mdsc->cap_delay_list)); 3139 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2945
2946 mutex_unlock(&mdsc->mutex); 3140 mutex_unlock(&mdsc->mutex);
2947 3141
2948 ceph_cleanup_empty_realms(mdsc); 3142 ceph_cleanup_empty_realms(mdsc);
@@ -2952,13 +3146,23 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2952 dout("stopped\n"); 3146 dout("stopped\n");
2953} 3147}
2954 3148
2955void ceph_mdsc_stop(struct ceph_mds_client *mdsc) 3149static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2956{ 3150{
2957 dout("stop\n"); 3151 dout("stop\n");
2958 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3152 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2959 if (mdsc->mdsmap) 3153 if (mdsc->mdsmap)
2960 ceph_mdsmap_destroy(mdsc->mdsmap); 3154 ceph_mdsmap_destroy(mdsc->mdsmap);
2961 kfree(mdsc->sessions); 3155 kfree(mdsc->sessions);
3156 ceph_caps_finalize(mdsc);
3157}
3158
3159void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3160{
3161 struct ceph_mds_client *mdsc = fsc->mdsc;
3162
3163 ceph_mdsc_stop(mdsc);
3164 fsc->mdsc = NULL;
3165 kfree(mdsc);
2962} 3166}
2963 3167
2964 3168
@@ -2977,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2977 3181
2978 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); 3182 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2979 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3183 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2980 if (ceph_check_fsid(mdsc->client, &fsid) < 0) 3184 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
2981 return; 3185 return;
2982 epoch = ceph_decode_32(&p); 3186 epoch = ceph_decode_32(&p);
2983 maplen = ceph_decode_32(&p); 3187 maplen = ceph_decode_32(&p);
2984 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3188 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2985 3189
2986 /* do we need it? */ 3190 /* do we need it? */
2987 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); 3191 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
2988 mutex_lock(&mdsc->mutex); 3192 mutex_lock(&mdsc->mutex);
2989 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3193 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2990 dout("handle_map epoch %u <= our %u\n", 3194 dout("handle_map epoch %u <= our %u\n",
@@ -3008,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3008 } else { 3212 } else {
3009 mdsc->mdsmap = newmap; /* first mds map */ 3213 mdsc->mdsmap = newmap; /* first mds map */
3010 } 3214 }
3011 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3215 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3012 3216
3013 __wake_requests(mdsc, &mdsc->waiting_for_map); 3217 __wake_requests(mdsc, &mdsc->waiting_for_map);
3014 3218
@@ -3109,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
3109{ 3313{
3110 struct ceph_mds_session *s = con->private; 3314 struct ceph_mds_session *s = con->private;
3111 struct ceph_mds_client *mdsc = s->s_mdsc; 3315 struct ceph_mds_client *mdsc = s->s_mdsc;
3112 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3316 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3113 int ret = 0; 3317 int ret = 0;
3114 3318
3115 if (force_new && s->s_authorizer) { 3319 if (force_new && s->s_authorizer) {
@@ -3143,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
3143{ 3347{
3144 struct ceph_mds_session *s = con->private; 3348 struct ceph_mds_session *s = con->private;
3145 struct ceph_mds_client *mdsc = s->s_mdsc; 3349 struct ceph_mds_client *mdsc = s->s_mdsc;
3146 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3350 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3147 3351
3148 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); 3352 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3149} 3353}
@@ -3152,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
3152{ 3356{
3153 struct ceph_mds_session *s = con->private; 3357 struct ceph_mds_session *s = con->private;
3154 struct ceph_mds_client *mdsc = s->s_mdsc; 3358 struct ceph_mds_client *mdsc = s->s_mdsc;
3155 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3359 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3156 3360
3157 if (ac->ops->invalidate_authorizer) 3361 if (ac->ops->invalidate_authorizer)
3158 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); 3362 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3159 3363
3160 return ceph_monc_validate_auth(&mdsc->client->monc); 3364 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3161} 3365}
3162 3366
3163static const struct ceph_connection_operations mds_con_ops = { 3367static const struct ceph_connection_operations mds_con_ops = {
@@ -3170,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
3170 .peer_reset = peer_reset, 3374 .peer_reset = peer_reset,
3171}; 3375};
3172 3376
3173
3174
3175
3176/* eof */ 3377/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 952410c60d0..d66d63c7235 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
 
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
 
 /*
  * Some lock dependencies:
@@ -26,7 +26,7 @@
  *
  */
 
-struct ceph_client;
+struct ceph_fs_client;
 struct ceph_cap;
 
 /*
@@ -151,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
 struct ceph_mds_request {
 	u64 r_tid;                   /* transaction id */
 	struct rb_node r_node;
+	struct ceph_mds_client *r_mdsc;
 
 	int r_op;                    /* mds op code */
 	int r_mds;
@@ -207,8 +208,8 @@ struct ceph_mds_request {
 
 	int               r_attempts;   /* resend attempts */
 	int               r_num_fwd;    /* number of forward attempts */
-	int               r_num_stale;
 	int               r_resend_mds; /* mds to resend to next, if any*/
+	u32               r_sent_on_mseq; /* cap mseq request was sent at*/
 
 	struct kref       r_kref;
 	struct list_head  r_wait;
@@ -229,11 +230,12 @@ struct ceph_mds_request {
  * mds client state
  */
 struct ceph_mds_client {
-	struct ceph_client      *client;
+	struct ceph_fs_client   *fsc;
 	struct mutex            mutex;          /* all nested structures */
 
 	struct ceph_mdsmap      *mdsmap;
-	struct completion       safe_umount_waiters, session_close_waiters;
+	struct completion       safe_umount_waiters;
+	wait_queue_head_t       session_close_wq;
 	struct list_head        waiting_for_map;
 
 	struct ceph_mds_session **sessions;     /* NULL for mds if no session */
@@ -267,10 +269,26 @@ struct ceph_mds_client {
 	spinlock_t        cap_dirty_lock;   /* protects above items */
 	wait_queue_head_t cap_flushing_wq;
 
-#ifdef CONFIG_DEBUG_FS
-	struct dentry     *debugfs_file;
-#endif
-
+	/*
+	 * Cap reservations
+	 *
+	 * Maintain a global pool of preallocated struct ceph_caps, referenced
+	 * by struct ceph_caps_reservations.  This ensures that we preallocate
+	 * memory needed to successfully process an MDS response.  (If an MDS
+	 * sends us cap information and we fail to process it, we will have
+	 * problems due to the client and MDS being out of sync.)
+	 *
+	 * Reservations are 'owned' by a ceph_cap_reservation context.
+	 */
+	spinlock_t        caps_list_lock;
+	struct list_head  caps_list;          /* unused (reserved or
+						 unreserved) */
+	int               caps_total_count;   /* total caps allocated */
+	int               caps_use_count;     /* in use */
+	int               caps_reserve_count; /* unused, reserved */
+	int               caps_avail_count;   /* unused, unreserved */
+	int               caps_min_count;     /* keep at least this many
+						 (unreserved) */
 	spinlock_t        dentry_lru_lock;
 	struct list_head  dentry_lru;
 	int               num_dentry;
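The reservation scheme described in the comment above is plain conserved accounting: every preallocated cap sits in exactly one of the "in use", "reserved", or "available" buckets, so caps_total_count always equals their sum. A minimal userspace sketch of that invariant, with illustrative names and a pthread mutex standing in for caps_list_lock (not the kernel's code):

/* Sketch of the cap-reservation accounting described above; all names
 * are illustrative.  Invariant: total == use + reserve + avail. */
#include <assert.h>
#include <pthread.h>

struct cap_pool {
	pthread_mutex_t lock;
	int total, use, reserve, avail, min;
};

/* Preallocate enough unused caps to back @need reserved ones. */
static void cap_reserve(struct cap_pool *p, int need)
{
	pthread_mutex_lock(&p->lock);
	if (p->avail < need) {
		int alloc = need - p->avail;   /* would allocate caps here */
		p->total += alloc;
		p->avail += alloc;
	}
	p->avail -= need;
	p->reserve += need;
	assert(p->total == p->use + p->reserve + p->avail);
	pthread_mutex_unlock(&p->lock);
}

/* Return unused reserved caps to the pool (could trim down to p->min). */
static void cap_unreserve(struct cap_pool *p, int had)
{
	pthread_mutex_lock(&p->lock);
	p->reserve -= had;
	p->avail += had;
	assert(p->total == p->use + p->reserve + p->avail);
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	struct cap_pool p = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0, 0, 0 };
	cap_reserve(&p, 4);
	cap_unreserve(&p, 4);
	return 0;
}

Keeping the invariant inside the lock is what lets an MDS reply be processed without risking a mid-reply allocation failure.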
@@ -293,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 			     struct ceph_msg *msg, int mds);
 
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
-			  struct ceph_client *client);
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 
@@ -324,8 +341,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 }
 
 extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
-				 struct ceph_mds_session *session,
-				 int extra);
+				 struct ceph_mds_session *session);
 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
 				   struct ceph_mds_session *session);
 
@@ -343,4 +359,7 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
 				 struct ceph_msg *msg);
 
+extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
+					  struct ceph_mds_session *session);
+
 #endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index c4c498e6dfe..73b7d44e8a3 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/bug.h>
 #include <linux/err.h>
@@ -6,9 +6,9 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
 
 #include "super.h"
 
@@ -85,6 +85,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		struct ceph_entity_addr addr;
 		u32 num_export_targets;
 		void *pexport_targets = NULL;
+		struct ceph_timespec laggy_since;
 
 		ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
 		global_id = ceph_decode_64(p);
@@ -103,7 +104,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		state_seq = ceph_decode_64(p);
 		ceph_decode_copy(p, &addr, sizeof(addr));
 		ceph_decode_addr(&addr);
-		*p += sizeof(struct ceph_timespec);
+		ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
 		*p += sizeof(u32);
 		ceph_decode_32_safe(p, end, namelen, bad);
 		*p += namelen;
@@ -116,12 +117,16 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		}
 
 		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
-		     i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+		     i+1, n, global_id, mds, inc,
+		     ceph_pr_addr(&addr.in_addr),
 		     ceph_mds_state_name(state));
 		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
 			m->m_info[mds].global_id = global_id;
 			m->m_info[mds].state = state;
 			m->m_info[mds].addr = addr;
+			m->m_info[mds].laggy =
+				(laggy_since.tv_sec != 0 ||
+				 laggy_since.tv_nsec != 0);
 			m->m_info[mds].num_export_targets = num_export_targets;
 			if (num_export_targets) {
 				m->m_info[mds].export_targets =
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
deleted file mode 100644
index eacc131aa5c..00000000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 u32 m_max_mds; /* size of m_addr, m_state arrays */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
deleted file mode 100644
index 15167b2daa5..00000000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2276 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
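The fault path itself (ceph_fault(), declared just below) is not part of this listing; as a rough illustration of the exponential backoff the comment above mentions, the reconnect delay typically starts small and doubles up to a cap. The constants below are assumptions for the sketch, not Ceph's actual values:

/* Illustrative backoff schedule only; the real ceph_fault() logic
 * differs in detail.  Delay doubles per consecutive fault, up to a cap. */
#include <stdio.h>

#define BASE_DELAY 1    /* seconds; assumed, not Ceph's constant */
#define MAX_DELAY  64   /* seconds; assumed, not Ceph's constant */

static unsigned next_delay(unsigned cur)
{
	if (cur == 0)
		return BASE_DELAY;
	return cur * 2 > MAX_DELAY ? MAX_DELAY : cur * 2;
}

int main(void)
{
	unsigned d = 0;
	for (int i = 0; i < 10; i++) {
		d = next_delay(d);
		printf("retry %d after %us\n", i + 1, d);
	}
	return 0;
}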
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42/*
43 * nicely render a sockaddr as a string.
44 */
45#define MAX_ADDR_STR 20
46#define MAX_ADDR_STR_LEN 60
47static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
48static DEFINE_SPINLOCK(addr_str_lock);
49static int last_addr_str;
50
51const char *pr_addr(const struct sockaddr_storage *ss)
52{
53 int i;
54 char *s;
55 struct sockaddr_in *in4 = (void *)ss;
56 struct sockaddr_in6 *in6 = (void *)ss;
57
58 spin_lock(&addr_str_lock);
59 i = last_addr_str++;
60 if (last_addr_str == MAX_ADDR_STR)
61 last_addr_str = 0;
62 spin_unlock(&addr_str_lock);
63 s = addr_str[i];
64
65 switch (ss->ss_family) {
66 case AF_INET:
67 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
68 (unsigned int)ntohs(in4->sin_port));
69 break;
70
71 case AF_INET6:
72 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
73 (unsigned int)ntohs(in6->sin6_port));
74 break;
75
76 default:
77 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
78 }
79
80 return s;
81}
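pr_addr() hands out one of MAX_ADDR_STR static buffers in round-robin fashion, so callers can format an address for printing without allocating or freeing anything; the returned string stays valid only until the pool wraps around. The same pattern in miniature, with hypothetical names and the locking omitted:

/* Round-robin static string pool, as used by pr_addr() above.  Handy for
 * printf-style call sites; unsafe to hold a pointer past POOL more calls. */
#include <stdio.h>

#define POOL 4
#define SLEN 32

static char bufs[POOL][SLEN];
static int next_buf;   /* would be lock-protected in concurrent code */

static const char *render(int n)
{
	char *s = bufs[next_buf++ % POOL];
	snprintf(s, SLEN, "value-%d", n);
	return s;
}

int main(void)
{
	const char *a = render(1);
	printf("%s %s\n", a, render(2));   /* fine: pool has not wrapped yet */
	return 0;
}

The design trades strict lifetime guarantees for zero-allocation convenience in log-message call sites.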
82
83static void encode_my_addr(struct ceph_messenger *msgr)
84{
85 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
86 ceph_encode_addr(&msgr->my_enc_addr);
87}
88
89/*
90 * work queue for all reading and writing to/from the socket.
91 */
92struct workqueue_struct *ceph_msgr_wq;
93
94int __init ceph_msgr_init(void)
95{
96 ceph_msgr_wq = create_workqueue("ceph-msgr");
97 if (!ceph_msgr_wq) { /* create_workqueue() returns NULL on failure, not ERR_PTR */
98 int ret = -ENOMEM;
99 pr_err("msgr_init failed to create workqueue: %d\n", ret);
100 ceph_msgr_wq = NULL;
101 return ret;
102 }
103 return 0;
104}
105
106void ceph_msgr_exit(void)
107{
108 destroy_workqueue(ceph_msgr_wq);
109}
110
111void ceph_msgr_flush(void)
112{
113 flush_workqueue(ceph_msgr_wq);
114}
115
116
117/*
118 * socket callback functions
119 */
120
121/* data available on socket, or listen socket received a connect */
122static void ceph_data_ready(struct sock *sk, int count_unused)
123{
124 struct ceph_connection *con =
125 (struct ceph_connection *)sk->sk_user_data;
126 if (sk->sk_state != TCP_CLOSE_WAIT) {
127 dout("ceph_data_ready on %p state = %lu, queueing work\n",
128 con, con->state);
129 queue_con(con);
130 }
131}
132
133/* socket has buffer space for writing */
134static void ceph_write_space(struct sock *sk)
135{
136 struct ceph_connection *con =
137 (struct ceph_connection *)sk->sk_user_data;
138
139 /* only queue to workqueue if there is data we want to write. */
140 if (test_bit(WRITE_PENDING, &con->state)) {
141 dout("ceph_write_space %p queueing write work\n", con);
142 queue_con(con);
143 } else {
144 dout("ceph_write_space %p nothing to write\n", con);
145 }
146
147 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
148 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
149}
150
151/* socket's state has changed */
152static void ceph_state_change(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 dout("ceph_state_change %p state = %lu sk_state = %u\n",
158 con, con->state, sk->sk_state);
159
160 if (test_bit(CLOSED, &con->state))
161 return;
162
163 switch (sk->sk_state) {
164 case TCP_CLOSE:
165 dout("ceph_state_change TCP_CLOSE\n"); /* fall through */
166 case TCP_CLOSE_WAIT:
167 dout("ceph_state_change TCP_CLOSE_WAIT\n");
168 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
169 if (test_bit(CONNECTING, &con->state))
170 con->error_msg = "connection failed";
171 else
172 con->error_msg = "socket closed";
173 queue_con(con);
174 }
175 break;
176 case TCP_ESTABLISHED:
177 dout("ceph_state_change TCP_ESTABLISHED\n");
178 queue_con(con);
179 break;
180 }
181}
182
183/*
184 * set up socket callbacks
185 */
186static void set_sock_callbacks(struct socket *sock,
187 struct ceph_connection *con)
188{
189 struct sock *sk = sock->sk;
190 sk->sk_user_data = (void *)con;
191 sk->sk_data_ready = ceph_data_ready;
192 sk->sk_write_space = ceph_write_space;
193 sk->sk_state_change = ceph_state_change;
194}
195
196
197/*
198 * socket helpers
199 */
200
201/*
202 * initiate connection to a remote socket.
203 */
204static struct socket *ceph_tcp_connect(struct ceph_connection *con)
205{
206 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
207 struct socket *sock;
208 int ret;
209
210 BUG_ON(con->sock);
211 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
212 IPPROTO_TCP, &sock);
213 if (ret)
214 return ERR_PTR(ret);
215 con->sock = sock;
216 sock->sk->sk_allocation = GFP_NOFS;
217
218#ifdef CONFIG_LOCKDEP
219 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
220#endif
221
222 set_sock_callbacks(sock, con);
223
224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
225
226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK);
228 if (ret == -EINPROGRESS) {
229 dout("connect %s EINPROGRESS sk_state = %u\n",
230 pr_addr(&con->peer_addr.in_addr),
231 sock->sk->sk_state);
232 ret = 0;
233 }
234 if (ret < 0) {
235 pr_err("connect %s error %d\n",
236 pr_addr(&con->peer_addr.in_addr), ret);
237 sock_release(sock);
238 con->sock = NULL;
239 con->error_msg = "connect error";
240 }
241
242 if (ret < 0)
243 return ERR_PTR(ret);
244 return sock;
245}
246
247static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
248{
249 struct kvec iov = {buf, len};
250 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
251
252 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
253}
254
255/*
256 * write something. @more is true if caller will be sending more data
257 * shortly.
258 */
259static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
260 size_t kvlen, size_t len, int more)
261{
262 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
263
264 if (more)
265 msg.msg_flags |= MSG_MORE;
266 else
267 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
268
269 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
270}
271
272
273/*
274 * Shutdown/close the socket for the given connection.
275 */
276static int con_close_socket(struct ceph_connection *con)
277{
278 int rc;
279
280 dout("con_close_socket on %p sock %p\n", con, con->sock);
281 if (!con->sock)
282 return 0;
283 set_bit(SOCK_CLOSED, &con->state);
284 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
285 sock_release(con->sock);
286 con->sock = NULL;
287 clear_bit(SOCK_CLOSED, &con->state);
288 return rc;
289}
290
291/*
292 * Reset a connection. Discard all incoming and outgoing messages
293 * and clear *_seq state.
294 */
295static void ceph_msg_remove(struct ceph_msg *msg)
296{
297 list_del_init(&msg->list_head);
298 ceph_msg_put(msg);
299}
300static void ceph_msg_remove_list(struct list_head *head)
301{
302 while (!list_empty(head)) {
303 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
304 list_head);
305 ceph_msg_remove(msg);
306 }
307}
308
309static void reset_connection(struct ceph_connection *con)
310{
311 /* reset connection, out_queue, msg_ and connect_seq */
312 /* discard existing out_queue and msg_seq */
313 ceph_msg_remove_list(&con->out_queue);
314 ceph_msg_remove_list(&con->out_sent);
315
316 if (con->in_msg) {
317 ceph_msg_put(con->in_msg);
318 con->in_msg = NULL;
319 }
320
321 con->connect_seq = 0;
322 con->out_seq = 0;
323 if (con->out_msg) {
324 ceph_msg_put(con->out_msg);
325 con->out_msg = NULL;
326 }
327 con->out_keepalive_pending = false;
328 con->in_seq = 0;
329 con->in_seq_acked = 0;
330}
331
332/*
333 * mark a peer down. drop any open connections.
334 */
335void ceph_con_close(struct ceph_connection *con)
336{
337 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
338 set_bit(CLOSED, &con->state); /* in case there's queued work */
339 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
340 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
341 clear_bit(KEEPALIVE_PENDING, &con->state);
342 clear_bit(WRITE_PENDING, &con->state);
343 mutex_lock(&con->mutex);
344 reset_connection(con);
345 con->peer_global_seq = 0;
346 cancel_delayed_work(&con->work);
347 mutex_unlock(&con->mutex);
348 queue_con(con);
349}
350
351/*
352 * Reopen a closed connection, with a new peer address.
353 */
354void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
355{
356 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
357 set_bit(OPENING, &con->state);
358 clear_bit(CLOSED, &con->state);
359 memcpy(&con->peer_addr, addr, sizeof(*addr));
360 con->delay = 0; /* reset backoff memory */
361 queue_con(con);
362}
363
364/*
365 * return true if this connection ever successfully opened
366 */
367bool ceph_con_opened(struct ceph_connection *con)
368{
369 return con->connect_seq > 0;
370}
371
372/*
373 * generic get/put
374 */
375struct ceph_connection *ceph_con_get(struct ceph_connection *con)
376{
377 dout("con_get %p nref = %d -> %d\n", con,
378 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
379 if (atomic_inc_not_zero(&con->nref))
380 return con;
381 return NULL;
382}
383
384void ceph_con_put(struct ceph_connection *con)
385{
386 dout("con_put %p nref = %d -> %d\n", con,
387 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
388 BUG_ON(atomic_read(&con->nref) == 0);
389 if (atomic_dec_and_test(&con->nref)) {
390 BUG_ON(con->sock);
391 kfree(con);
392 }
393}
394
395/*
396 * initialize a new connection.
397 */
398void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
399{
400 dout("con_init %p\n", con);
401 memset(con, 0, sizeof(*con));
402 atomic_set(&con->nref, 1);
403 con->msgr = msgr;
404 mutex_init(&con->mutex);
405 INIT_LIST_HEAD(&con->out_queue);
406 INIT_LIST_HEAD(&con->out_sent);
407 INIT_DELAYED_WORK(&con->work, con_work);
408}
409
410
411/*
412 * We maintain a global counter to order connection attempts. Get
413 * a unique seq greater than @gt.
414 */
415static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
416{
417 u32 ret;
418
419 spin_lock(&msgr->global_seq_lock);
420 if (msgr->global_seq < gt)
421 msgr->global_seq = gt;
422 ret = ++msgr->global_seq;
423 spin_unlock(&msgr->global_seq_lock);
424 return ret;
425}
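get_global_seq() bumps the counter under a spinlock after first raising it to at least @gt. The same contract (return a value strictly greater than both @gt and every previous return) can be sketched lock-free with C11 atomics; this is an illustration, not the kernel implementation:

/* Sketch of get_global_seq() semantics using C11 atomics. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t global_seq;

static uint32_t get_global_seq(uint32_t gt)
{
	uint32_t cur = atomic_load(&global_seq);

	/* raise the counter to at least gt */
	while (cur < gt &&
	       !atomic_compare_exchange_weak(&global_seq, &cur, gt))
		;   /* cur is reloaded on failure */
	return atomic_fetch_add(&global_seq, 1) + 1;
}

int main(void)
{
	printf("%u\n", get_global_seq(10));  /* >= 11 */
	printf("%u\n", get_global_seq(0));   /* still strictly increasing */
	return 0;
}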
426
427
428/*
429 * Prepare footer for currently outgoing message, and finish things
430 * off. Assumes out_kvec* are already valid.. we just add on to the end.
431 */
432static void prepare_write_message_footer(struct ceph_connection *con, int v)
433{
434 struct ceph_msg *m = con->out_msg;
435
436 dout("prepare_write_message_footer %p\n", con);
437 con->out_kvec_is_msg = true;
438 con->out_kvec[v].iov_base = &m->footer;
439 con->out_kvec[v].iov_len = sizeof(m->footer);
440 con->out_kvec_bytes += sizeof(m->footer);
441 con->out_kvec_left++;
442 con->out_more = m->more_to_follow;
443 con->out_msg_done = true;
444}
445
446/*
447 * Prepare headers for the next outgoing message.
448 */
449static void prepare_write_message(struct ceph_connection *con)
450{
451 struct ceph_msg *m;
452 int v = 0;
453
454 con->out_kvec_bytes = 0;
455 con->out_kvec_is_msg = true;
456 con->out_msg_done = false;
457
458 /* Sneak an ack in there first? If we can get it into the same
459 * TCP packet that's a good thing. */
460 if (con->in_seq > con->in_seq_acked) {
461 con->in_seq_acked = con->in_seq;
462 con->out_kvec[v].iov_base = &tag_ack;
463 con->out_kvec[v++].iov_len = 1;
464 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
465 con->out_kvec[v].iov_base = &con->out_temp_ack;
466 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
467 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
468 }
469
470 m = list_first_entry(&con->out_queue,
471 struct ceph_msg, list_head);
472 con->out_msg = m;
473 if (test_bit(LOSSYTX, &con->state)) {
474 list_del_init(&m->list_head);
475 } else {
476 /* put message on sent list */
477 ceph_msg_get(m);
478 list_move_tail(&m->list_head, &con->out_sent);
479 }
480
481 /*
482 * only assign outgoing seq # if we haven't sent this message
483 * yet. if it is requeued, resend with its original seq.
484 */
485 if (m->needs_out_seq) {
486 m->hdr.seq = cpu_to_le64(++con->out_seq);
487 m->needs_out_seq = false;
488 }
489
490 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
491 m, con->out_seq, le16_to_cpu(m->hdr.type),
492 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
493 le32_to_cpu(m->hdr.data_len),
494 m->nr_pages);
495 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
496
497 /* tag + hdr + front + middle */
498 con->out_kvec[v].iov_base = &tag_msg;
499 con->out_kvec[v++].iov_len = 1;
500 con->out_kvec[v].iov_base = &m->hdr;
501 con->out_kvec[v++].iov_len = sizeof(m->hdr);
502 con->out_kvec[v++] = m->front;
503 if (m->middle)
504 con->out_kvec[v++] = m->middle->vec;
505 con->out_kvec_left = v;
506 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
507 (m->middle ? m->middle->vec.iov_len : 0);
508 con->out_kvec_cur = con->out_kvec;
509
510 /* fill in crc (except data pages), footer */
511 con->out_msg->hdr.crc =
512 cpu_to_le32(crc32c(0, (void *)&m->hdr,
513 sizeof(m->hdr) - sizeof(m->hdr.crc)));
514 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
515 con->out_msg->footer.front_crc =
516 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
517 if (m->middle)
518 con->out_msg->footer.middle_crc =
519 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
520 m->middle->vec.iov_len));
521 else
522 con->out_msg->footer.middle_crc = 0;
523 con->out_msg->footer.data_crc = 0;
524 dout("prepare_write_message front_crc %u data_crc %u\n",
525 le32_to_cpu(con->out_msg->footer.front_crc),
526 le32_to_cpu(con->out_msg->footer.middle_crc));
527
528 /* is there a data payload? */
529 if (le32_to_cpu(m->hdr.data_len) > 0) {
530 /* initialize page iterator */
531 con->out_msg_pos.page = 0;
532 con->out_msg_pos.page_pos =
533 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
534 con->out_msg_pos.data_pos = 0;
535 con->out_msg_pos.did_page_crc = 0;
536 con->out_more = 1; /* data + footer will follow */
537 } else {
538 /* no, queue up footer too and be done */
539 prepare_write_message_footer(con, v);
540 }
541
542 set_bit(WRITE_PENDING, &con->state);
543}
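Two checksum idioms appear above: the header checksums itself minus its own trailing crc field, and each section CRC is seeded with the running value so chunked computation matches a one-shot pass. A self-contained sketch with a slow bitwise CRC32C (Castagnoli polynomial); the struct layout and names are illustrative:

/* Bitwise CRC32C (reflected poly 0x82F63B78), slow but exact; shows
 * (a) checksumming a header minus its crc field and (b) crc chaining. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
	}
	return ~crc;
}

struct hdr {
	uint64_t seq;
	uint32_t len;
	uint32_t crc;     /* covers everything above it */
};

int main(void)
{
	struct hdr h = { .seq = 7, .len = 42 };
	uint8_t payload[8] = "abcdefg";

	h.crc = crc32c(0, &h, sizeof(h) - sizeof(h.crc));

	/* chaining: crc over two chunks equals crc over the concatenation */
	uint32_t whole = crc32c(0, payload, 8);
	uint32_t part  = crc32c(crc32c(0, payload, 4), payload + 4, 4);
	assert(whole == part);
	printf("hdr crc %08x data crc %08x\n", h.crc, whole);
	return 0;
}

The chaining property is what lets write_partial_msg_pages() below accumulate the data CRC one page at a time.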
544
545/*
546 * Prepare an ack.
547 */
548static void prepare_write_ack(struct ceph_connection *con)
549{
550 dout("prepare_write_ack %p %llu -> %llu\n", con,
551 con->in_seq_acked, con->in_seq);
552 con->in_seq_acked = con->in_seq;
553
554 con->out_kvec[0].iov_base = &tag_ack;
555 con->out_kvec[0].iov_len = 1;
556 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
557 con->out_kvec[1].iov_base = &con->out_temp_ack;
558 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
559 con->out_kvec_left = 2;
560 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
561 con->out_kvec_cur = con->out_kvec;
562 con->out_more = 1; /* more will follow.. eventually.. */
563 set_bit(WRITE_PENDING, &con->state);
564}
565
566/*
567 * Prepare to write keepalive byte.
568 */
569static void prepare_write_keepalive(struct ceph_connection *con)
570{
571 dout("prepare_write_keepalive %p\n", con);
572 con->out_kvec[0].iov_base = &tag_keepalive;
573 con->out_kvec[0].iov_len = 1;
574 con->out_kvec_left = 1;
575 con->out_kvec_bytes = 1;
576 con->out_kvec_cur = con->out_kvec;
577 set_bit(WRITE_PENDING, &con->state);
578}
579
580/*
581 * Connection negotiation.
582 */
583
584static void prepare_connect_authorizer(struct ceph_connection *con)
585{
586 void *auth_buf;
587 int auth_len = 0;
588 int auth_protocol = 0;
589
590 mutex_unlock(&con->mutex);
591 if (con->ops->get_authorizer)
592 con->ops->get_authorizer(con, &auth_buf, &auth_len,
593 &auth_protocol, &con->auth_reply_buf,
594 &con->auth_reply_buf_len,
595 con->auth_retry);
596 mutex_lock(&con->mutex);
597
598 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
599 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
600
601 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
602 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
603 con->out_kvec_left++;
604 con->out_kvec_bytes += auth_len;
605}
606
607/*
608 * We connected to a peer and are saying hello.
609 */
610static void prepare_write_banner(struct ceph_messenger *msgr,
611 struct ceph_connection *con)
612{
613 int len = strlen(CEPH_BANNER);
614
615 con->out_kvec[0].iov_base = CEPH_BANNER;
616 con->out_kvec[0].iov_len = len;
617 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
618 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
619 con->out_kvec_left = 2;
620 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
621 con->out_kvec_cur = con->out_kvec;
622 con->out_more = 0;
623 set_bit(WRITE_PENDING, &con->state);
624}
625
626static void prepare_write_connect(struct ceph_messenger *msgr,
627 struct ceph_connection *con,
628 int after_banner)
629{
630 unsigned global_seq = get_global_seq(con->msgr, 0);
631 int proto;
632
633 switch (con->peer_name.type) {
634 case CEPH_ENTITY_TYPE_MON:
635 proto = CEPH_MONC_PROTOCOL;
636 break;
637 case CEPH_ENTITY_TYPE_OSD:
638 proto = CEPH_OSDC_PROTOCOL;
639 break;
640 case CEPH_ENTITY_TYPE_MDS:
641 proto = CEPH_MDSC_PROTOCOL;
642 break;
643 default:
644 BUG();
645 }
646
647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
648 con->connect_seq, global_seq, proto);
649
650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT);
651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
653 con->out_connect.global_seq = cpu_to_le32(global_seq);
654 con->out_connect.protocol_version = cpu_to_le32(proto);
655 con->out_connect.flags = 0;
656
657 if (!after_banner) {
658 con->out_kvec_left = 0;
659 con->out_kvec_bytes = 0;
660 }
661 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
662 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
663 con->out_kvec_left++;
664 con->out_kvec_bytes += sizeof(con->out_connect);
665 con->out_kvec_cur = con->out_kvec;
666 con->out_more = 0;
667 set_bit(WRITE_PENDING, &con->state);
668
669 prepare_connect_authorizer(con);
670}
671
672
673/*
674 * write as much of pending kvecs to the socket as we can.
675 * 1 -> done
676 * 0 -> socket full, but more to do
677 * <0 -> error
678 */
679static int write_partial_kvec(struct ceph_connection *con)
680{
681 int ret;
682
683 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
684 while (con->out_kvec_bytes > 0) {
685 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
686 con->out_kvec_left, con->out_kvec_bytes,
687 con->out_more);
688 if (ret <= 0)
689 goto out;
690 con->out_kvec_bytes -= ret;
691 if (con->out_kvec_bytes == 0)
692 break; /* done */
693 while (ret > 0) {
694 if (ret >= con->out_kvec_cur->iov_len) {
695 ret -= con->out_kvec_cur->iov_len;
696 con->out_kvec_cur++;
697 con->out_kvec_left--;
698 } else {
699 con->out_kvec_cur->iov_len -= ret;
700 con->out_kvec_cur->iov_base += ret;
701 ret = 0;
702 break;
703 }
704 }
705 }
706 con->out_kvec_left = 0;
707 con->out_kvec_is_msg = false;
708 ret = 1;
709out:
710 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
711 con->out_kvec_bytes, con->out_kvec_left, ret);
712 return ret; /* done! */
713}
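The inner loop above is the standard bookkeeping for resuming a vectored send after a short write: kvecs that were fully sent are skipped, and the first partially-sent one is trimmed in place. The same logic isolated, using the userspace struct iovec and hypothetical names:

/* Advance an iovec array after a partial write of @sent bytes, mirroring
 * what write_partial_kvec() does to out_kvec_cur/out_kvec_left. */
#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

static void iov_advance(struct iovec **iov, int *iovcnt, size_t sent)
{
	while (sent > 0) {
		assert(*iovcnt > 0);
		if (sent >= (*iov)->iov_len) {
			sent -= (*iov)->iov_len;   /* consumed whole kvec */
			(*iov)++;
			(*iovcnt)--;
		} else {
			(*iov)->iov_base = (char *)(*iov)->iov_base + sent;
			(*iov)->iov_len -= sent;   /* trim partial kvec */
			sent = 0;
		}
	}
}

int main(void)
{
	char a[4], b[4];
	struct iovec v[2] = { { a, 4 }, { b, 4 } };
	struct iovec *cur = v;
	int left = 2;

	iov_advance(&cur, &left, 6);   /* consumes a[] and 2 bytes of b[] */
	assert(left == 1 && cur->iov_len == 2);
	return 0;
}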
714
715/*
716 * Write as much message data payload as we can. If we finish, queue
717 * up the footer.
718 * 1 -> done, footer is now queued in out_kvec[].
719 * 0 -> socket full, but more to do
720 * <0 -> error
721 */
722static int write_partial_msg_pages(struct ceph_connection *con)
723{
724 struct ceph_msg *msg = con->out_msg;
725 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
726 size_t len;
727 int crc = con->msgr->nocrc;
728 int ret;
729
730 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
731 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
732 con->out_msg_pos.page_pos);
733
734 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
735 struct page *page = NULL;
736 void *kaddr = NULL;
737
738 /*
739 * if we are calculating the data crc (the default), we need
740 * to map the page. if our pages[] has been revoked, use the
741 * zero page.
742 */
743 if (msg->pages) {
744 page = msg->pages[con->out_msg_pos.page];
745 if (crc)
746 kaddr = kmap(page);
747 } else if (msg->pagelist) {
748 page = list_first_entry(&msg->pagelist->head,
749 struct page, lru);
750 if (crc)
751 kaddr = kmap(page);
752 } else {
753 page = con->msgr->zero_page;
754 if (crc)
755 kaddr = page_address(con->msgr->zero_page);
756 }
757 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
758 (int)(data_len - con->out_msg_pos.data_pos));
759 if (crc && !con->out_msg_pos.did_page_crc) {
760 void *base = kaddr + con->out_msg_pos.page_pos;
761 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
762
763 BUG_ON(kaddr == NULL);
764 con->out_msg->footer.data_crc =
765 cpu_to_le32(crc32c(tmpcrc, base, len));
766 con->out_msg_pos.did_page_crc = 1;
767 }
768
769 ret = kernel_sendpage(con->sock, page,
770 con->out_msg_pos.page_pos, len,
771 MSG_DONTWAIT | MSG_NOSIGNAL |
772 MSG_MORE);
773
774 if (crc && (msg->pages || msg->pagelist))
775 kunmap(page);
776
777 if (ret <= 0)
778 goto out;
779
780 con->out_msg_pos.data_pos += ret;
781 con->out_msg_pos.page_pos += ret;
782 if (ret == len) {
783 con->out_msg_pos.page_pos = 0;
784 con->out_msg_pos.page++;
785 con->out_msg_pos.did_page_crc = 0;
786 if (msg->pagelist)
787 list_move_tail(&page->lru,
788 &msg->pagelist->head);
789 }
790 }
791
792 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
793
794 /* prepare and queue up footer, too */
795 if (!crc)
796 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
797 con->out_kvec_bytes = 0;
798 con->out_kvec_left = 0;
799 con->out_kvec_cur = con->out_kvec;
800 prepare_write_message_footer(con, 0);
801 ret = 1;
802out:
803 return ret;
804}
805
806/*
807 * write some zeros
808 */
809static int write_partial_skip(struct ceph_connection *con)
810{
811 int ret;
812
813 while (con->out_skip > 0) {
814 struct kvec iov = {
815 .iov_base = page_address(con->msgr->zero_page),
816 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
817 };
818
819 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
820 if (ret <= 0)
821 goto out;
822 con->out_skip -= ret;
823 }
824 ret = 1;
825out:
826 return ret;
827}
828
829/*
830 * Prepare to read connection handshake, or an ack.
831 */
832static void prepare_read_banner(struct ceph_connection *con)
833{
834 dout("prepare_read_banner %p\n", con);
835 con->in_base_pos = 0;
836}
837
838static void prepare_read_connect(struct ceph_connection *con)
839{
840 dout("prepare_read_connect %p\n", con);
841 con->in_base_pos = 0;
842}
843
844static void prepare_read_ack(struct ceph_connection *con)
845{
846 dout("prepare_read_ack %p\n", con);
847 con->in_base_pos = 0;
848}
849
850static void prepare_read_tag(struct ceph_connection *con)
851{
852 dout("prepare_read_tag %p\n", con);
853 con->in_base_pos = 0;
854 con->in_tag = CEPH_MSGR_TAG_READY;
855}
856
857/*
858 * Prepare to read a message.
859 */
860static int prepare_read_message(struct ceph_connection *con)
861{
862 dout("prepare_read_message %p\n", con);
863 BUG_ON(con->in_msg != NULL);
864 con->in_base_pos = 0;
865 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
866 return 0;
867}
868
869
870static int read_partial(struct ceph_connection *con,
871 int *to, int size, void *object)
872{
873 *to += size;
874 while (con->in_base_pos < *to) {
875 int left = *to - con->in_base_pos;
876 int have = size - left;
877 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
878 if (ret <= 0)
879 return ret;
880 con->in_base_pos += ret;
881 }
882 return 1;
883}
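read_partial() keeps one cumulative cursor (in_base_pos) across a sequence of fixed-size objects: each call extends *to by its object's size and receives only the still-missing tail, so a short read resumes later without re-reading. A userspace rendition against a plain buffer; fake_recv caps each delivery to mimic a slow socket, and all names are illustrative:

/* Resumable reads with one cumulative cursor, mirroring read_partial(). */
#include <assert.h>
#include <string.h>

/* pretend "socket": deliver at most @cap bytes per call from @src */
static int fake_recv(const char *src, int pos, void *dst, int left, int cap)
{
	int n = left < cap ? left : cap;
	memcpy(dst, src + pos, n);
	return n;
}

static int read_partial(const char *src, int *pos, int *to,
			int size, void *obj, int cap)
{
	*to += size;
	while (*pos < *to) {
		int left = *to - *pos;
		int have = size - left;
		int ret = fake_recv(src, *pos, (char *)obj + have, left, cap);
		if (ret <= 0)
			return ret;     /* would retry later on a real socket */
		*pos += ret;
	}
	return 1;                       /* object complete */
}

int main(void)
{
	const char wire[8] = "AAAABBBB";
	char a[4], b[4];
	int pos = 0, to = 0;

	assert(read_partial(wire, &pos, &to, 4, a, 3) == 1);
	assert(read_partial(wire, &pos, &to, 4, b, 3) == 1); /* resumes via to */
	assert(!memcmp(a, "AAAA", 4) && !memcmp(b, "BBBB", 4));
	return 0;
}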
884
885
886/*
887 * Read all or part of the connect-side handshake on a new connection
888 */
889static int read_partial_banner(struct ceph_connection *con)
890{
891 int ret, to = 0;
892
893 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
894
895 /* peer's banner */
896 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
897 if (ret <= 0)
898 goto out;
899 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
900 &con->actual_peer_addr);
901 if (ret <= 0)
902 goto out;
903 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
904 &con->peer_addr_for_me);
905 if (ret <= 0)
906 goto out;
907out:
908 return ret;
909}
910
911static int read_partial_connect(struct ceph_connection *con)
912{
913 int ret, to = 0;
914
915 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
916
917 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
918 if (ret <= 0)
919 goto out;
920 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
921 con->auth_reply_buf);
922 if (ret <= 0)
923 goto out;
924
925 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
926 con, (int)con->in_reply.tag,
927 le32_to_cpu(con->in_reply.connect_seq),
928 le32_to_cpu(con->in_reply.global_seq));
929out:
930 return ret;
931
932}
933
934/*
935 * Verify the hello banner looks okay.
936 */
937static int verify_hello(struct ceph_connection *con)
938{
939 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
940 pr_err("connect to %s got bad banner\n",
941 pr_addr(&con->peer_addr.in_addr));
942 con->error_msg = "protocol error, bad banner";
943 return -1;
944 }
945 return 0;
946}
947
948static bool addr_is_blank(struct sockaddr_storage *ss)
949{
950 switch (ss->ss_family) {
951 case AF_INET:
952 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
953 case AF_INET6:
954 return
955 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
956 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
957 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
958 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
959 }
960 return false;
961}
962
963static int addr_port(struct sockaddr_storage *ss)
964{
965 switch (ss->ss_family) {
966 case AF_INET:
967 return ntohs(((struct sockaddr_in *)ss)->sin_port);
968 case AF_INET6:
969 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
970 }
971 return 0;
972}
973
974static void addr_set_port(struct sockaddr_storage *ss, int p)
975{
976 switch (ss->ss_family) {
977 case AF_INET:
978 ((struct sockaddr_in *)ss)->sin_port = htons(p); break;
979 case AF_INET6:
980 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); break;
981 }
982}
983
984/*
985 * Parse an ip[:port] list into an addr array. Use the default
986 * monitor port if a port isn't specified.
987 */
988int ceph_parse_ips(const char *c, const char *end,
989 struct ceph_entity_addr *addr,
990 int max_count, int *count)
991{
992 int i;
993 const char *p = c;
994
995 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
996 for (i = 0; i < max_count; i++) {
997 const char *ipend;
998 struct sockaddr_storage *ss = &addr[i].in_addr;
999 struct sockaddr_in *in4 = (void *)ss;
1000 struct sockaddr_in6 *in6 = (void *)ss;
1001 int port;
1002 char delim = ',';
1003
1004 if (*p == '[') {
1005 delim = ']';
1006 p++;
1007 }
1008
1009 memset(ss, 0, sizeof(*ss));
1010 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1011 delim, &ipend))
1012 ss->ss_family = AF_INET;
1013 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1014 delim, &ipend))
1015 ss->ss_family = AF_INET6;
1016 else
1017 goto bad;
1018 p = ipend;
1019
1020 if (delim == ']') {
1021 if (*p != ']') {
1022 dout("missing matching ']'\n");
1023 goto bad;
1024 }
1025 p++;
1026 }
1027
1028 /* port? */
1029 if (p < end && *p == ':') {
1030 port = 0;
1031 p++;
1032 while (p < end && *p >= '0' && *p <= '9') {
1033 port = (port * 10) + (*p - '0');
1034 p++;
1035 }
1036 if (port > 65535 || port == 0)
1037 goto bad;
1038 } else {
1039 port = CEPH_MON_PORT;
1040 }
1041
1042 addr_set_port(ss, port);
1043
1044 dout("parse_ips got %s\n", pr_addr(ss));
1045
1046 if (p == end)
1047 break;
1048 if (*p != ',')
1049 goto bad;
1050 p++;
1051 }
1052
1053 if (p != end)
1054 goto bad;
1055
1056 if (count)
1057 *count = i + 1;
1058 return 0;
1059
1060bad:
1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1062 return -EINVAL;
1063}
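ceph_parse_ips() leans on the kernel's in4_pton()/in6_pton(); a stripped-down userspace equivalent for IPv4-only "ip[:port]" lists looks like the sketch below. DEFAULT_PORT and the helper names are assumptions, and the bracketed IPv6 syntax handled above is omitted:

/* IPv4-only sketch of the ip[:port] list parsing done above. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEFAULT_PORT 6789   /* assumed default, standing in for CEPH_MON_PORT */

static int parse_one(char *tok, struct sockaddr_in *sa)
{
	char *colon = strchr(tok, ':');
	long port = DEFAULT_PORT;

	if (colon) {
		*colon = '\0';          /* tok now holds just the address */
		port = strtol(colon + 1, NULL, 10);
		if (port <= 0 || port > 65535)
			return -1;
	}
	memset(sa, 0, sizeof(*sa));
	sa->sin_family = AF_INET;
	sa->sin_port = htons((uint16_t)port);
	return inet_pton(AF_INET, tok, &sa->sin_addr) == 1 ? 0 : -1;
}

int main(void)
{
	char list[] = "10.0.0.1:3300,10.0.0.2";
	struct sockaddr_in sa;

	for (char *tok = strtok(list, ","); tok; tok = strtok(NULL, ","))
		if (parse_one(tok, &sa) == 0)
			printf("%s -> port %u\n", tok, ntohs(sa.sin_port));
	return 0;
}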
1064
1065static int process_banner(struct ceph_connection *con)
1066{
1067 dout("process_banner on %p\n", con);
1068
1069 if (verify_hello(con) < 0)
1070 return -1;
1071
1072 ceph_decode_addr(&con->actual_peer_addr);
1073 ceph_decode_addr(&con->peer_addr_for_me);
1074
1075 /*
1076 * Make sure the other end is who we wanted. note that the other
1077 * end may not yet know their ip address, so if it's 0.0.0.0, give
1078 * them the benefit of the doubt.
1079 */
1080 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1081 sizeof(con->peer_addr)) != 0 &&
1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1084 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1085 pr_addr(&con->peer_addr.in_addr),
1086 le64_to_cpu(con->peer_addr.nonce),
1087 pr_addr(&con->actual_peer_addr.in_addr),
1088 le64_to_cpu(con->actual_peer_addr.nonce));
1089 con->error_msg = "wrong peer at address";
1090 return -1;
1091 }
1092
1093 /*
1094 * did we learn our address?
1095 */
1096 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1097 int port = addr_port(&con->msgr->inst.addr.in_addr);
1098
1099 memcpy(&con->msgr->inst.addr.in_addr,
1100 &con->peer_addr_for_me.in_addr,
1101 sizeof(con->peer_addr_for_me.in_addr));
1102 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1103 encode_my_addr(con->msgr);
1104 dout("process_banner learned my addr is %s\n",
1105 pr_addr(&con->msgr->inst.addr.in_addr));
1106 }
1107
1108 set_bit(NEGOTIATING, &con->state);
1109 prepare_read_connect(con);
1110 return 0;
1111}
1112
1113static void fail_protocol(struct ceph_connection *con)
1114{
1115 reset_connection(con);
1116 set_bit(CLOSED, &con->state); /* in case there's queued work */
1117
1118 mutex_unlock(&con->mutex);
1119 if (con->ops->bad_proto)
1120 con->ops->bad_proto(con);
1121 mutex_lock(&con->mutex);
1122}
1123
1124static int process_connect(struct ceph_connection *con)
1125{
1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
1127 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
1128 u64 server_feat = le64_to_cpu(con->in_reply.features);
1129
1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1131
1132 switch (con->in_reply.tag) {
1133 case CEPH_MSGR_TAG_FEATURES:
1134 pr_err("%s%lld %s feature set mismatch,"
1135 " my %llx < server's %llx, missing %llx\n",
1136 ENTITY_NAME(con->peer_name),
1137 pr_addr(&con->peer_addr.in_addr),
1138 sup_feat, server_feat, server_feat & ~sup_feat);
1139 con->error_msg = "missing required protocol features";
1140 fail_protocol(con);
1141 return -1;
1142
1143 case CEPH_MSGR_TAG_BADPROTOVER:
1144 pr_err("%s%lld %s protocol version mismatch,"
1145 " my %d != server's %d\n",
1146 ENTITY_NAME(con->peer_name),
1147 pr_addr(&con->peer_addr.in_addr),
1148 le32_to_cpu(con->out_connect.protocol_version),
1149 le32_to_cpu(con->in_reply.protocol_version));
1150 con->error_msg = "protocol version mismatch";
1151 fail_protocol(con);
1152 return -1;
1153
1154 case CEPH_MSGR_TAG_BADAUTHORIZER:
1155 con->auth_retry++;
1156 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1157 con->auth_retry);
1158 if (con->auth_retry == 2) {
1159 con->error_msg = "connect authorization failure";
1160 reset_connection(con);
1161 set_bit(CLOSED, &con->state);
1162 return -1;
1163 }
1164 con->auth_retry = 1;
1165 prepare_write_connect(con->msgr, con, 0);
1166 prepare_read_connect(con);
1167 break;
1168
1169 case CEPH_MSGR_TAG_RESETSESSION:
1170 /*
1171 * If we connected with a large connect_seq but the peer
1172 * has no record of a session with us (no connection, or
1173 * connect_seq == 0), they will send RESETSESSION to indicate
1174 * that they must have reset their session, and may have
1175 * dropped messages.
1176 */
1177 dout("process_connect got RESET peer seq %u\n",
1178 le32_to_cpu(con->in_connect.connect_seq));
1179 pr_err("%s%lld %s connection reset\n",
1180 ENTITY_NAME(con->peer_name),
1181 pr_addr(&con->peer_addr.in_addr));
1182 reset_connection(con);
1183 prepare_write_connect(con->msgr, con, 0);
1184 prepare_read_connect(con);
1185
1186 /* Tell ceph about it. */
1187 mutex_unlock(&con->mutex);
1188 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1189 if (con->ops->peer_reset)
1190 con->ops->peer_reset(con);
1191 mutex_lock(&con->mutex);
1192 break;
1193
1194 case CEPH_MSGR_TAG_RETRY_SESSION:
1195 /*
1196 * If we sent a smaller connect_seq than the peer has, try
1197 * again with a larger value.
1198 */
1199 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1200 le32_to_cpu(con->out_connect.connect_seq),
1201 le32_to_cpu(con->in_connect.connect_seq));
1202 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1203 prepare_write_connect(con->msgr, con, 0);
1204 prepare_read_connect(con);
1205 break;
1206
1207 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1208 /*
1209 * If we sent a smaller global_seq than the peer has, try
1210 * again with a larger value.
1211 */
1212 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1213 con->peer_global_seq,
1214 le32_to_cpu(con->in_connect.global_seq));
1215 get_global_seq(con->msgr,
1216 le32_to_cpu(con->in_connect.global_seq));
1217 prepare_write_connect(con->msgr, con, 0);
1218 prepare_read_connect(con);
1219 break;
1220
1221 case CEPH_MSGR_TAG_READY:
1222 if (req_feat & ~server_feat) {
1223 pr_err("%s%lld %s protocol feature mismatch,"
1224 " my required %llx > server's %llx, need %llx\n",
1225 ENTITY_NAME(con->peer_name),
1226 pr_addr(&con->peer_addr.in_addr),
1227 req_feat, server_feat, req_feat & ~server_feat);
1228 con->error_msg = "missing required protocol features";
1229 fail_protocol(con);
1230 return -1;
1231 }
1232 clear_bit(CONNECTING, &con->state);
1233 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1234 con->connect_seq++;
1235 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq),
1239 con->connect_seq);
1240 WARN_ON(con->connect_seq !=
1241 le32_to_cpu(con->in_reply.connect_seq));
1242
1243 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1244 set_bit(LOSSYTX, &con->state);
1245
1246 prepare_read_tag(con);
1247 break;
1248
1249 case CEPH_MSGR_TAG_WAIT:
1250 /*
1251 * If there is a connection race (we are opening
1252 * connections to each other), one of us may just have
1253 * to WAIT. This shouldn't happen if we are the
1254 * client.
1255 */
1256 pr_err("process_connect peer connecting WAIT\n");
1257 /* fall through: treat WAIT as a protocol error on the client */
1258 default:
1259 pr_err("connect protocol error, will retry\n");
1260 con->error_msg = "protocol error, garbage tag during connect";
1261 return -1;
1262 }
1263 return 0;
1264}
1265
1266
1267/*
1268 * read (part of) an ack
1269 */
1270static int read_partial_ack(struct ceph_connection *con)
1271{
1272 int to = 0;
1273
1274 return read_partial(con, &to, sizeof(con->in_temp_ack),
1275 &con->in_temp_ack);
1276}
1277
1278
1279/*
1280 * We can finally discard anything that's been acked.
1281 */
1282static void process_ack(struct ceph_connection *con)
1283{
1284 struct ceph_msg *m;
1285 u64 ack = le64_to_cpu(con->in_temp_ack);
1286 u64 seq;
1287
1288 while (!list_empty(&con->out_sent)) {
1289 m = list_first_entry(&con->out_sent, struct ceph_msg,
1290 list_head);
1291 seq = le64_to_cpu(m->hdr.seq);
1292 if (seq > ack)
1293 break;
1294 dout("got ack for seq %llu type %d at %p\n", seq,
1295 le16_to_cpu(m->hdr.type), m);
1296 ceph_msg_remove(m);
1297 }
1298 prepare_read_tag(con);
1299}
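process_ack() exploits the fact that out_sent is kept in ascending seq order: everything at the head with seq <= ack has been acknowledged and can be freed immediately. The pattern in isolation, on a singly linked list, with free() standing in for ceph_msg_put():

/* Discard everything with seq <= ack from a sent-but-unacked list kept
 * in ascending seq order, mirroring process_ack() above. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct msg {
	uint64_t seq;
	struct msg *next;
};

static void process_ack(struct msg **sent, uint64_t ack)
{
	while (*sent && (*sent)->seq <= ack) {
		struct msg *m = *sent;
		*sent = m->next;
		free(m);          /* ceph_msg_put() in the original */
	}
}

int main(void)
{
	struct msg *head = NULL, **tail = &head;

	for (uint64_t s = 1; s <= 5; s++) {
		struct msg *m = calloc(1, sizeof(*m));
		m->seq = s;
		*tail = m;
		tail = &m->next;
	}
	process_ack(&head, 3);
	assert(head && head->seq == 4);   /* 1..3 were discarded */
	return 0;
}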
1300
1301
1302
1303
1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section, unsigned int sec_len,
1306 u32 *crc)
1307{
1308 int left;
1309 int ret;
1310
1311 BUG_ON(!section);
1312
1313 while (section->iov_len < sec_len) {
1314 BUG_ON(section->iov_base == NULL);
1315 left = sec_len - section->iov_len;
1316 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1317 section->iov_len, left);
1318 if (ret <= 0)
1319 return ret;
1320 section->iov_len += ret;
1321 if (section->iov_len == sec_len)
1322 *crc = crc32c(0, section->iov_base,
1323 section->iov_len);
1324 }
1325
1326 return 1;
1327}
1328
1329static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1330 struct ceph_msg_header *hdr,
1331 int *skip);
1332/*
1333 * read (part of) a message.
1334 */
1335static int read_partial_message(struct ceph_connection *con)
1336{
1337 struct ceph_msg *m = con->in_msg;
1338 void *p;
1339 int ret;
1340 int to, left;
1341 unsigned front_len, middle_len, data_len, data_off;
1342 int datacrc = con->msgr->nocrc;
1343 int skip;
1344 u64 seq;
1345
1346 dout("read_partial_message con %p msg %p\n", con, m);
1347
1348 /* header */
1349 while (con->in_base_pos < sizeof(con->in_hdr)) {
1350 left = sizeof(con->in_hdr) - con->in_base_pos;
1351 ret = ceph_tcp_recvmsg(con->sock,
1352 (char *)&con->in_hdr + con->in_base_pos,
1353 left);
1354 if (ret <= 0)
1355 return ret;
1356 con->in_base_pos += ret;
1357 if (con->in_base_pos == sizeof(con->in_hdr)) {
1358 u32 crc = crc32c(0, (void *)&con->in_hdr,
1359 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1360 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1361 pr_err("read_partial_message bad hdr "
1362 "crc %u != expected %u\n",
1363 crc, le32_to_cpu(con->in_hdr.crc));
1364 return -EBADMSG;
1365 }
1366 }
1367 }
1368 front_len = le32_to_cpu(con->in_hdr.front_len);
1369 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1370 return -EIO;
1371 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1372 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1373 return -EIO;
1374 data_len = le32_to_cpu(con->in_hdr.data_len);
1375 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1376 return -EIO;
1377 data_off = le16_to_cpu(con->in_hdr.data_off);
1378
1379 /* verify seq# */
1380 seq = le64_to_cpu(con->in_hdr.seq);
1381 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer);
1388 con->in_tag = CEPH_MSGR_TAG_READY;
1389 con->in_seq++;
1390 return 0;
1391 } else if ((s64)seq - (s64)con->in_seq > 1) {
1392 pr_err("read_partial_message bad seq %lld expected %lld\n",
1393 seq, con->in_seq + 1);
1394 con->error_msg = "bad message sequence # for incoming message";
1395 return -EBADMSG;
1396 }
1397
1398 /* allocate message? */
1399 if (!con->in_msg) {
1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 skip = 0;
1403 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1404 if (skip) {
1405 /* skip this message */
1406 dout("alloc_msg said skip message\n");
1407 BUG_ON(con->in_msg);
1408 con->in_base_pos = -front_len - middle_len - data_len -
1409 sizeof(m->footer);
1410 con->in_tag = CEPH_MSGR_TAG_READY;
1411 con->in_seq++;
1412 return 0;
1413 }
1414 if (!con->in_msg) {
1415 con->error_msg =
1416 "error allocating memory for incoming message";
1417 return -ENOMEM;
1418 }
1419 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */
1421 if (m->middle)
1422 m->middle->vec.iov_len = 0;
1423
1424 con->in_msg_pos.page = 0;
1425 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1426 con->in_msg_pos.data_pos = 0;
1427 }
1428
1429 /* front */
1430 ret = read_partial_message_section(con, &m->front, front_len,
1431 &con->in_front_crc);
1432 if (ret <= 0)
1433 return ret;
1434
1435 /* middle */
1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1438 &con->in_middle_crc);
1439 if (ret <= 0)
1440 return ret;
1441 }
1442
1443 /* (page) data */
1444 while (con->in_msg_pos.data_pos < data_len) {
1445 left = min((int)(data_len - con->in_msg_pos.data_pos),
1446 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1447 BUG_ON(m->pages == NULL);
1448 p = kmap(m->pages[con->in_msg_pos.page]);
1449 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1450 left);
1451 if (ret > 0 && datacrc)
1452 con->in_data_crc =
1453 crc32c(con->in_data_crc,
1454 p + con->in_msg_pos.page_pos, ret);
1455 kunmap(m->pages[con->in_msg_pos.page]);
1456 if (ret <= 0)
1457 return ret;
1458 con->in_msg_pos.data_pos += ret;
1459 con->in_msg_pos.page_pos += ret;
1460 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1461 con->in_msg_pos.page_pos = 0;
1462 con->in_msg_pos.page++;
1463 }
1464 }
1465
1466 /* footer */
1467 to = sizeof(m->hdr) + sizeof(m->footer);
1468 while (con->in_base_pos < to) {
1469 left = to - con->in_base_pos;
1470 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1471 (con->in_base_pos - sizeof(m->hdr)),
1472 left);
1473 if (ret <= 0)
1474 return ret;
1475 con->in_base_pos += ret;
1476 }
1477 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1478 m, front_len, m->footer.front_crc, middle_len,
1479 m->footer.middle_crc, data_len, m->footer.data_crc);
1480
1481 /* crc ok? */
1482 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1483 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1484 m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
1485 return -EBADMSG;
1486 }
1487 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1488 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1489 m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
1490 return -EBADMSG;
1491 }
1492 if (datacrc &&
1493 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1494 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1495 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1496 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1497 return -EBADMSG;
1498 }
1499
1500 return 1; /* done! */
1501}
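The sequence check near the top of the function distinguishes three cases: an old duplicate (quietly skipped by consuming and discarding its bytes), the expected next message, and a gap, which is fatal for the connection. As a small decision helper using the same wrap-safe signed comparison:

/* Classify an incoming seq against the last accepted one, matching the
 * policy in read_partial_message() above. */
#include <assert.h>
#include <stdint.h>

enum seq_verdict { SEQ_ACCEPT, SEQ_SKIP_DUP, SEQ_BAD_GAP };

static enum seq_verdict check_seq(uint64_t in_seq, uint64_t seq)
{
	if ((int64_t)(seq - in_seq) < 1)
		return SEQ_SKIP_DUP;    /* old or duplicate: discard quietly */
	if ((int64_t)(seq - in_seq) > 1)
		return SEQ_BAD_GAP;     /* lost a message: fatal on this conn */
	return SEQ_ACCEPT;              /* seq == in_seq + 1 */
}

int main(void)
{
	assert(check_seq(10, 10) == SEQ_SKIP_DUP);
	assert(check_seq(10, 11) == SEQ_ACCEPT);
	assert(check_seq(10, 13) == SEQ_BAD_GAP);
	return 0;
}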
1502
1503/*
1504 * Process message. This happens in the worker thread. The callback should
1505 * be careful not to do anything that waits on other incoming messages or it
1506 * may deadlock.
1507 */
1508static void process_message(struct ceph_connection *con)
1509{
1510 struct ceph_msg *msg;
1511
1512 msg = con->in_msg;
1513 con->in_msg = NULL;
1514
1515 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src;
1518
1519 con->in_seq++;
1520 mutex_unlock(&con->mutex);
1521
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src),
1525 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len),
1528 le32_to_cpu(msg->hdr.data_len),
1529 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1530 con->ops->dispatch(con, msg);
1531
1532 mutex_lock(&con->mutex);
1533 prepare_read_tag(con);
1534}
1535
1536
1537/*
1538 * Write something to the socket. Called in a worker thread when the
1539 * socket appears to be writeable and we have something ready to send.
1540 */
1541static int try_write(struct ceph_connection *con)
1542{
1543 struct ceph_messenger *msgr = con->msgr;
1544 int ret = 1;
1545
1546 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref));
1548
1549more:
1550 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1551
1552 /* open the socket first? */
1553 if (con->sock == NULL) {
1554 /*
1555 * if we were STANDBY and are reconnecting _this_
1556 * connection, bump connect_seq now. Always bump
1557 * global_seq.
1558 */
1559 if (test_and_clear_bit(STANDBY, &con->state))
1560 con->connect_seq++;
1561
1562 prepare_write_banner(msgr, con);
1563 prepare_write_connect(msgr, con, 1);
1564 prepare_read_banner(con);
1565 set_bit(CONNECTING, &con->state);
1566 clear_bit(NEGOTIATING, &con->state);
1567
1568 BUG_ON(con->in_msg);
1569 con->in_tag = CEPH_MSGR_TAG_READY;
1570 dout("try_write initiating connect on %p new state %lu\n",
1571 con, con->state);
1572 con->sock = ceph_tcp_connect(con);
1573 if (IS_ERR(con->sock)) {
1574 con->sock = NULL;
1575 con->error_msg = "connect error";
1576 ret = -1;
1577 goto out;
1578 }
1579 }
1580
1581more_kvec:
1582 /* kvec data queued? */
1583 if (con->out_skip) {
1584 ret = write_partial_skip(con);
1585 if (ret <= 0)
1586 goto done;
1587 if (ret < 0) {
1588 dout("try_write write_partial_skip err %d\n", ret);
1589 goto done;
1590 }
1591 }
1592 if (con->out_kvec_left) {
1593 ret = write_partial_kvec(con);
1594 if (ret <= 0)
1595 goto done;
1596 }
1597
1598 /* msg pages? */
1599 if (con->out_msg) {
1600 if (con->out_msg_done) {
1601 ceph_msg_put(con->out_msg);
1602 con->out_msg = NULL; /* we're done with this one */
1603 goto do_next;
1604 }
1605
1606 ret = write_partial_msg_pages(con);
1607 if (ret == 1)
1608 goto more_kvec; /* we need to send the footer, too! */
1609 if (ret == 0)
1610 goto done;
1611 if (ret < 0) {
1612 dout("try_write write_partial_msg_pages err %d\n",
1613 ret);
1614 goto done;
1615 }
1616 }
1617
1618do_next:
1619 if (!test_bit(CONNECTING, &con->state)) {
1620 /* is anything else pending? */
1621 if (!list_empty(&con->out_queue)) {
1622 prepare_write_message(con);
1623 goto more;
1624 }
1625 if (con->in_seq > con->in_seq_acked) {
1626 prepare_write_ack(con);
1627 goto more;
1628 }
1629 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1630 prepare_write_keepalive(con);
1631 goto more;
1632 }
1633 }
1634
1635 /* Nothing to do! */
1636 clear_bit(WRITE_PENDING, &con->state);
1637 dout("try_write nothing else to write.\n");
1638done:
1639 ret = 0;
1640out:
1641 dout("try_write done on %p\n", con);
1642 return ret;
1643}
1644
1645
1646
1647/*
1648 * Read what we can from the socket.
1649 */
1650static int try_read(struct ceph_connection *con)
1651{
1652 int ret = -1;
1653
1654 if (!con->sock)
1655 return 0;
1656
1657 if (test_bit(STANDBY, &con->state))
1658 return 0;
1659
1660 dout("try_read start on %p\n", con);
1661
1662more:
1663 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1664 con->in_base_pos);
1665 if (test_bit(CONNECTING, &con->state)) {
1666 if (!test_bit(NEGOTIATING, &con->state)) {
1667 dout("try_read connecting\n");
1668 ret = read_partial_banner(con);
1669 if (ret <= 0)
1670 goto done;
1671 if (process_banner(con) < 0) {
1672 ret = -1;
1673 goto out;
1674 }
1675 }
1676 ret = read_partial_connect(con);
1677 if (ret <= 0)
1678 goto done;
1679 if (process_connect(con) < 0) {
1680 ret = -1;
1681 goto out;
1682 }
1683 goto more;
1684 }
1685
1686 if (con->in_base_pos < 0) {
1687 /*
1688 * skipping + discarding content.
1689 *
1690 * FIXME: there must be a better way to do this!
1691 */
1692 static char buf[1024];
1693 int skip = min(1024, -con->in_base_pos);
1694 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1695 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1696 if (ret <= 0)
1697 goto done;
1698 con->in_base_pos += ret;
1699 if (con->in_base_pos)
1700 goto more;
1701 }
1702 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1703 /*
1704 * what's next?
1705 */
1706 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1707 if (ret <= 0)
1708 goto done;
1709 dout("try_read got tag %d\n", (int)con->in_tag);
1710 switch (con->in_tag) {
1711 case CEPH_MSGR_TAG_MSG:
1712 prepare_read_message(con);
1713 break;
1714 case CEPH_MSGR_TAG_ACK:
1715 prepare_read_ack(con);
1716 break;
1717 case CEPH_MSGR_TAG_CLOSE:
1718 set_bit(CLOSED, &con->state); /* fixme */
1719 goto done;
1720 default:
1721 goto bad_tag;
1722 }
1723 }
1724 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1725 ret = read_partial_message(con);
1726 if (ret <= 0) {
1727 switch (ret) {
1728 case -EBADMSG:
1729 con->error_msg = "bad crc";
1730 ret = -EIO;
1731 goto out;
1732 case -EIO:
1733 con->error_msg = "io error";
1734 goto out;
1735 default:
1736 goto done;
1737 }
1738 }
1739 if (con->in_tag == CEPH_MSGR_TAG_READY)
1740 goto more;
1741 process_message(con);
1742 goto more;
1743 }
1744 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1745 ret = read_partial_ack(con);
1746 if (ret <= 0)
1747 goto done;
1748 process_ack(con);
1749 goto more;
1750 }
1751
1752done:
1753 ret = 0;
1754out:
1755 dout("try_read done on %p\n", con);
1756 return ret;
1757
1758bad_tag:
1759 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1760 con->error_msg = "protocol error, garbage tag";
1761 ret = -1;
1762 goto out;
1763}
1764
1765
1766/*
1767 * Atomically queue work on a connection. Bump @con reference to
1768 * avoid races with connection teardown.
1769 *
1770 * There is some trickery going on with QUEUED and BUSY because we
1771 * only want a _single_ thread operating on each connection at any
1772 * point in time, but we want to use all available CPUs.
1773 *
1774 * The worker thread only proceeds if it can atomically set BUSY. It
1775 * clears QUEUED and does its thing. When it thinks it's done, it
1776 * clears BUSY, then rechecks QUEUED; if it's set again, it loops
1777 * (tries again to set BUSY).
1778 *
1779 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1780 * try to queue work. If that fails (the work is already queued, or
1781 * BUSY is set), we give up but leave QUEUED set so that the worker
1782 * thread will loop if necessary.
1783 */
1784static void queue_con(struct ceph_connection *con)
1785{
1786 if (test_bit(DEAD, &con->state)) {
1787 dout("queue_con %p ignoring: DEAD\n",
1788 con);
1789 return;
1790 }
1791
1792 if (!con->ops->get(con)) {
1793 dout("queue_con %p ref count 0\n", con);
1794 return;
1795 }
1796
1797 set_bit(QUEUED, &con->state);
1798 if (test_bit(BUSY, &con->state)) {
1799 dout("queue_con %p - already BUSY\n", con);
1800 con->ops->put(con);
1801 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1802 dout("queue_con %p - already queued\n", con);
1803 con->ops->put(con);
1804 } else {
1805 dout("queue_con %p\n", con);
1806 }
1807}
1808
1809/*
1810 * Do some work on a connection. Drop a connection ref when we're done.
1811 */
1812static void con_work(struct work_struct *work)
1813{
1814 struct ceph_connection *con = container_of(work, struct ceph_connection,
1815 work.work);
1816 int backoff = 0;
1817
1818more:
1819 if (test_and_set_bit(BUSY, &con->state) != 0) {
1820 dout("con_work %p BUSY already set\n", con);
1821 goto out;
1822 }
1823 dout("con_work %p start, clearing QUEUED\n", con);
1824 clear_bit(QUEUED, &con->state);
1825
1826 mutex_lock(&con->mutex);
1827
1828 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1829 dout("con_work CLOSED\n");
1830 con_close_socket(con);
1831 goto done;
1832 }
1833 if (test_and_clear_bit(OPENING, &con->state)) {
1834 /* reopen w/ new peer */
1835 dout("con_work OPENING\n");
1836 con_close_socket(con);
1837 }
1838
1839 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1840 try_read(con) < 0 ||
1841 try_write(con) < 0) {
1842 mutex_unlock(&con->mutex);
1843 backoff = 1;
1844 ceph_fault(con); /* error/fault path */
1845 goto done_unlocked;
1846 }
1847
1848done:
1849 mutex_unlock(&con->mutex);
1850
1851done_unlocked:
1852 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) {
1855 if (!backoff || test_bit(OPENING, &con->state)) {
1856 dout("con_work %p QUEUED reset, looping\n", con);
1857 goto more;
1858 }
1859 dout("con_work %p QUEUED reset, but just faulted\n", con);
1860 clear_bit(QUEUED, &con->state);
1861 }
1862 dout("con_work %p done\n", con);
1863
1864out:
1865 con->ops->put(con);
1866}
1867
1868
1869/*
1870 * Generic error/fault handler. A retry mechanism is used with
1871 * exponential backoff.
1872 */
1873static void ceph_fault(struct ceph_connection *con)
1874{
1875 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1876 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1877 dout("fault %p state %lu to peer %s\n",
1878 con, con->state, pr_addr(&con->peer_addr.in_addr));
1879
1880 if (test_bit(LOSSYTX, &con->state)) {
1881 dout("fault on LOSSYTX channel\n");
1882 goto out;
1883 }
1884
1885 mutex_lock(&con->mutex);
1886 if (test_bit(CLOSED, &con->state))
1887 goto out_unlock;
1888
1889 con_close_socket(con);
1890
1891 if (con->in_msg) {
1892 ceph_msg_put(con->in_msg);
1893 con->in_msg = NULL;
1894 }
1895
1896 /* Requeue anything that hasn't been acked */
1897 list_splice_init(&con->out_sent, &con->out_queue);
1898
1899 /* If there are no messages in the queue, place the connection
1900 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1901 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1902 dout("fault setting STANDBY\n");
1903 set_bit(STANDBY, &con->state);
1904 } else {
1905 /* retry after a delay. */
1906 if (con->delay == 0)
1907 con->delay = BASE_DELAY_INTERVAL;
1908 else if (con->delay < MAX_DELAY_INTERVAL)
1909 con->delay *= 2;
1910 dout("fault queueing %p delay %lu\n", con, con->delay);
1911 con->ops->get(con);
1912 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1913 round_jiffies_relative(con->delay)) == 0)
1914 con->ops->put(con);
1915 }
1916
1917out_unlock:
1918 mutex_unlock(&con->mutex);
1919out:
1920 /*
1921 * in case we faulted due to authentication, invalidate our
1922 * current tickets so that we can get new ones.
1923 */
1924 if (con->auth_retry && con->ops->invalidate_authorizer) {
1925 dout("calling invalidate_authorizer()\n");
1926 con->ops->invalidate_authorizer(con);
1927 }
1928
1929 if (con->ops->fault)
1930 con->ops->fault(con);
1931}
1932
1933
1934
1935/*
1936 * create a new messenger instance
1937 */
1938struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1939{
1940 struct ceph_messenger *msgr;
1941
1942 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1943 if (msgr == NULL)
1944 return ERR_PTR(-ENOMEM);
1945
1946 spin_lock_init(&msgr->global_seq_lock);
1947
1948 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */
1950 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) {
1952 kfree(msgr);
1953 return ERR_PTR(-ENOMEM);
1954 }
1955 kmap(msgr->zero_page);
1956
1957 if (myaddr)
1958 msgr->inst.addr = *myaddr;
1959
1960 /* select a random nonce */
1961 msgr->inst.addr.type = 0;
1962 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1963 encode_my_addr(msgr);
1964
1965 dout("messenger_create %p\n", msgr);
1966 return msgr;
1967}
1968
1969void ceph_messenger_destroy(struct ceph_messenger *msgr)
1970{
1971 dout("destroy %p\n", msgr);
1972 kunmap(msgr->zero_page);
1973 __free_page(msgr->zero_page);
1974 kfree(msgr);
1975 dout("destroyed messenger %p\n", msgr);
1976}
1977
1978/*
1979 * Queue up an outgoing message on the given connection.
1980 */
1981void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1982{
1983 if (test_bit(CLOSED, &con->state)) {
1984 dout("con_send %p closed, dropping %p\n", con, msg);
1985 ceph_msg_put(msg);
1986 return;
1987 }
1988
1989 /* set src+dst */
1990 msg->hdr.src = con->msgr->inst.name;
1991
1992 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1993
1994 msg->needs_out_seq = true;
1995
1996 /* queue */
1997 mutex_lock(&con->mutex);
1998 BUG_ON(!list_empty(&msg->list_head));
1999 list_add_tail(&msg->list_head, &con->out_queue);
2000 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2001 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2002 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2003 le32_to_cpu(msg->hdr.front_len),
2004 le32_to_cpu(msg->hdr.middle_len),
2005 le32_to_cpu(msg->hdr.data_len));
2006 mutex_unlock(&con->mutex);
2007
2008 /* if there wasn't anything waiting to send before, queue
2009 * new work */
2010 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2011 queue_con(con);
2012}
2013
2014/*
2015 * Revoke a message that was previously queued for send
2016 */
2017void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2018{
2019 mutex_lock(&con->mutex);
2020 if (!list_empty(&msg->list_head)) {
2021 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2022 list_del_init(&msg->list_head);
2023 ceph_msg_put(msg);
2024 msg->hdr.seq = 0;
2025 }
2026 if (con->out_msg == msg) {
2027 dout("con_revoke %p msg %p - was sending\n", con, msg);
2028 con->out_msg = NULL;
2029 if (con->out_kvec_is_msg) {
2030 con->out_skip = con->out_kvec_bytes;
2031 con->out_kvec_is_msg = false;
2032 }
2033 ceph_msg_put(msg);
2034 msg->hdr.seq = 0;
2035 }
2036 mutex_unlock(&con->mutex);
2037}
2038
2039/*
2040 * Revoke a message that we may be reading data into
2041 */
2042void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2043{
2044 mutex_lock(&con->mutex);
2045 if (con->in_msg && con->in_msg == msg) {
2046 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2047 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2048 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2049
2050 /* skip rest of message */
2051 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2052 con->in_base_pos = con->in_base_pos -
2053 sizeof(struct ceph_msg_header) -
2054 front_len -
2055 middle_len -
2056 data_len -
2057 sizeof(struct ceph_msg_footer);
2058 ceph_msg_put(con->in_msg);
2059 con->in_msg = NULL;
2060 con->in_tag = CEPH_MSGR_TAG_READY;
2061 con->in_seq++;
2062 } else {
2063 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2064 con, con->in_msg, msg);
2065 }
2066 mutex_unlock(&con->mutex);
2067}
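/*
 * Editor's note (illustrative, not in the original source): revoking a
 * partially read message works by driving con->in_base_pos negative by
 * the number of message bytes still on the wire; the discard loop in
 * try_read() then reads and throws away -in_base_pos bytes (in chunks
 * of up to 1024) before it looks for the next protocol tag.
 */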
2068
2069/*
2070 * Queue a keepalive byte to ensure the tcp connection is alive.
2071 */
2072void ceph_con_keepalive(struct ceph_connection *con)
2073{
2074 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2075 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2076 queue_con(con);
2077}
2078
2079
2080/*
2081 * construct a new message with given type, size
2082 * the new msg has a ref count of 1.
2083 */
2084struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2085{
2086 struct ceph_msg *m;
2087
2088 m = kmalloc(sizeof(*m), flags);
2089 if (m == NULL)
2090 goto out;
2091 kref_init(&m->kref);
2092 INIT_LIST_HEAD(&m->list_head);
2093
2094 m->hdr.tid = 0;
2095 m->hdr.type = cpu_to_le16(type);
2096 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2097 m->hdr.version = 0;
2098 m->hdr.front_len = cpu_to_le32(front_len);
2099 m->hdr.middle_len = 0;
2100 m->hdr.data_len = 0;
2101 m->hdr.data_off = 0;
2102 m->hdr.reserved = 0;
2103 m->footer.front_crc = 0;
2104 m->footer.middle_crc = 0;
2105 m->footer.data_crc = 0;
2106 m->footer.flags = 0;
2107 m->front_max = front_len;
2108 m->front_is_vmalloc = false;
2109 m->more_to_follow = false;
2110 m->pool = NULL;
2111
2112 /* front */
2113 if (front_len) {
2114 if (front_len > PAGE_CACHE_SIZE) {
2115 m->front.iov_base = __vmalloc(front_len, flags,
2116 PAGE_KERNEL);
2117 m->front_is_vmalloc = true;
2118 } else {
2119 m->front.iov_base = kmalloc(front_len, flags);
2120 }
2121 if (m->front.iov_base == NULL) {
2122 pr_err("msg_new can't allocate %d bytes\n",
2123 front_len);
2124 goto out2;
2125 }
2126 } else {
2127 m->front.iov_base = NULL;
2128 }
2129 m->front.iov_len = front_len;
2130
2131 /* middle */
2132 m->middle = NULL;
2133
2134 /* data */
2135 m->nr_pages = 0;
2136 m->pages = NULL;
2137 m->pagelist = NULL;
2138
2139 dout("ceph_msg_new %p front %d\n", m, front_len);
2140 return m;
2141
2142out2:
2143 ceph_msg_put(m);
2144out:
2145 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2146 return NULL;
2147}
2148
2149/*
2150 * Allocate "middle" portion of a message, if it is needed and wasn't
2151 * allocated by alloc_msg. This allows us to read a small fixed-size
2152 * per-type header in the front and then gracefully fail (i.e.,
2153 * propagate the error to the caller based on info in the front) when
2154 * the middle is too large.
2155 */
2156static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2157{
2158 int type = le16_to_cpu(msg->hdr.type);
2159 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2160
2161 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2162 ceph_msg_type_name(type), middle_len);
2163 BUG_ON(!middle_len);
2164 BUG_ON(msg->middle);
2165
2166 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2167 if (!msg->middle)
2168 return -ENOMEM;
2169 return 0;
2170}
2171
2172/*
2173 * Generic message allocator, for incoming messages.
2174 */
2175static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2176 struct ceph_msg_header *hdr,
2177 int *skip)
2178{
2179 int type = le16_to_cpu(hdr->type);
2180 int front_len = le32_to_cpu(hdr->front_len);
2181 int middle_len = le32_to_cpu(hdr->middle_len);
2182 struct ceph_msg *msg = NULL;
2183 int ret;
2184
2185 if (con->ops->alloc_msg) {
2186 mutex_unlock(&con->mutex);
2187 msg = con->ops->alloc_msg(con, hdr, skip);
2188 mutex_lock(&con->mutex);
2189 if (!msg || *skip)
2190 return NULL;
2191 }
2192 if (!msg) {
2193 *skip = 0;
2194 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2195 if (!msg) {
2196 pr_err("unable to allocate msg type %d len %d\n",
2197 type, front_len);
2198 return NULL;
2199 }
2200 }
2201 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2202
2203 if (middle_len && !msg->middle) {
2204 ret = ceph_alloc_middle(con, msg);
2205 if (ret < 0) {
2206 ceph_msg_put(msg);
2207 return NULL;
2208 }
2209 }
2210
2211 return msg;
2212}
2213
2214
2215/*
2216 * Free a generically kmalloc'd message.
2217 */
2218void ceph_msg_kfree(struct ceph_msg *m)
2219{
2220 dout("msg_kfree %p\n", m);
2221 if (m->front_is_vmalloc)
2222 vfree(m->front.iov_base);
2223 else
2224 kfree(m->front.iov_base);
2225 kfree(m);
2226}
2227
2228/*
2229 * Drop a msg ref. Destroy as needed.
2230 */
2231void ceph_msg_last_put(struct kref *kref)
2232{
2233 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2234
2235 dout("ceph_msg_put last one on %p\n", m);
2236 WARN_ON(!list_empty(&m->list_head));
2237
2238 /* drop middle, data, if any */
2239 if (m->middle) {
2240 ceph_buffer_put(m->middle);
2241 m->middle = NULL;
2242 }
2243 m->nr_pages = 0;
2244 m->pages = NULL;
2245
2246 if (m->pagelist) {
2247 ceph_pagelist_release(m->pagelist);
2248 kfree(m->pagelist);
2249 m->pagelist = NULL;
2250 }
2251
2252 if (m->pool)
2253 ceph_msgpool_put(m->pool, m);
2254 else
2255 ceph_msg_kfree(m);
2256}
2257
2258void ceph_msg_dump(struct ceph_msg *msg)
2259{
2260 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2261 msg->front_max, msg->nr_pages);
2262 print_hex_dump(KERN_DEBUG, "header: ",
2263 DUMP_PREFIX_OFFSET, 16, 1,
2264 &msg->hdr, sizeof(msg->hdr), true);
2265 print_hex_dump(KERN_DEBUG, " front: ",
2266 DUMP_PREFIX_OFFSET, 16, 1,
2267 msg->front.iov_base, msg->front.iov_len, true);
2268 if (msg->middle)
2269 print_hex_dump(KERN_DEBUG, "middle: ",
2270 DUMP_PREFIX_OFFSET, 16, 1,
2271 msg->middle->vec.iov_base,
2272 msg->middle->vec.iov_len, true);
2273 print_hex_dump(KERN_DEBUG, "footer: ",
2274 DUMP_PREFIX_OFFSET, 16, 1,
2275 &msg->footer, sizeof(msg->footer), true);
2276}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
deleted file mode 100644
index 76fbc957bc1..00000000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,253 +0,0 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52/* use format string %s%lld */
53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
54
55struct ceph_messenger {
56 struct ceph_entity_inst inst; /* my name+address */
57 struct ceph_entity_addr my_enc_addr;
58 struct page *zero_page; /* used in certain error cases */
59
60 bool nocrc;
61
62 /*
63 * the global_seq counts connections we (attempt to) initiate
64 * in order to disambiguate certain connect race conditions.
65 */
66 u32 global_seq;
67 spinlock_t global_seq_lock;
68};
69
70/*
71 * a single message. it contains a header (src, dest, message type, etc.),
72 * footer (crc values, mainly), a "front" message body, and possibly a
73 * data payload (stored in some number of pages).
74 */
75struct ceph_msg {
76 struct ceph_msg_header hdr; /* header */
77 struct ceph_msg_footer footer; /* footer */
78 struct kvec front; /* unaligned blobs of message */
79 struct ceph_buffer *middle;
80 struct page **pages; /* data payload. NOT OWNER. */
81 unsigned nr_pages; /* size of page array */
82 struct ceph_pagelist *pagelist; /* instead of pages */
83 struct list_head list_head;
84 struct kref kref;
85 bool front_is_vmalloc;
86 bool more_to_follow;
87 bool needs_out_seq;
88 int front_max;
89
90 struct ceph_msgpool *pool;
91};
92
93struct ceph_msg_pos {
94 int page, page_pos; /* which page; offset in page */
95 int data_pos; /* offset in data payload */
96 int did_page_crc; /* true if we've calculated crc for current page */
97};
98
99/* ceph connection fault delay defaults, for exponential backoff */
100#define BASE_DELAY_INTERVAL (HZ/2)
101#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
102
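/*
 * Editor's worked example (not in the original header): the fault path
 * in messenger.c starts at BASE_DELAY_INTERVAL and doubles the delay
 * on each consecutive fault, so with HZ-based jiffies retries fire
 * after roughly 0.5s, 1s, 2s, 4s, ... and the doubling stops once the
 * delay reaches MAX_DELAY_INTERVAL (about 5 minutes).
 */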
103/*
104 * ceph_connection state bit flags
105 *
106 * QUEUED and BUSY are used together to ensure that only a single
107 * thread is currently opening, reading or writing data to the socket.
108 */
109#define LOSSYTX 0 /* we can close channel or drop messages on errors */
110#define CONNECTING 1
111#define NEGOTIATING 2
112#define KEEPALIVE_PENDING 3
113#define WRITE_PENDING 4 /* we have data ready to send */
114#define QUEUED 5 /* there is work queued on this connection */
115#define BUSY 6 /* work is being done */
116#define STANDBY 8 /* no outgoing messages, socket closed. we keep
117 * the ceph_connection around to maintain shared
118 * state with the peer. */
119#define CLOSED 10 /* we've closed the connection */
120#define SOCK_CLOSED 11 /* socket state changed to closed */
121#define OPENING 13 /* open connection w/ (possibly new) peer */
122#define DEAD 14 /* dead, about to kfree */
123
124/*
125 * A single connection with another host.
126 *
127 * We maintain a queue of outgoing messages, and some session state to
128 * ensure that we can preserve the lossless, ordered delivery of
129 * messages in the case of a TCP disconnect.
130 */
131struct ceph_connection {
132 void *private;
133 atomic_t nref;
134
135 const struct ceph_connection_operations *ops;
136
137 struct ceph_messenger *msgr;
138 struct socket *sock;
139 unsigned long state; /* connection state (see flags above) */
140 const char *error_msg; /* error message, if any */
141
142 struct ceph_entity_addr peer_addr; /* peer address */
143 struct ceph_entity_name peer_name; /* peer name */
144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this session (client side) */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 bool out_keepalive_pending;
161
162 u64 in_seq, in_seq_acked; /* last message received, acked */
163
164 /* connection negotiation temps */
165 char in_banner[CEPH_BANNER_MAX_LEN];
166 union {
167 struct { /* outgoing connection */
168 struct ceph_msg_connect out_connect;
169 struct ceph_msg_connect_reply in_reply;
170 };
171 struct { /* incoming */
172 struct ceph_msg_connect in_connect;
173 struct ceph_msg_connect_reply out_reply;
174 };
175 };
176 struct ceph_entity_addr actual_peer_addr;
177
178 /* message out temps */
179 struct ceph_msg *out_msg; /* sending message (== tail of
180 out_sent) */
181 bool out_msg_done;
182 struct ceph_msg_pos out_msg_pos;
183
184 struct kvec out_kvec[8], /* sending header/footer data */
185 *out_kvec_cur;
186 int out_kvec_left; /* kvec's left in out_kvec */
187 int out_skip; /* skip this many bytes */
188 int out_kvec_bytes; /* total bytes left */
189 bool out_kvec_is_msg; /* kvec refers to out_msg */
190 int out_more; /* there is more data after the kvecs */
191 __le64 out_temp_ack; /* for writing an ack */
192
193 /* message in temps */
194 struct ceph_msg_header in_hdr;
195 struct ceph_msg *in_msg;
196 struct ceph_msg_pos in_msg_pos;
197 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
198
199 char in_tag; /* protocol control byte */
200 int in_base_pos; /* bytes read */
201 __le64 in_temp_ack; /* for reading an ack */
202
203 struct delayed_work work; /* send|recv work */
204 unsigned long delay; /* current delay interval */
205};
206
207
208extern const char *pr_addr(const struct sockaddr_storage *ss);
209extern int ceph_parse_ips(const char *c, const char *end,
210 struct ceph_entity_addr *addr,
211 int max_count, int *count);
212
213
214extern int ceph_msgr_init(void);
215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
237extern void ceph_msg_kfree(struct ceph_msg *m);
238
239
240static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
241{
242 kref_get(&msg->kref);
243 return msg;
244}
245extern void ceph_msg_last_put(struct kref *kref);
246static inline void ceph_msg_put(struct ceph_msg *msg)
247{
248 kref_put(&msg->kref, ceph_msg_last_put);
249}
250
251extern void ceph_msg_dump(struct ceph_msg *msg);
252
253#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
deleted file mode 100644
index 54fe01c5070..00000000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,882 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with the Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
108 ceph_msg_get(monc->m_auth); /* keep our ref */
109 ceph_con_send(monc->con, monc->m_auth);
110}
111
112/*
113 * Close monitor session, if any.
114 */
115static void __close_session(struct ceph_mon_client *monc)
116{
117 if (monc->con) {
118 dout("__close_session closing mon%d\n", monc->cur_mon);
119 ceph_con_revoke(monc->con, monc->m_auth);
120 ceph_con_close(monc->con);
121 monc->cur_mon = -1;
122 monc->pending_auth = 0;
123 ceph_auth_reset(monc->auth);
124 }
125}
126
127/*
128 * Open a session with a (new) monitor.
129 */
130static int __open_session(struct ceph_mon_client *monc)
131{
132 unsigned char r; /* avoid a negative modulo from a signed char */
133 int ret;
134
135 if (monc->cur_mon < 0) {
136 get_random_bytes(&r, 1);
137 monc->cur_mon = r % monc->monmap->num_mon;
138 dout("open_session num=%d r=%d -> mon%d\n",
139 monc->monmap->num_mon, r, monc->cur_mon);
140 monc->sub_sent = 0;
141 monc->sub_renew_after = jiffies; /* i.e., expired */
142 monc->want_next_osdmap = !!monc->want_next_osdmap;
143
144 dout("open_session mon%d opening\n", monc->cur_mon);
145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
147 ceph_con_open(monc->con,
148 &monc->monmap->mon_inst[monc->cur_mon].addr);
149
150 /* initiate authentication handshake */
151 ret = ceph_auth_build_hello(monc->auth,
152 monc->m_auth->front.iov_base,
153 monc->m_auth->front_max);
154 __send_prepared_auth_request(monc, ret);
155 } else {
156 dout("open_session mon%d already open\n", monc->cur_mon);
157 }
158 return 0;
159}
160
161static bool __sub_expired(struct ceph_mon_client *monc)
162{
163 return time_after_eq(jiffies, monc->sub_renew_after);
164}
165
166/*
167 * Reschedule delayed work timer.
168 */
169static void __schedule_delayed(struct ceph_mon_client *monc)
170{
171 unsigned delay;
172
173 if (monc->cur_mon < 0 || __sub_expired(monc))
174 delay = 10 * HZ;
175 else
176 delay = 20 * HZ;
177 dout("__schedule_delayed after %u\n", delay);
178 schedule_delayed_work(&monc->delayed_work, delay);
179}
180
181/*
182 * Send subscribe request for mdsmap and/or osdmap.
183 */
184static void __send_subscribe(struct ceph_mon_client *monc)
185{
186 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
187 (unsigned)monc->sub_sent, __sub_expired(monc),
188 monc->want_next_osdmap);
189 if ((__sub_expired(monc) && !monc->sub_sent) ||
190 monc->want_next_osdmap == 1) {
191 struct ceph_msg *msg = monc->m_subscribe;
192 struct ceph_mon_subscribe_item *i;
193 void *p, *end;
194
195 p = msg->front.iov_base;
196 end = p + msg->front_max;
197
198 dout("__send_subscribe to 'mdsmap' %u+\n",
199 (unsigned)monc->have_mdsmap);
200 if (monc->want_next_osdmap) {
201 dout("__send_subscribe to 'osdmap' %u\n",
202 (unsigned)monc->have_osdmap);
203 ceph_encode_32(&p, 3);
204 ceph_encode_string(&p, end, "osdmap", 6);
205 i = p;
206 i->have = cpu_to_le64(monc->have_osdmap);
207 i->onetime = 1;
208 p += sizeof(*i);
209 monc->want_next_osdmap = 2; /* requested */
210 } else {
211 ceph_encode_32(&p, 2);
212 }
213 ceph_encode_string(&p, end, "mdsmap", 6);
214 i = p;
215 i->have = cpu_to_le64(monc->have_mdsmap);
216 i->onetime = 0;
217 p += sizeof(*i);
218 ceph_encode_string(&p, end, "monmap", 6);
219 i = p;
220 i->have = 0;
221 i->onetime = 0;
222 p += sizeof(*i);
223
224 msg->front.iov_len = p - msg->front.iov_base;
225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
228
229 monc->sub_sent = jiffies | 1; /* never 0 */
230 }
231}
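/*
 * Editor's sketch of the payload built above (illustrative, derived
 * from the code): the subscribe message front is a 32-bit entry count
 * followed by (string, ceph_mon_subscribe_item) pairs:
 *
 *   u32 num;                                     2 or 3
 *   string "osdmap"; item { have, onetime=1 }    only if wanted
 *   string "mdsmap"; item { have, onetime=0 }
 *   string "monmap"; item { have=0, onetime=0 }
 */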
232
233static void handle_subscribe_ack(struct ceph_mon_client *monc,
234 struct ceph_msg *msg)
235{
236 unsigned seconds;
237 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
238
239 if (msg->front.iov_len < sizeof(*h))
240 goto bad;
241 seconds = le32_to_cpu(h->duration);
242
243 mutex_lock(&monc->mutex);
244 if (monc->hunting) {
245 pr_info("mon%d %s session established\n",
246 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
247 monc->hunting = false;
248 }
249 dout("handle_subscribe_ack after %d seconds\n", seconds);
250 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
251 monc->sub_sent = 0;
252 mutex_unlock(&monc->mutex);
253 return;
254bad:
255 pr_err("got corrupt subscribe-ack msg\n");
256 ceph_msg_dump(msg);
257}
258
259/*
260 * Keep track of which maps we have
261 */
262int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
263{
264 mutex_lock(&monc->mutex);
265 monc->have_mdsmap = got;
266 mutex_unlock(&monc->mutex);
267 return 0;
268}
269
270int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
271{
272 mutex_lock(&monc->mutex);
273 monc->have_osdmap = got;
274 monc->want_next_osdmap = 0;
275 mutex_unlock(&monc->mutex);
276 return 0;
277}
278
279/*
280 * Register interest in the next osdmap
281 */
282void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
283{
284 dout("request_next_osdmap have %u\n", monc->have_osdmap);
285 mutex_lock(&monc->mutex);
286 if (!monc->want_next_osdmap)
287 monc->want_next_osdmap = 1;
288 if (monc->want_next_osdmap < 2)
289 __send_subscribe(monc);
290 mutex_unlock(&monc->mutex);
291}
292
293/*
294 * Open a session with a monitor, allocating the connection if needed.
295 */
296int ceph_monc_open_session(struct ceph_mon_client *monc)
297{
298 if (!monc->con) {
299 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
300 if (!monc->con)
301 return -ENOMEM;
302 ceph_con_init(monc->client->msgr, monc->con);
303 monc->con->private = monc;
304 monc->con->ops = &mon_con_ops;
305 }
306
307 mutex_lock(&monc->mutex);
308 __open_session(monc);
309 __schedule_delayed(monc);
310 mutex_unlock(&monc->mutex);
311 return 0;
312}
313
314/*
314 * The monitor responds with a mount ack to indicate mount success. The
316 * included client ticket allows the client to talk to MDSs and OSDs.
317 */
318static void ceph_monc_handle_map(struct ceph_mon_client *monc,
319 struct ceph_msg *msg)
320{
321 struct ceph_client *client = monc->client;
322 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
323 void *p, *end;
324
325 mutex_lock(&monc->mutex);
326
327 dout("handle_monmap\n");
328 p = msg->front.iov_base;
329 end = p + msg->front.iov_len;
330
331 monmap = ceph_monmap_decode(p, end);
332 if (IS_ERR(monmap)) {
333 pr_err("problem decoding monmap, %d\n",
334 (int)PTR_ERR(monmap));
335 goto out;
336 }
337
338 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
339 kfree(monmap);
340 goto out;
341 }
342
343 client->monc.monmap = monmap;
344 kfree(old);
345
346out:
347 mutex_unlock(&monc->mutex);
348 wake_up_all(&client->auth_wq);
349}
350
351/*
352 * statfs
353 */
354static struct ceph_mon_generic_request *__lookup_generic_req(
355 struct ceph_mon_client *monc, u64 tid)
356{
357 struct ceph_mon_generic_request *req;
358 struct rb_node *n = monc->generic_request_tree.rb_node;
359
360 while (n) {
361 req = rb_entry(n, struct ceph_mon_generic_request, node);
362 if (tid < req->tid)
363 n = n->rb_left;
364 else if (tid > req->tid)
365 n = n->rb_right;
366 else
367 return req;
368 }
369 return NULL;
370}
371
372static void __insert_generic_request(struct ceph_mon_client *monc,
373 struct ceph_mon_generic_request *new)
374{
375 struct rb_node **p = &monc->generic_request_tree.rb_node;
376 struct rb_node *parent = NULL;
377 struct ceph_mon_generic_request *req = NULL;
378
379 while (*p) {
380 parent = *p;
381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
382 if (new->tid < req->tid)
383 p = &(*p)->rb_left;
384 else if (new->tid > req->tid)
385 p = &(*p)->rb_right;
386 else
387 BUG();
388 }
389
390 rb_link_node(&new->node, parent, p);
391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403
404 kfree(req);
405}
406
407static void put_generic_request(struct ceph_mon_generic_request *req)
408{
409 kref_put(&req->kref, release_generic_request);
410}
411
412static void get_generic_request(struct ceph_mon_generic_request *req)
413{
414 kref_get(&req->kref);
415}
416
417static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
418 struct ceph_msg_header *hdr,
419 int *skip)
420{
421 struct ceph_mon_client *monc = con->private;
422 struct ceph_mon_generic_request *req;
423 u64 tid = le64_to_cpu(hdr->tid);
424 struct ceph_msg *m;
425
426 mutex_lock(&monc->mutex);
427 req = __lookup_generic_req(monc, tid);
428 if (!req) {
429 dout("get_generic_reply %lld dne\n", tid);
430 *skip = 1;
431 m = NULL;
432 } else {
433 dout("get_generic_reply %lld got %p\n", tid, req->reply);
434 m = ceph_msg_get(req->reply);
435 /*
436 * we don't need to track the connection reading into
437 * this reply because we only have one open connection
438 * at a time, ever.
439 */
440 }
441 mutex_unlock(&monc->mutex);
442 return m;
443}
444
445static void handle_statfs_reply(struct ceph_mon_client *monc,
446 struct ceph_msg *msg)
447{
448 struct ceph_mon_generic_request *req;
449 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
450 u64 tid = le64_to_cpu(msg->hdr.tid);
451
452 if (msg->front.iov_len != sizeof(*reply))
453 goto bad;
454 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
455
456 mutex_lock(&monc->mutex);
457 req = __lookup_generic_req(monc, tid);
458 if (req) {
459 *(struct ceph_statfs *)req->buf = reply->st;
460 req->result = 0;
461 get_generic_request(req);
462 }
463 mutex_unlock(&monc->mutex);
464 if (req) {
465 complete_all(&req->completion);
466 put_generic_request(req);
467 }
468 return;
469
470bad:
471 pr_err("corrupt generic reply, no tid\n");
472 ceph_msg_dump(msg);
473}
474
475/*
476 * Do a synchronous statfs().
477 */
478int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
479{
480 struct ceph_mon_generic_request *req;
481 struct ceph_mon_statfs *h;
482 int err;
483
484 req = kzalloc(sizeof(*req), GFP_NOFS);
485 if (!req)
486 return -ENOMEM;
487
488 kref_init(&req->kref);
489 req->buf = buf;
490 init_completion(&req->completion);
491
492 err = -ENOMEM;
493 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
494 if (!req->request)
495 goto out;
496 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
497 if (!req->reply)
498 goto out;
499
500 /* fill out request */
501 h = req->request->front.iov_base;
502 h->monhdr.have_version = 0;
503 h->monhdr.session_mon = cpu_to_le16(-1);
504 h->monhdr.session_mon_tid = 0;
505 h->fsid = monc->monmap->fsid;
506
507 /* register request */
508 mutex_lock(&monc->mutex);
509 req->tid = ++monc->last_tid;
510 req->request->hdr.tid = cpu_to_le64(req->tid);
511 __insert_generic_request(monc, req);
512 monc->num_generic_requests++;
513 mutex_unlock(&monc->mutex);
514
515 /* send request and wait */
516 ceph_con_send(monc->con, ceph_msg_get(req->request));
517 err = wait_for_completion_interruptible(&req->completion);
518
519 mutex_lock(&monc->mutex);
520 rb_erase(&req->node, &monc->generic_request_tree);
521 monc->num_generic_requests--;
522 mutex_unlock(&monc->mutex);
523
524 if (!err)
525 err = req->result;
526
527out:
528 kref_put(&req->kref, release_generic_request);
529 return err;
530}
531
532/*
533 * Resend pending generic requests (e.g., statfs).
534 */
535static void __resend_generic_request(struct ceph_mon_client *monc)
536{
537 struct ceph_mon_generic_request *req;
538 struct rb_node *p;
539
540 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
541 req = rb_entry(p, struct ceph_mon_generic_request, node);
542 ceph_con_revoke(monc->con, req->request);
543 ceph_con_send(monc->con, ceph_msg_get(req->request));
544 }
545}
546
547/*
548 * Delayed work. If we haven't mounted yet, retry. Otherwise,
549 * renew/retry the subscription as needed (in case it is timing out,
550 * or we got an ENOMEM), and keep the monitor connection alive.
551 */
552static void delayed_work(struct work_struct *work)
553{
554 struct ceph_mon_client *monc =
555 container_of(work, struct ceph_mon_client, delayed_work.work);
556
557 dout("monc delayed_work\n");
558 mutex_lock(&monc->mutex);
559 if (monc->hunting) {
560 __close_session(monc);
561 __open_session(monc); /* continue hunting */
562 } else {
563 ceph_con_keepalive(monc->con);
564
565 __validate_auth(monc);
566
567 if (monc->auth->ops->is_authenticated(monc->auth))
568 __send_subscribe(monc);
569 }
570 __schedule_delayed(monc);
571 mutex_unlock(&monc->mutex);
572}
573
574/*
575 * On startup, we build a temporary monmap populated with the IPs
576 * provided by mount(2).
577 */
578static int build_initial_monmap(struct ceph_mon_client *monc)
579{
580 struct ceph_mount_args *args = monc->client->mount_args;
581 struct ceph_entity_addr *mon_addr = args->mon_addr;
582 int num_mon = args->num_mon;
583 int i;
584
585 /* build initial monmap */
586 monc->monmap = kzalloc(sizeof(*monc->monmap) +
587 num_mon*sizeof(monc->monmap->mon_inst[0]),
588 GFP_KERNEL);
589 if (!monc->monmap)
590 return -ENOMEM;
591 for (i = 0; i < num_mon; i++) {
592 monc->monmap->mon_inst[i].addr = mon_addr[i];
593 monc->monmap->mon_inst[i].addr.nonce = 0;
594 monc->monmap->mon_inst[i].name.type =
595 CEPH_ENTITY_TYPE_MON;
596 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
597 }
598 monc->monmap->num_mon = num_mon;
599 monc->have_fsid = false;
600
601 /* release addr memory */
602 kfree(args->mon_addr);
603 args->mon_addr = NULL;
604 args->num_mon = 0;
605 return 0;
606}
607
608int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
609{
610 int err = 0;
611
612 dout("init\n");
613 memset(monc, 0, sizeof(*monc));
614 monc->client = cl;
615 monc->monmap = NULL;
616 mutex_init(&monc->mutex);
617
618 err = build_initial_monmap(monc);
619 if (err)
620 goto out;
621
622 monc->con = NULL;
623
624 /* authentication */
625 monc->auth = ceph_auth_init(cl->mount_args->name,
626 cl->mount_args->secret);
627 if (IS_ERR(monc->auth)) {
628 err = PTR_ERR(monc->auth); goto out_monmap; }
629 monc->auth->want_keys =
630 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
631 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
632
633 /* msgs */
634 err = -ENOMEM;
635 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
636 sizeof(struct ceph_mon_subscribe_ack),
637 GFP_NOFS);
638 if (!monc->m_subscribe_ack)
639 goto out_monmap;
640
641 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
642 if (!monc->m_subscribe)
643 goto out_subscribe_ack;
644
645 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
646 if (!monc->m_auth_reply)
647 goto out_subscribe;
648
649 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
650 monc->pending_auth = 0;
651 if (!monc->m_auth)
652 goto out_auth_reply;
653
654 monc->cur_mon = -1;
655 monc->hunting = true;
656 monc->sub_renew_after = jiffies;
657 monc->sub_sent = 0;
658
659 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
660 monc->generic_request_tree = RB_ROOT;
661 monc->num_generic_requests = 0;
662 monc->last_tid = 0;
663
664 monc->have_mdsmap = 0;
665 monc->have_osdmap = 0;
666 monc->want_next_osdmap = 1;
667 return 0;
668
669out_auth_reply:
670 ceph_msg_put(monc->m_auth_reply);
671out_subscribe:
672 ceph_msg_put(monc->m_subscribe);
673out_subscribe_ack:
674 ceph_msg_put(monc->m_subscribe_ack);
675out_monmap:
676 kfree(monc->monmap);
677out:
678 return err;
679}
680
681void ceph_monc_stop(struct ceph_mon_client *monc)
682{
683 dout("stop\n");
684 cancel_delayed_work_sync(&monc->delayed_work);
685
686 mutex_lock(&monc->mutex);
687 __close_session(monc);
688 if (monc->con) {
689 monc->con->private = NULL;
690 monc->con->ops->put(monc->con);
691 monc->con = NULL;
692 }
693 mutex_unlock(&monc->mutex);
694
695 ceph_auth_destroy(monc->auth);
696
697 ceph_msg_put(monc->m_auth);
698 ceph_msg_put(monc->m_auth_reply);
699 ceph_msg_put(monc->m_subscribe);
700 ceph_msg_put(monc->m_subscribe_ack);
701
702 kfree(monc->monmap);
703}
704
705static void handle_auth_reply(struct ceph_mon_client *monc,
706 struct ceph_msg *msg)
707{
708 int ret;
709 int was_auth = 0;
710
711 mutex_lock(&monc->mutex);
712 if (monc->auth->ops)
713 was_auth = monc->auth->ops->is_authenticated(monc->auth);
714 monc->pending_auth = 0;
715 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
716 msg->front.iov_len,
717 monc->m_auth->front.iov_base,
718 monc->m_auth->front_max);
719 if (ret < 0) {
720 monc->client->auth_err = ret;
721 wake_up_all(&monc->client->auth_wq);
722 } else if (ret > 0) {
723 __send_prepared_auth_request(monc, ret);
724 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
725 dout("authenticated, starting session\n");
726
727 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
728 monc->client->msgr->inst.name.num =
729 cpu_to_le64(monc->auth->global_id);
730
731 __send_subscribe(monc);
732 __resend_generic_request(monc);
733 }
734 mutex_unlock(&monc->mutex);
735}
736
737static int __validate_auth(struct ceph_mon_client *monc)
738{
739 int ret;
740
741 if (monc->pending_auth)
742 return 0;
743
744 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
745 monc->m_auth->front_max);
746 if (ret <= 0)
747 return ret; /* either an error, or no need to authenticate */
748 __send_prepared_auth_request(monc, ret);
749 return 0;
750}
751
752int ceph_monc_validate_auth(struct ceph_mon_client *monc)
753{
754 int ret;
755
756 mutex_lock(&monc->mutex);
757 ret = __validate_auth(monc);
758 mutex_unlock(&monc->mutex);
759 return ret;
760}
761
762/*
763 * handle incoming message
764 */
765static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
766{
767 struct ceph_mon_client *monc = con->private;
768 int type = le16_to_cpu(msg->hdr.type);
769
770 if (!monc)
771 return;
772
773 switch (type) {
774 case CEPH_MSG_AUTH_REPLY:
775 handle_auth_reply(monc, msg);
776 break;
777
778 case CEPH_MSG_MON_SUBSCRIBE_ACK:
779 handle_subscribe_ack(monc, msg);
780 break;
781
782 case CEPH_MSG_STATFS_REPLY:
783 handle_statfs_reply(monc, msg);
784 break;
785
786 case CEPH_MSG_MON_MAP:
787 ceph_monc_handle_map(monc, msg);
788 break;
789
790 case CEPH_MSG_MDS_MAP:
791 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
792 break;
793
794 case CEPH_MSG_OSD_MAP:
795 ceph_osdc_handle_map(&monc->client->osdc, msg);
796 break;
797
798 default:
799 pr_err("received unknown message type %d %s\n", type,
800 ceph_msg_type_name(type));
801 }
802 ceph_msg_put(msg);
803}
804
805/*
806 * Allocate memory for incoming message
807 */
808static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
809 struct ceph_msg_header *hdr,
810 int *skip)
811{
812 struct ceph_mon_client *monc = con->private;
813 int type = le16_to_cpu(hdr->type);
814 int front_len = le32_to_cpu(hdr->front_len);
815 struct ceph_msg *m = NULL;
816
817 *skip = 0;
818
819 switch (type) {
820 case CEPH_MSG_MON_SUBSCRIBE_ACK:
821 m = ceph_msg_get(monc->m_subscribe_ack);
822 break;
823 case CEPH_MSG_STATFS_REPLY:
824 return get_generic_reply(con, hdr, skip);
825 case CEPH_MSG_AUTH_REPLY:
826 m = ceph_msg_get(monc->m_auth_reply);
827 break;
828 case CEPH_MSG_MON_MAP:
829 case CEPH_MSG_MDS_MAP:
830 case CEPH_MSG_OSD_MAP:
831 m = ceph_msg_new(type, front_len, GFP_NOFS);
832 break;
833 }
834
835 if (!m) {
836 pr_info("alloc_msg unknown type %d\n", type);
837 *skip = 1;
838 }
839 return m;
840}
841
842/*
843 * If the monitor connection resets, pick a new monitor and resubmit
844 * any pending requests.
845 */
846static void mon_fault(struct ceph_connection *con)
847{
848 struct ceph_mon_client *monc = con->private;
849
850 if (!monc)
851 return;
852
853 dout("mon_fault\n");
854 mutex_lock(&monc->mutex);
855 if (!con->private)
856 goto out;
857
858 if (monc->con && !monc->hunting)
859 pr_info("mon%d %s session lost, "
860 "hunting for new mon\n", monc->cur_mon,
861 pr_addr(&monc->con->peer_addr.in_addr));
862
863 __close_session(monc);
864 if (!monc->hunting) {
865 /* start hunting */
866 monc->hunting = true;
867 __open_session(monc);
868 } else {
869 /* already hunting, let's wait a bit */
870 __schedule_delayed(monc);
871 }
872out:
873 mutex_unlock(&monc->mutex);
874}
875
876static const struct ceph_connection_operations mon_con_ops = {
877 .get = ceph_con_get,
878 .put = ceph_con_put,
879 .dispatch = dispatch,
880 .fault = mon_fault,
881 .alloc_msg = mon_alloc_msg,
882};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
deleted file mode 100644
index 174d794321d..00000000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,116 +0,0 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/rbtree.h>
7
8#include "messenger.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_generic_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * ceph_mon_generic_request is used for the statfs and poolop requests,
44 * which are handled a bit differently because we need to get data back
45 * to the caller.
46 */
47struct ceph_mon_generic_request {
48 struct kref kref;
49 u64 tid;
50 struct rb_node node;
51 int result;
52 void *buf;
53 struct completion completion;
54 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */
56};
57
58struct ceph_mon_client {
59 struct ceph_client *client;
60 struct ceph_monmap *monmap;
61
62 struct mutex mutex;
63 struct delayed_work delayed_work;
64
65 struct ceph_auth_client *auth;
66 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
67 int pending_auth;
68
69 bool hunting;
70 int cur_mon; /* last monitor we contacted */
71 unsigned long sub_sent, sub_renew_after;
72 struct ceph_connection *con;
73 bool have_fsid;
74
75 /* pending generic requests */
76 struct rb_root generic_request_tree;
77 int num_generic_requests;
78 u64 last_tid;
79
80 /* mds/osd map */
81 int want_next_osdmap; /* 1 = want, 2 = want+asked */
82 u32 have_osdmap, have_mdsmap;
83
84#ifdef CONFIG_DEBUG_FS
85 struct dentry *debugfs_file;
86#endif
87};
88
89extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
90extern int ceph_monmap_contains(struct ceph_monmap *m,
91 struct ceph_entity_addr *addr);
92
93extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
94extern void ceph_monc_stop(struct ceph_mon_client *monc);
95
96/*
97 * The model here is to indicate which map epochs we already have,
98 * and to call in whenever we receive a new map. We will
99 * periodically re-request the map from the monitor cluster until we
100 * get what we want.
101 */
102extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
103extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
104
105extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
106
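/*
 * Illustrative sketch (editor-added, not part of the original header):
 * the intended request/acknowledge cycle for osdmap subscriptions. The
 * function name and epoch variable below are assumptions for
 * illustration only.
 */
static inline void example_osdmap_cycle(struct ceph_mon_client *monc,
					u32 decoded_epoch)
{
	/* ask the monitors for the next osdmap beyond what we have */
	ceph_monc_request_next_osdmap(monc);

	/* ... later, once the new map has arrived and been decoded,
	 * record its epoch so the mon_client stops re-requesting */
	ceph_monc_got_osdmap(monc, decoded_epoch);
}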
107extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
108 struct ceph_statfs *buf);
109
110extern int ceph_monc_open_session(struct ceph_mon_client *monc);
111
112extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
113
114
115
116#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
deleted file mode 100644
index dd65a643813..00000000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{
12 struct ceph_msgpool *pool = arg;
13 void *p;
14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
20
21static void free_fn(void *element, void *arg)
22{
23 ceph_msg_put(element);
24}
25
26int ceph_msgpool_init(struct ceph_msgpool *pool,
27 int front_len, int size, bool blocking, const char *name)
28{
29 pool->front_len = front_len;
30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
31 if (!pool->pool)
32 return -ENOMEM;
33 pool->name = name;
34 return 0;
35}
36
37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
38{
39 mempool_destroy(pool->pool);
40}
41
42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
44{
45 if (front_len > pool->front_len) {
46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
47 pool->name, front_len, pool->front_len);
48 WARN_ON(1);
49
50 /* try to alloc a fresh message */
51 return ceph_msg_new(0, front_len, GFP_NOFS);
52 }
53
54 return mempool_alloc(pool->pool, GFP_NOFS);
55}
56
57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
58{
59 /* reset msg front_len; user may have changed it */
60 msg->front.iov_len = pool->front_len;
61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
62
63 kref_init(&msg->kref); /* retake single ref */
64}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
deleted file mode 100644
index a362605f936..00000000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include <linux/mempool.h>
5#include "messenger.h"
6
7/*
8 * we use memory pools for preallocating messages we may receive, to
9 * avoid unexpected OOM conditions.
10 */
11struct ceph_msgpool {
12 const char *name;
13 mempool_t *pool;
14 int front_len; /* preallocated payload size */
15};
16
17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
18 int front_len, int size, bool blocking,
19 const char *name);
20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
22 int front_len);
23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
24
25#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
deleted file mode 100644
index 892a0298dfd..00000000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * TCP connection banner. It includes a protocol version; adjust it
21 * whenever the wire protocol changes, and try to keep the string
22 * length constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns a negative, zero, or positive value.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
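/*
 * Editor's worked example (not in the original header): the signed
 * subtraction keeps ordering correct across 32-bit wraparound. For
 * a = 5 (just past the wrap) and b = 0xfffffffe (just before it):
 *
 *   ceph_seq_cmp(5, 0xfffffffe) = (__s32)5 - (__s32)0xfffffffe
 *                               = 5 - (-2) = 7 > 0,
 *
 * so a correctly compares "after" b, where a plain unsigned compare
 * would say the opposite.
 */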
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_AUTH 0x20
54
55#define CEPH_ENTITY_TYPE_ANY 0xFF
56
57extern const char *ceph_entity_type_name(int type);
58
59/*
60 * entity_addr -- network address
61 */
62struct ceph_entity_addr {
63 __le32 type;
64 __le32 nonce; /* unique id for process (e.g. pid) */
65 struct sockaddr_storage in_addr;
66} __attribute__ ((packed));
67
68struct ceph_entity_inst {
69 struct ceph_entity_name name;
70 struct ceph_entity_addr addr;
71} __attribute__ ((packed));
72
73
74/* used by message exchange protocol */
75#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
76#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
77#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
78 incoming connection */
79#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
80 with higher cseq */
81#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
82 with higher gseq */
83#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
84#define CEPH_MSGR_TAG_MSG 7 /* message */
85#define CEPH_MSGR_TAG_ACK 8 /* message ack */
86#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
90
91
92/*
93 * connection negotiation
94 */
95struct ceph_msg_connect {
96 __le64 features; /* supported feature bits */
97 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
98 __le32 global_seq; /* count connections initiated by this host */
99 __le32 connect_seq; /* count connections initiated in this session */
100 __le32 protocol_version;
101 __le32 authorizer_protocol;
102 __le32 authorizer_len;
103 __u8 flags; /* CEPH_MSG_CONNECT_* */
104} __attribute__ ((packed));
105
106struct ceph_msg_connect_reply {
107 __u8 tag;
108 __le64 features; /* feature bits for this session */
109 __le32 global_seq;
110 __le32 connect_seq;
111 __le32 protocol_version;
112 __le32 authorizer_len;
113 __u8 flags;
114} __attribute__ ((packed));
115
116#define CEPH_MSG_CONNECT_LOSSY 1 /* messages I send may be safely dropped */
117
118
119/*
120 * message header
121 */
122struct ceph_msg_header_old {
123 __le64 seq; /* message seq# for this session */
124 __le64 tid; /* transaction id */
125 __le16 type; /* message type */
126 __le16 priority; /* priority. higher value == higher priority */
127 __le16 version; /* version of message encoding */
128
129 __le32 front_len; /* bytes in main payload */
130 __le32 middle_len;/* bytes in middle payload */
131 __le32 data_len; /* bytes of data payload */
132 __le16 data_off; /* sender: include full offset;
133 receiver: mask against ~PAGE_MASK */
134
135 struct ceph_entity_inst src, orig_src;
136 __le32 reserved;
137 __le32 crc; /* header crc32c */
138} __attribute__ ((packed));
139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
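
The data_off convention above means the sender records the (low 16 bits of
the) full byte offset, and the receiver keeps only the offset within the
first page; for example, assuming 4 KB pages:

/*
 * Example (PAGE_SIZE == 4096): a payload starting at file offset
 * 0x12345 is sent with data_off = 0x2345 (truncated to the header's
 * 16 bits); the receiver computes the in-page alignment as
 *
 *	data_off & ~PAGE_MASK == 0x345
 */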
157
158#define CEPH_MSG_PRIO_LOW 64
159#define CEPH_MSG_PRIO_DEFAULT 127
160#define CEPH_MSG_PRIO_HIGH 196
161#define CEPH_MSG_PRIO_HIGHEST 255
162
163/*
164 * follows data payload
165 */
166struct ceph_msg_footer {
167 __le32 front_crc, middle_crc, data_crc;
168 __u8 flags;
169} __attribute__ ((packed));
170
171#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
172#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
173
174
175#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
deleted file mode 100644
index e3852234789..00000000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1542 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
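
A worked example of the shortening, assuming a simple (non-striped) layout
with 4 MB objects; the numbers are illustrative:

/*
 * A read of off = 0x3ff000, *plen = 0x2000 crosses the first object
 * boundary at 0x400000.  ceph_calc_file_object_mapping() yields
 * bno = 0, objoff = 0x3ff000, objlen = 0x1000, and *plen is shortened
 * to 0x1000; the caller must issue a second request against object 1
 * for the remaining 0x1000 bytes.
 */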
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return NULL;
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (!msg) {
169 ceph_osdc_put_request(req);
170 return NULL;
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (!msg) {
183 ceph_osdc_put_request(req);
184 return NULL;
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
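
Returning the request with the smallest tid >= the one given lets a caller
walk the tree in tid order while dropping and retaking request_mutex
between steps, as ceph_osdc_sync() does further down; a sketch of the
pattern:

/*
 * Sketch of an ordered scan (caller holds request_mutex at each step):
 *
 *	u64 next_tid = 0;
 *	struct ceph_osd_request *req;
 *
 *	while ((req = __lookup_request_ge(osdc, next_tid)) != NULL) {
 *		next_tid = req->r_tid + 1;
 *		...inspect req; the mutex may be dropped before looping...
 *	}
 */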
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
369 kfree(osd);
370 }
371}
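
get_osd() deliberately uses atomic_inc_not_zero() so that a lookup racing
with the final put_osd() fails cleanly rather than resurrecting an osd that
is being freed; the expected lookup-side pattern is:

/*
 *	osd = __lookup_osd(osdc, o);
 *	if (osd && !get_osd(osd))
 *		osd = NULL;	-- lost the race with put_osd()
 */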
372
373/*
374 * remove an osd from our map
375 */
376static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
377{
378 dout("__remove_osd %p\n", osd);
379 BUG_ON(!list_empty(&osd->o_requests));
380 rb_erase(&osd->o_node, &osdc->osds);
381 list_del_init(&osd->o_osd_lru);
382 ceph_con_close(&osd->o_con);
383 put_osd(osd);
384}
385
386static void __move_osd_to_lru(struct ceph_osd_client *osdc,
387 struct ceph_osd *osd)
388{
389 dout("__move_osd_to_lru %p\n", osd);
390 BUG_ON(!list_empty(&osd->o_osd_lru));
391 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
392 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
393}
394
395static void __remove_osd_from_lru(struct ceph_osd *osd)
396{
397 dout("__remove_osd_from_lru %p\n", osd);
398 if (!list_empty(&osd->o_osd_lru))
399 list_del_init(&osd->o_osd_lru);
400}
401
402static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
403{
404 struct ceph_osd *osd, *nosd;
405
406	dout("remove_old_osds %p\n", osdc);
407 mutex_lock(&osdc->request_mutex);
408 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
409 if (!remove_all && time_before(jiffies, osd->lru_ttl))
410 break;
411 __remove_osd(osdc, osd);
412 }
413 mutex_unlock(&osdc->request_mutex);
414}
415
416/*
417 * reset osd connect
418 */
419static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
420{
421 struct ceph_osd_request *req;
422 int ret = 0;
423
424 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
425 if (list_empty(&osd->o_requests)) {
426 __remove_osd(osdc, osd);
427 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
428 &osd->o_con.peer_addr,
429 sizeof(osd->o_con.peer_addr)) == 0 &&
430 !ceph_con_opened(&osd->o_con)) {
431 dout(" osd addr hasn't changed and connection never opened,"
432 " letting msgr retry");
433		/* touch each r_stamp for handle_timeout()'s benefit */
434 list_for_each_entry(req, &osd->o_requests, r_osd_item)
435 req->r_stamp = jiffies;
436 ret = -EAGAIN;
437 } else {
438 ceph_con_close(&osd->o_con);
439 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
440 osd->o_incarnation++;
441 }
442 return ret;
443}
444
445static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
446{
447 struct rb_node **p = &osdc->osds.rb_node;
448 struct rb_node *parent = NULL;
449 struct ceph_osd *osd = NULL;
450
451 while (*p) {
452 parent = *p;
453 osd = rb_entry(parent, struct ceph_osd, o_node);
454 if (new->o_osd < osd->o_osd)
455 p = &(*p)->rb_left;
456 else if (new->o_osd > osd->o_osd)
457 p = &(*p)->rb_right;
458 else
459 BUG();
460 }
461
462 rb_link_node(&new->o_node, parent, p);
463 rb_insert_color(&new->o_node, &osdc->osds);
464}
465
466static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
467{
468 struct ceph_osd *osd;
469 struct rb_node *n = osdc->osds.rb_node;
470
471 while (n) {
472 osd = rb_entry(n, struct ceph_osd, o_node);
473 if (o < osd->o_osd)
474 n = n->rb_left;
475 else if (o > osd->o_osd)
476 n = n->rb_right;
477 else
478 return osd;
479 }
480 return NULL;
481}
482
483static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
484{
485 schedule_delayed_work(&osdc->timeout_work,
486 osdc->client->mount_args->osd_keepalive_timeout * HZ);
487}
488
489static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
490{
491 cancel_delayed_work(&osdc->timeout_work);
492}
493
494/*
495 * Register request, assign tid. If this is the first request, set up
496 * the timeout event.
497 */
498static void register_request(struct ceph_osd_client *osdc,
499 struct ceph_osd_request *req)
500{
501 mutex_lock(&osdc->request_mutex);
502 req->r_tid = ++osdc->last_tid;
503 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
504 INIT_LIST_HEAD(&req->r_req_lru_item);
505
506 dout("register_request %p tid %lld\n", req, req->r_tid);
507 __insert_request(osdc, req);
508 ceph_osdc_get_request(req);
509 osdc->num_requests++;
510
511 if (osdc->num_requests == 1) {
512 dout(" first request, scheduling timeout\n");
513 __schedule_osd_timeout(osdc);
514 }
515 mutex_unlock(&osdc->request_mutex);
516}
517
518/*
519 * called under osdc->request_mutex
520 */
521static void __unregister_request(struct ceph_osd_client *osdc,
522 struct ceph_osd_request *req)
523{
524 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
525 rb_erase(&req->r_node, &osdc->requests);
526 osdc->num_requests--;
527
528 if (req->r_osd) {
529 /* make sure the original request isn't in flight. */
530 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
531
532 list_del_init(&req->r_osd_item);
533 if (list_empty(&req->r_osd->o_requests))
534 __move_osd_to_lru(osdc, req->r_osd);
535 req->r_osd = NULL;
536 }
537
538 ceph_osdc_put_request(req);
539
540 list_del_init(&req->r_req_lru_item);
541 if (osdc->num_requests == 0) {
542 dout(" no requests, canceling timeout\n");
543 __cancel_osd_timeout(osdc);
544 }
545}
546
547/*
548 * Cancel a previously queued request message
549 */
550static void __cancel_request(struct ceph_osd_request *req)
551{
552 if (req->r_sent) {
553 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
554 req->r_sent = 0;
555 }
556 list_del_init(&req->r_req_lru_item);
557}
558
559/*
560 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
561 * (as needed), and set the request r_osd appropriately. If there is
562 * no up osd, set r_osd to NULL.
563 *
564 * Return 0 if unchanged, 1 if changed, or negative on error.
565 *
566 * Caller should hold map_sem for read and request_mutex.
567 */
568static int __map_osds(struct ceph_osd_client *osdc,
569 struct ceph_osd_request *req)
570{
571 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
572 struct ceph_pg pgid;
573 int acting[CEPH_PG_MAX_SIZE];
574 int o = -1, num = 0;
575 int err;
576
577 dout("map_osds %p tid %lld\n", req, req->r_tid);
578 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
579 &req->r_file_layout, osdc->osdmap);
580 if (err)
581 return err;
582 pgid = reqhead->layout.ol_pgid;
583 req->r_pgid = pgid;
584
585 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
586 if (err > 0) {
587 o = acting[0];
588 num = err;
589 }
590
591 if ((req->r_osd && req->r_osd->o_osd == o &&
592 req->r_sent >= req->r_osd->o_incarnation &&
593 req->r_num_pg_osds == num &&
594 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
595 (req->r_osd == NULL && o == -1))
596 return 0; /* no change */
597
598 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
599 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
600 req->r_osd ? req->r_osd->o_osd : -1);
601
602 /* record full pg acting set */
603 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
604 req->r_num_pg_osds = num;
605
606 if (req->r_osd) {
607 __cancel_request(req);
608 list_del_init(&req->r_osd_item);
609 req->r_osd = NULL;
610 }
611
612 req->r_osd = __lookup_osd(osdc, o);
613 if (!req->r_osd && o >= 0) {
614 err = -ENOMEM;
615 req->r_osd = create_osd(osdc);
616 if (!req->r_osd)
617 goto out;
618
619 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
620 req->r_osd->o_osd = o;
621 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
622 __insert_osd(osdc, req->r_osd);
623
624 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
625 }
626
627 if (req->r_osd) {
628 __remove_osd_from_lru(req->r_osd);
629 list_add(&req->r_osd_item, &req->r_osd->o_requests);
630 }
631 err = 1; /* osd or pg changed */
632
633out:
634 return err;
635}
636
637/*
638 * caller should hold map_sem (for read) and request_mutex
639 */
640static int __send_request(struct ceph_osd_client *osdc,
641 struct ceph_osd_request *req)
642{
643 struct ceph_osd_request_head *reqhead;
644 int err;
645
646 err = __map_osds(osdc, req);
647 if (err < 0)
648 return err;
649 if (req->r_osd == NULL) {
650 dout("send_request %p no up osds in pg\n", req);
651 ceph_monc_request_next_osdmap(&osdc->client->monc);
652 return 0;
653 }
654
655 dout("send_request %p tid %llu to osd%d flags %d\n",
656 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
657
658 reqhead = req->r_request->front.iov_base;
659 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
660 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
661 reqhead->reassert_version = req->r_reassert_version;
662
663 req->r_stamp = jiffies;
664	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
665
666 ceph_msg_get(req->r_request); /* send consumes a ref */
667 ceph_con_send(&req->r_osd->o_con, req->r_request);
668 req->r_sent = req->r_osd->o_incarnation;
669 return 0;
670}
671
672/*
673 * Timeout callback, called every N seconds when 1 or more osd
674 * requests have been active for more than N seconds. When this
675 * happens, we ping all OSDs with requests who have timed out to
676 * ensure any communications channel reset is detected. Reset the
677 * request timeouts another N seconds in the future as we go.
678 * Reschedule the timeout event another N seconds in the future (unless
679 * there are no open requests).
680 */
681static void handle_timeout(struct work_struct *work)
682{
683 struct ceph_osd_client *osdc =
684 container_of(work, struct ceph_osd_client, timeout_work.work);
685 struct ceph_osd_request *req, *last_req = NULL;
686 struct ceph_osd *osd;
687 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
688 unsigned long keepalive =
689 osdc->client->mount_args->osd_keepalive_timeout * HZ;
690 unsigned long last_stamp = 0;
691 struct rb_node *p;
692 struct list_head slow_osds;
693
694 dout("timeout\n");
695 down_read(&osdc->map_sem);
696
697 ceph_monc_request_next_osdmap(&osdc->client->monc);
698
699 mutex_lock(&osdc->request_mutex);
700 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
701 req = rb_entry(p, struct ceph_osd_request, r_node);
702
703 if (req->r_resend) {
704 int err;
705
706 dout("osdc resending prev failed %lld\n", req->r_tid);
707 err = __send_request(osdc, req);
708 if (err)
709 dout("osdc failed again on %lld\n", req->r_tid);
710 else
711 req->r_resend = false;
712 continue;
713 }
714 }
715
716 /*
717 * reset osds that appear to be _really_ unresponsive. this
718 * is a failsafe measure; we really shouldn't be getting to
719 * this point if the system is working properly. the monitors
720 * should mark the osd as failed and we should find out about
721 * it from an updated osd map.
722 */
723 while (timeout && !list_empty(&osdc->req_lru)) {
724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
725 r_req_lru_item);
726
727 if (time_before(jiffies, req->r_stamp + timeout))
728 break;
729
730 BUG_ON(req == last_req && req->r_stamp == last_stamp);
731 last_req = req;
732 last_stamp = req->r_stamp;
733
734 osd = req->r_osd;
735 BUG_ON(!osd);
736 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
737 req->r_tid, osd->o_osd);
738 __kick_requests(osdc, osd);
739 }
740
741 /*
742 * ping osds that are a bit slow. this ensures that if there
743 * is a break in the TCP connection we will notice, and reopen
744 * a connection with that osd (from the fault callback).
745 */
746 INIT_LIST_HEAD(&slow_osds);
747 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
748 if (time_before(jiffies, req->r_stamp + keepalive))
749 break;
750
751 osd = req->r_osd;
752 BUG_ON(!osd);
753 dout(" tid %llu is slow, will send keepalive on osd%d\n",
754 req->r_tid, osd->o_osd);
755 list_move_tail(&osd->o_keepalive_item, &slow_osds);
756 }
757 while (!list_empty(&slow_osds)) {
758 osd = list_entry(slow_osds.next, struct ceph_osd,
759 o_keepalive_item);
760 list_del_init(&osd->o_keepalive_item);
761 ceph_con_keepalive(&osd->o_con);
762 }
763
764 __schedule_osd_timeout(osdc);
765 mutex_unlock(&osdc->request_mutex);
766
767 up_read(&osdc->map_sem);
768}
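
A worked example of the two thresholds, using hypothetical mount options:

/*
 * With osd_keepalive_timeout = 5 and osd_timeout = 60, a request whose
 * r_stamp is 10s old gets a keepalive sent on its osd's connection;
 * one 70s old trips the failsafe above and its osd is reset via
 * __kick_requests().
 */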
769
770static void handle_osds_timeout(struct work_struct *work)
771{
772 struct ceph_osd_client *osdc =
773 container_of(work, struct ceph_osd_client,
774 osds_timeout_work.work);
775 unsigned long delay =
776 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
777
778 dout("osds timeout\n");
779 down_read(&osdc->map_sem);
780 remove_old_osds(osdc, 0);
781 up_read(&osdc->map_sem);
782
783 schedule_delayed_work(&osdc->osds_timeout_work,
784 round_jiffies_relative(delay));
785}
786
787/*
788 * handle osd op reply. either call the callback if it is specified,
789 * or do the completion to wake up the waiting thread.
790 */
791static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
792 struct ceph_connection *con)
793{
794 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
795 struct ceph_osd_request *req;
796 u64 tid;
797 int numops, object_len, flags;
798 s32 result;
799
800 tid = le64_to_cpu(msg->hdr.tid);
801 if (msg->front.iov_len < sizeof(*rhead))
802 goto bad;
803 numops = le32_to_cpu(rhead->num_ops);
804 object_len = le32_to_cpu(rhead->object_len);
805 result = le32_to_cpu(rhead->result);
806 if (msg->front.iov_len != sizeof(*rhead) + object_len +
807 numops * sizeof(struct ceph_osd_op))
808 goto bad;
809 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
810
811 /* lookup */
812 mutex_lock(&osdc->request_mutex);
813 req = __lookup_request(osdc, tid);
814 if (req == NULL) {
815 dout("handle_reply tid %llu dne\n", tid);
816 mutex_unlock(&osdc->request_mutex);
817 return;
818 }
819 ceph_osdc_get_request(req);
820 flags = le32_to_cpu(rhead->flags);
821
822 /*
823 * if this connection filled our message, drop our reference now, to
824 * avoid a (safe but slower) revoke later.
825 */
826 if (req->r_con_filling_msg == con && req->r_reply == msg) {
827 dout(" dropping con_filling_msg ref %p\n", con);
828 req->r_con_filling_msg = NULL;
829 ceph_con_put(con);
830 }
831
832 if (!req->r_got_reply) {
833 unsigned bytes;
834
835 req->r_result = le32_to_cpu(rhead->result);
836 bytes = le32_to_cpu(msg->hdr.data_len);
837 dout("handle_reply result %d bytes %d\n", req->r_result,
838 bytes);
839 if (req->r_result == 0)
840 req->r_result = bytes;
841
842 /* in case this is a write and we need to replay, */
843 req->r_reassert_version = rhead->reassert_version;
844
845 req->r_got_reply = 1;
846 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
847 dout("handle_reply tid %llu dup ack\n", tid);
848 mutex_unlock(&osdc->request_mutex);
849 goto done;
850 }
851
852 dout("handle_reply tid %llu flags %d\n", tid, flags);
853
854 /* either this is a read, or we got the safe response */
855 if (result < 0 ||
856 (flags & CEPH_OSD_FLAG_ONDISK) ||
857 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
858 __unregister_request(osdc, req);
859
860 mutex_unlock(&osdc->request_mutex);
861
862 if (req->r_callback)
863 req->r_callback(req, msg);
864 else
865 complete_all(&req->r_completion);
866
867 if (flags & CEPH_OSD_FLAG_ONDISK) {
868 if (req->r_safe_callback)
869 req->r_safe_callback(req, msg);
870 complete_all(&req->r_safe_completion); /* fsync waiter */
871 }
872
873done:
874 ceph_osdc_put_request(req);
875 return;
876
877bad:
878 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
879 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
880 (int)sizeof(*rhead));
881 ceph_msg_dump(msg);
882}
883
884
885static int __kick_requests(struct ceph_osd_client *osdc,
886 struct ceph_osd *kickosd)
887{
888 struct ceph_osd_request *req;
889 struct rb_node *p, *n;
890 int needmap = 0;
891 int err;
892
893 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
894 if (kickosd) {
895 err = __reset_osd(osdc, kickosd);
896 if (err == -EAGAIN)
897 return 1;
898 } else {
899 for (p = rb_first(&osdc->osds); p; p = n) {
900 struct ceph_osd *osd =
901 rb_entry(p, struct ceph_osd, o_node);
902
903 n = rb_next(p);
904 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
905 memcmp(&osd->o_con.peer_addr,
906 ceph_osd_addr(osdc->osdmap,
907 osd->o_osd),
908 sizeof(struct ceph_entity_addr)) != 0)
909 __reset_osd(osdc, osd);
910 }
911 }
912
913 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
914 req = rb_entry(p, struct ceph_osd_request, r_node);
915
916 if (req->r_resend) {
917 dout(" r_resend set on tid %llu\n", req->r_tid);
918 __cancel_request(req);
919 goto kick;
920 }
921 if (req->r_osd && kickosd == req->r_osd) {
922 __cancel_request(req);
923 goto kick;
924 }
925
926 err = __map_osds(osdc, req);
927 if (err == 0)
928 continue; /* no change */
929 if (err < 0) {
930 /*
931 * FIXME: really, we should set the request
932 * error and fail if this isn't a 'nofail'
933 * request, but that's a fair bit more
934 * complicated to do. So retry!
935 */
936 dout(" setting r_resend on %llu\n", req->r_tid);
937 req->r_resend = true;
938 continue;
939 }
940 if (req->r_osd == NULL) {
941 dout("tid %llu maps to no valid osd\n", req->r_tid);
942 needmap++; /* request a newer map */
943 continue;
944 }
945
946kick:
947 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
948 req->r_osd ? req->r_osd->o_osd : -1);
949 req->r_flags |= CEPH_OSD_FLAG_RETRY;
950 err = __send_request(osdc, req);
951 if (err) {
952 dout(" setting r_resend on %llu\n", req->r_tid);
953 req->r_resend = true;
954 }
955 }
956
957 return needmap;
958}
959
960/*
961 * Resubmit osd requests whose osd or osd address has changed. Request
962 * a new osd map if osds are down, or we are otherwise unable to determine
963 * how to direct a request.
964 *
965 * Close connections to down osds.
966 *
967 * If @who is specified, resubmit requests for that specific osd.
968 *
969 * Caller should hold map_sem for read and request_mutex.
970 */
971static void kick_requests(struct ceph_osd_client *osdc,
972 struct ceph_osd *kickosd)
973{
974 int needmap;
975
976 mutex_lock(&osdc->request_mutex);
977 needmap = __kick_requests(osdc, kickosd);
978 mutex_unlock(&osdc->request_mutex);
979
980 if (needmap) {
981 dout("%d requests for down osds, need new map\n", needmap);
982 ceph_monc_request_next_osdmap(&osdc->client->monc);
983 }
984}
985
986/*
987 * Process updated osd map.
988 *
989 * The message contains any number of incremental and full maps, normally
990 * indicating some sort of topology change in the cluster. Kick requests
991 * off to different OSDs as needed.
992 */
993void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
994{
995 void *p, *end, *next;
996 u32 nr_maps, maplen;
997 u32 epoch;
998 struct ceph_osdmap *newmap = NULL, *oldmap;
999 int err;
1000 struct ceph_fsid fsid;
1001
1002 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1003 p = msg->front.iov_base;
1004 end = p + msg->front.iov_len;
1005
1006 /* verify fsid */
1007 ceph_decode_need(&p, end, sizeof(fsid), bad);
1008 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1009 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1010 return;
1011
1012 down_write(&osdc->map_sem);
1013
1014 /* incremental maps */
1015 ceph_decode_32_safe(&p, end, nr_maps, bad);
1016 dout(" %d inc maps\n", nr_maps);
1017 while (nr_maps > 0) {
1018 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1019 epoch = ceph_decode_32(&p);
1020 maplen = ceph_decode_32(&p);
1021 ceph_decode_need(&p, end, maplen, bad);
1022 next = p + maplen;
1023 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1024 dout("applying incremental map %u len %d\n",
1025 epoch, maplen);
1026 newmap = osdmap_apply_incremental(&p, next,
1027 osdc->osdmap,
1028 osdc->client->msgr);
1029 if (IS_ERR(newmap)) {
1030 err = PTR_ERR(newmap);
1031 goto bad;
1032 }
1033 BUG_ON(!newmap);
1034 if (newmap != osdc->osdmap) {
1035 ceph_osdmap_destroy(osdc->osdmap);
1036 osdc->osdmap = newmap;
1037 }
1038 } else {
1039 dout("ignoring incremental map %u len %d\n",
1040 epoch, maplen);
1041 }
1042 p = next;
1043 nr_maps--;
1044 }
1045 if (newmap)
1046 goto done;
1047
1048 /* full maps */
1049 ceph_decode_32_safe(&p, end, nr_maps, bad);
1050 dout(" %d full maps\n", nr_maps);
1051 while (nr_maps) {
1052 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1053 epoch = ceph_decode_32(&p);
1054 maplen = ceph_decode_32(&p);
1055 ceph_decode_need(&p, end, maplen, bad);
1056 if (nr_maps > 1) {
1057 dout("skipping non-latest full map %u len %d\n",
1058 epoch, maplen);
1059 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1060 dout("skipping full map %u len %d, "
1061 "older than our %u\n", epoch, maplen,
1062 osdc->osdmap->epoch);
1063 } else {
1064 dout("taking full map %u len %d\n", epoch, maplen);
1065 newmap = osdmap_decode(&p, p+maplen);
1066 if (IS_ERR(newmap)) {
1067 err = PTR_ERR(newmap);
1068 goto bad;
1069 }
1070 BUG_ON(!newmap);
1071 oldmap = osdc->osdmap;
1072 osdc->osdmap = newmap;
1073 if (oldmap)
1074 ceph_osdmap_destroy(oldmap);
1075 }
1076 p += maplen;
1077 nr_maps--;
1078 }
1079
1080done:
1081 downgrade_write(&osdc->map_sem);
1082 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1083 if (newmap)
1084 kick_requests(osdc, NULL);
1085 up_read(&osdc->map_sem);
1086 wake_up_all(&osdc->client->auth_wq);
1087 return;
1088
1089bad:
1090 pr_err("osdc handle_map corrupt msg\n");
1091 ceph_msg_dump(msg);
1092 up_write(&osdc->map_sem);
1093 return;
1094}
1095
1096/*
1097 * Register request, send initial attempt.
1098 */
1099int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1100 struct ceph_osd_request *req,
1101 bool nofail)
1102{
1103 int rc = 0;
1104
1105 req->r_request->pages = req->r_pages;
1106 req->r_request->nr_pages = req->r_num_pages;
1107
1108 register_request(osdc, req);
1109
1110 down_read(&osdc->map_sem);
1111 mutex_lock(&osdc->request_mutex);
1112 /*
1113 * a racing kick_requests() may have sent the message for us
1114 * while we dropped request_mutex above, so only send now if
1115	 * the request still hasn't been touched yet.
1116 */
1117 if (req->r_sent == 0) {
1118 rc = __send_request(osdc, req);
1119 if (rc) {
1120 if (nofail) {
1121 dout("osdc_start_request failed send, "
1122 " marking %lld\n", req->r_tid);
1123 req->r_resend = true;
1124 rc = 0;
1125 } else {
1126 __unregister_request(osdc, req);
1127 }
1128 }
1129 }
1130 mutex_unlock(&osdc->request_mutex);
1131 up_read(&osdc->map_sem);
1132 return rc;
1133}
1134
1135/*
1136 * wait for a request to complete
1137 */
1138int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1139 struct ceph_osd_request *req)
1140{
1141 int rc;
1142
1143 rc = wait_for_completion_interruptible(&req->r_completion);
1144 if (rc < 0) {
1145 mutex_lock(&osdc->request_mutex);
1146 __cancel_request(req);
1147 __unregister_request(osdc, req);
1148 mutex_unlock(&osdc->request_mutex);
1149 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1150 return rc;
1151 }
1152
1153 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1154 return req->r_result;
1155}
1156
1157/*
1158 * sync - wait for all in-flight requests to flush. avoid starvation.
1159 */
1160void ceph_osdc_sync(struct ceph_osd_client *osdc)
1161{
1162 struct ceph_osd_request *req;
1163 u64 last_tid, next_tid = 0;
1164
1165 mutex_lock(&osdc->request_mutex);
1166 last_tid = osdc->last_tid;
1167 while (1) {
1168 req = __lookup_request_ge(osdc, next_tid);
1169 if (!req)
1170 break;
1171 if (req->r_tid > last_tid)
1172 break;
1173
1174 next_tid = req->r_tid + 1;
1175 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1176 continue;
1177
1178 ceph_osdc_get_request(req);
1179 mutex_unlock(&osdc->request_mutex);
1180 dout("sync waiting on tid %llu (last is %llu)\n",
1181 req->r_tid, last_tid);
1182 wait_for_completion(&req->r_safe_completion);
1183 mutex_lock(&osdc->request_mutex);
1184 ceph_osdc_put_request(req);
1185 }
1186 mutex_unlock(&osdc->request_mutex);
1187 dout("sync done (thru tid %llu)\n", last_tid);
1188}
1189
1190/*
1191 * init, shutdown
1192 */
1193int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1194{
1195 int err;
1196
1197 dout("init\n");
1198 osdc->client = client;
1199 osdc->osdmap = NULL;
1200 init_rwsem(&osdc->map_sem);
1201 init_completion(&osdc->map_waiters);
1202 osdc->last_requested_map = 0;
1203 mutex_init(&osdc->request_mutex);
1204 osdc->last_tid = 0;
1205 osdc->osds = RB_ROOT;
1206 INIT_LIST_HEAD(&osdc->osd_lru);
1207 osdc->requests = RB_ROOT;
1208 INIT_LIST_HEAD(&osdc->req_lru);
1209 osdc->num_requests = 0;
1210 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1211 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1212
1213 schedule_delayed_work(&osdc->osds_timeout_work,
1214 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1215
1216 err = -ENOMEM;
1217 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1218 sizeof(struct ceph_osd_request));
1219 if (!osdc->req_mempool)
1220 goto out;
1221
1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1224 if (err < 0)
1225 goto out_mempool;
1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1229 if (err < 0)
1230 goto out_msgpool;
1231 return 0;
1232
1233out_msgpool:
1234 ceph_msgpool_destroy(&osdc->msgpool_op);
1235out_mempool:
1236 mempool_destroy(osdc->req_mempool);
1237out:
1238 return err;
1239}
1240
1241void ceph_osdc_stop(struct ceph_osd_client *osdc)
1242{
1243 cancel_delayed_work_sync(&osdc->timeout_work);
1244 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1245 if (osdc->osdmap) {
1246 ceph_osdmap_destroy(osdc->osdmap);
1247 osdc->osdmap = NULL;
1248 }
1249 remove_old_osds(osdc, 1);
1250 mempool_destroy(osdc->req_mempool);
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1253}
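
Init and stop bracket the osd client's lifetime; a hypothetical caller
pairs them like this (error handling abbreviated):

/* Sketch of the expected lifecycle: */
static int example_mount(struct ceph_client *client,
			 struct ceph_osd_client *osdc)
{
	int err = ceph_osdc_init(osdc, client);

	if (err < 0)
		return err;
	/* ... issue I/O with ceph_osdc_start_request() ... */
	ceph_osdc_sync(osdc);	/* wait for in-flight writes to commit */
	ceph_osdc_stop(osdc);
	return 0;
}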
1254
1255/*
1256 * Read some contiguous pages. If we cross a stripe boundary, shorten
1257 * *plen. Return number of bytes read, or error.
1258 */
1259int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1260 struct ceph_vino vino, struct ceph_file_layout *layout,
1261 u64 off, u64 *plen,
1262 u32 truncate_seq, u64 truncate_size,
1263 struct page **pages, int num_pages)
1264{
1265 struct ceph_osd_request *req;
1266 int rc = 0;
1267
1268 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1269 vino.snap, off, *plen);
1270 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1272 NULL, 0, truncate_seq, truncate_size, NULL,
1273 false, 1);
1274 if (!req)
1275 return -ENOMEM;
1276
1277 /* it may be a short read due to an object boundary */
1278 req->r_pages = pages;
1279 num_pages = calc_pages_for(off, *plen);
1280 req->r_num_pages = num_pages;
1281
1282 dout("readpages final extent is %llu~%llu (%d pages)\n",
1283 off, *plen, req->r_num_pages);
1284
1285 rc = ceph_osdc_start_request(osdc, req, false);
1286 if (!rc)
1287 rc = ceph_osdc_wait_request(osdc, req);
1288
1289 ceph_osdc_put_request(req);
1290 dout("readpages result %d\n", rc);
1291 return rc;
1292}
1293
1294/*
1295 * do a synchronous write on N pages
1296 */
1297int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1298 struct ceph_file_layout *layout,
1299 struct ceph_snap_context *snapc,
1300 u64 off, u64 len,
1301 u32 truncate_seq, u64 truncate_size,
1302 struct timespec *mtime,
1303 struct page **pages, int num_pages,
1304 int flags, int do_sync, bool nofail)
1305{
1306 struct ceph_osd_request *req;
1307 int rc = 0;
1308
1309 BUG_ON(vino.snap != CEPH_NOSNAP);
1310 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1311 CEPH_OSD_OP_WRITE,
1312 flags | CEPH_OSD_FLAG_ONDISK |
1313 CEPH_OSD_FLAG_WRITE,
1314 snapc, do_sync,
1315 truncate_seq, truncate_size, mtime,
1316 nofail, 1);
1317 if (!req)
1318 return -ENOMEM;
1319
1320 /* it may be a short write due to an object boundary */
1321 req->r_pages = pages;
1322 req->r_num_pages = calc_pages_for(off, len);
1323 dout("writepages %llu~%llu (%d pages)\n", off, len,
1324 req->r_num_pages);
1325
1326 rc = ceph_osdc_start_request(osdc, req, nofail);
1327 if (!rc)
1328 rc = ceph_osdc_wait_request(osdc, req);
1329
1330 ceph_osdc_put_request(req);
1331 if (rc == 0)
1332 rc = len;
1333 dout("writepages result %d\n", rc);
1334 return rc;
1335}
1336
1337/*
1338 * handle incoming message
1339 */
1340static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1341{
1342 struct ceph_osd *osd = con->private;
1343 struct ceph_osd_client *osdc;
1344 int type = le16_to_cpu(msg->hdr.type);
1345
1346 if (!osd)
1347 goto out;
1348 osdc = osd->o_osdc;
1349
1350 switch (type) {
1351 case CEPH_MSG_OSD_MAP:
1352 ceph_osdc_handle_map(osdc, msg);
1353 break;
1354 case CEPH_MSG_OSD_OPREPLY:
1355 handle_reply(osdc, msg, con);
1356 break;
1357
1358 default:
1359 pr_err("received unknown message type %d %s\n", type,
1360 ceph_msg_type_name(type));
1361 }
1362out:
1363 ceph_msg_put(msg);
1364}
1365
1366/*
1367 * lookup and return message for incoming reply. set up reply message
1368 * pages.
1369 */
1370static struct ceph_msg *get_reply(struct ceph_connection *con,
1371 struct ceph_msg_header *hdr,
1372 int *skip)
1373{
1374 struct ceph_osd *osd = con->private;
1375 struct ceph_osd_client *osdc = osd->o_osdc;
1376 struct ceph_msg *m;
1377 struct ceph_osd_request *req;
1378 int front = le32_to_cpu(hdr->front_len);
1379 int data_len = le32_to_cpu(hdr->data_len);
1380 u64 tid;
1381
1382 tid = le64_to_cpu(hdr->tid);
1383 mutex_lock(&osdc->request_mutex);
1384 req = __lookup_request(osdc, tid);
1385 if (!req) {
1386 *skip = 1;
1387 m = NULL;
1388 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1389 osd->o_osd);
1390 goto out;
1391 }
1392
1393 if (req->r_con_filling_msg) {
1394 dout("get_reply revoking msg %p from old con %p\n",
1395 req->r_reply, req->r_con_filling_msg);
1396 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1397 ceph_con_put(req->r_con_filling_msg);
1398 req->r_con_filling_msg = NULL;
1399 }
1400
1401 if (front > req->r_reply->front.iov_len) {
1402 pr_warning("get_reply front %d > preallocated %d\n",
1403 front, (int)req->r_reply->front.iov_len);
1404 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1405 if (!m)
1406 goto out;
1407 ceph_msg_put(req->r_reply);
1408 req->r_reply = m;
1409 }
1410 m = ceph_msg_get(req->r_reply);
1411
1412 if (data_len > 0) {
1413 unsigned data_off = le16_to_cpu(hdr->data_off);
1414 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1415
1416 if (unlikely(req->r_num_pages < want)) {
1417 pr_warning("tid %lld reply %d > expected %d pages\n",
1418				   tid, want, req->r_num_pages);
1419 *skip = 1;
1420 ceph_msg_put(m);
1421 m = NULL;
1422 goto out;
1423 }
1424 m->pages = req->r_pages;
1425 m->nr_pages = req->r_num_pages;
1426 }
1427 *skip = 0;
1428 req->r_con_filling_msg = ceph_con_get(con);
1429 dout("get_reply tid %lld %p\n", tid, m);
1430
1431out:
1432 mutex_unlock(&osdc->request_mutex);
1433 return m;
1434
1435}
1436
1437static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1438 struct ceph_msg_header *hdr,
1439 int *skip)
1440{
1441 struct ceph_osd *osd = con->private;
1442 int type = le16_to_cpu(hdr->type);
1443 int front = le32_to_cpu(hdr->front_len);
1444
1445 switch (type) {
1446 case CEPH_MSG_OSD_MAP:
1447 return ceph_msg_new(type, front, GFP_NOFS);
1448 case CEPH_MSG_OSD_OPREPLY:
1449 return get_reply(con, hdr, skip);
1450 default:
1451 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1452 osd->o_osd);
1453 *skip = 1;
1454 return NULL;
1455 }
1456}
1457
1458/*
1459 * Wrappers to refcount containing ceph_osd struct
1460 */
1461static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1462{
1463 struct ceph_osd *osd = con->private;
1464 if (get_osd(osd))
1465 return con;
1466 return NULL;
1467}
1468
1469static void put_osd_con(struct ceph_connection *con)
1470{
1471 struct ceph_osd *osd = con->private;
1472 put_osd(osd);
1473}
1474
1475/*
1476 * authentication
1477 */
1478static int get_authorizer(struct ceph_connection *con,
1479 void **buf, int *len, int *proto,
1480 void **reply_buf, int *reply_len, int force_new)
1481{
1482 struct ceph_osd *o = con->private;
1483 struct ceph_osd_client *osdc = o->o_osdc;
1484 struct ceph_auth_client *ac = osdc->client->monc.auth;
1485 int ret = 0;
1486
1487 if (force_new && o->o_authorizer) {
1488 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1489 o->o_authorizer = NULL;
1490 }
1491 if (o->o_authorizer == NULL) {
1492 ret = ac->ops->create_authorizer(
1493 ac, CEPH_ENTITY_TYPE_OSD,
1494 &o->o_authorizer,
1495 &o->o_authorizer_buf,
1496 &o->o_authorizer_buf_len,
1497 &o->o_authorizer_reply_buf,
1498 &o->o_authorizer_reply_buf_len);
1499 if (ret)
1500 return ret;
1501 }
1502
1503 *proto = ac->protocol;
1504 *buf = o->o_authorizer_buf;
1505 *len = o->o_authorizer_buf_len;
1506 *reply_buf = o->o_authorizer_reply_buf;
1507 *reply_len = o->o_authorizer_reply_buf_len;
1508 return 0;
1509}
1510
1511
1512static int verify_authorizer_reply(struct ceph_connection *con, int len)
1513{
1514 struct ceph_osd *o = con->private;
1515 struct ceph_osd_client *osdc = o->o_osdc;
1516 struct ceph_auth_client *ac = osdc->client->monc.auth;
1517
1518 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1519}
1520
1521static int invalidate_authorizer(struct ceph_connection *con)
1522{
1523 struct ceph_osd *o = con->private;
1524 struct ceph_osd_client *osdc = o->o_osdc;
1525 struct ceph_auth_client *ac = osdc->client->monc.auth;
1526
1527 if (ac->ops->invalidate_authorizer)
1528 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1529
1530 return ceph_monc_validate_auth(&osdc->client->monc);
1531}
1532
1533static const struct ceph_connection_operations osd_con_ops = {
1534 .get = get_osd_con,
1535 .put = put_osd_con,
1536 .dispatch = dispatch,
1537 .get_authorizer = get_authorizer,
1538 .verify_authorizer_reply = verify_authorizer_reply,
1539 .invalidate_authorizer = invalidate_authorizer,
1540 .alloc_msg = alloc_msg,
1541 .fault = osd_reset,
1542};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
deleted file mode 100644
index ce776989ef6..00000000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,167 +0,0 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
53
54 struct ceph_connection *r_con_filling_msg;
55
56 struct ceph_msg *r_request, *r_reply;
57 int r_result;
58 int r_flags; /* any additional flags for the osd */
59 u32 r_sent; /* >0 if r_request is sending/sent */
60 int r_got_reply;
61
62 struct ceph_osd_client *r_osdc;
63 struct kref r_kref;
64 bool r_mempool;
65 struct completion r_completion, r_safe_completion;
66 ceph_osdc_callback_t r_callback, r_safe_callback;
67 struct ceph_eversion r_reassert_version;
68 struct list_head r_unsafe_item;
69
70 struct inode *r_inode; /* for use by callbacks */
71
72 char r_oid[40]; /* object name */
73 int r_oid_len;
74 unsigned long r_stamp; /* send OR check time */
75 bool r_resend; /* msg send failed, needs retry */
76
77 struct ceph_file_layout r_file_layout;
78 struct ceph_snap_context *r_snapc; /* snap context for writes */
79 unsigned r_num_pages; /* size of page array (follows) */
80 struct page **r_pages; /* pages for data payload */
81 int r_pages_from_pool;
82 int r_own_pages; /* if true, i own page list */
83};
84
85struct ceph_osd_client {
86 struct ceph_client *client;
87
88 struct ceph_osdmap *osdmap; /* current map */
89 struct rw_semaphore map_sem;
90 struct completion map_waiters;
91 u64 last_requested_map;
92
93 struct mutex request_mutex;
94 struct rb_root osds; /* osds */
95 struct list_head osd_lru; /* idle osds */
96 u64 timeout_tid; /* tid of timeout triggering rq */
97 u64 last_tid; /* tid of last request */
98 struct rb_root requests; /* pending requests */
99 struct list_head req_lru; /* pending requests lru */
100 int num_requests;
101 struct delayed_work timeout_work;
102 struct delayed_work osds_timeout_work;
103#ifdef CONFIG_DEBUG_FS
104 struct dentry *debugfs_file;
105#endif
106
107 mempool_t *req_mempool;
108
109 struct ceph_msgpool msgpool_op;
110 struct ceph_msgpool msgpool_op_reply;
111};
112
113extern int ceph_osdc_init(struct ceph_osd_client *osdc,
114 struct ceph_client *client);
115extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
116
117extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
118 struct ceph_msg *msg);
119extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
120 struct ceph_msg *msg);
121
122extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 offset, u64 *len, int op, int flags,
126 struct ceph_snap_context *snapc,
127 int do_sync, u32 truncate_seq,
128 u64 truncate_size,
129 struct timespec *mtime,
130 bool use_mempool, int num_reply);
131
132static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
133{
134 kref_get(&req->r_kref);
135}
136extern void ceph_osdc_release_request(struct kref *kref);
137static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
138{
139 kref_put(&req->r_kref, ceph_osdc_release_request);
140}
141
142extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
143 struct ceph_osd_request *req,
144 bool nofail);
145extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
146 struct ceph_osd_request *req);
147extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
148
149extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
150 struct ceph_vino vino,
151 struct ceph_file_layout *layout,
152 u64 off, u64 *plen,
153 u32 truncate_seq, u64 truncate_size,
154 struct page **pages, int nr_pages);
155
156extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
157 struct ceph_vino vino,
158 struct ceph_file_layout *layout,
159 struct ceph_snap_context *sc,
160 u64 off, u64 len,
161 u32 truncate_seq, u64 truncate_size,
162 struct timespec *mtime,
163 struct page **pages, int nr_pages,
164 int flags, int do_sync, bool nofail);
165
166#endif
167
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
deleted file mode 100644
index 416d46adbf8..00000000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1087 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27			snprintf(str + strlen(str), len - strlen(str), "%s%s",
28				 (flag ? ", " : ""), "up");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is 2^n - 1, where 2^n is the smallest power of two >= foo.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
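
A few worked values, to make the mask rule concrete:

/*
 *	pg_num = 12: calc_bits_of(11) = 4, mask = (1 << 4) - 1 = 15
 *	pg_num = 16: calc_bits_of(15) = 4, mask = 15 (= pg_num - 1)
 *	pg_num = 17: calc_bits_of(16) = 5, mask = 31
 */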
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
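
All of the decode paths above follow the same bounds-checked cursor pattern: ceph_decode_need() (or a *_safe variant) proves that enough bytes remain between *p and end before any ceph_decode_*() call advances the cursor, and failure jumps to a bad label. A minimal user-space sketch of that pattern, with assumed semantics and the little-endian conversion elided:

	#include <stdint.h>
	#include <string.h>

	static int decode_two_u32(void **p, void *end, uint32_t *a, uint32_t *b)
	{
		/* the ceph_decode_need() step */
		if ((char *)end - (char *)*p < (long)(2 * sizeof(uint32_t)))
			return -1;               /* the "goto bad" path */
		memcpy(a, *p, sizeof(*a));       /* ceph_decode_32 */
		*p = (char *)*p + sizeof(*a);
		memcpy(b, *p, sizeof(*b));
		*p = (char *)*p + sizeof(*b);
		return 0;
	}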
316
317/*
318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
319 * to a set of osds)
320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427static void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
428{
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
433}
434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
526
527/*
528 * decode a full map.
529 */
530struct ceph_osdmap *osdmap_decode(void **p, void *end)
531{
532 struct ceph_osdmap *map;
533 u16 version;
534 u32 len, max, i;
535 u8 ev;
536 int err = -EINVAL;
537 void *start = *p;
538 struct ceph_pg_pool_info *pi;
539
540 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
541
542 map = kzalloc(sizeof(*map), GFP_NOFS);
543 if (map == NULL)
544 return ERR_PTR(-ENOMEM);
545 map->pg_temp = RB_ROOT;
546
547 ceph_decode_16_safe(p, end, version, bad);
548 if (version > CEPH_OSDMAP_VERSION) {
549 pr_warning("got unknown v %d > %d of osdmap\n", version,
550 CEPH_OSDMAP_VERSION);
551 goto bad;
552 }
553
554 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
555 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
556 map->epoch = ceph_decode_32(p);
557 ceph_decode_copy(p, &map->created, sizeof(map->created));
558 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
559
560 ceph_decode_32_safe(p, end, max, bad);
561 while (max--) {
562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
564 if (!pi)
565 goto bad;
566 pi->id = ceph_decode_32(p);
567 ev = ceph_decode_8(p); /* encoding version */
568 if (ev > CEPH_PG_POOL_VERSION) {
569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
570 ev, CEPH_PG_POOL_VERSION);
571 kfree(pi);
572 goto bad;
573 }
574 __decode_pool(p, pi);
575 __insert_pg_pool(&map->pg_pools, pi);
576 }
577
578 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
579 goto bad;
580
581 ceph_decode_32_safe(p, end, map->pool_max, bad);
582
583 ceph_decode_32_safe(p, end, map->flags, bad);
584
585 max = ceph_decode_32(p);
586
587 /* (re)alloc osd arrays */
588 err = osdmap_set_max_osd(map, max);
589 if (err < 0)
590 goto bad;
591 dout("osdmap_decode max_osd = %d\n", map->max_osd);
592
593 /* osds */
594 err = -EINVAL;
595 ceph_decode_need(p, end, 3*sizeof(u32) +
596 map->max_osd*(1 + sizeof(*map->osd_weight) +
597 sizeof(*map->osd_addr)), bad);
598 *p += 4; /* skip length field (should match max) */
599 ceph_decode_copy(p, map->osd_state, map->max_osd);
600
601 *p += 4; /* skip length field (should match max) */
602 for (i = 0; i < map->max_osd; i++)
603 map->osd_weight[i] = ceph_decode_32(p);
604
605 *p += 4; /* skip length field (should match max) */
606 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
607 for (i = 0; i < map->max_osd; i++)
608 ceph_decode_addr(&map->osd_addr[i]);
609
610 /* pg_temp */
611 ceph_decode_32_safe(p, end, len, bad);
612 for (i = 0; i < len; i++) {
613 int n, j;
614 struct ceph_pg pgid;
615 struct ceph_pg_mapping *pg;
616
617 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
618 ceph_decode_copy(p, &pgid, sizeof(pgid));
619 n = ceph_decode_32(p);
620 ceph_decode_need(p, end, n * sizeof(u32), bad);
621 err = -ENOMEM;
622 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
623 if (!pg)
624 goto bad;
625 pg->pgid = pgid;
626 pg->len = n;
627 for (j = 0; j < n; j++)
628 pg->osds[j] = ceph_decode_32(p);
629
630 err = __insert_pg_mapping(pg, &map->pg_temp);
631 if (err)
632 goto bad;
633 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
634 }
635
636 /* crush */
637 ceph_decode_32_safe(p, end, len, bad);
638 dout("osdmap_decode crush len %d from off 0x%x\n", len,
639 (int)(*p - start));
640 ceph_decode_need(p, end, len, bad);
641 map->crush = crush_decode(*p, end);
642 *p += len;
643 if (IS_ERR(map->crush)) {
644 err = PTR_ERR(map->crush);
645 map->crush = NULL;
646 goto bad;
647 }
648
649 /* ignore the rest of the map */
650 *p = end;
651
652 dout("osdmap_decode done %p %p\n", *p, end);
653 return map;
654
655bad:
656 dout("osdmap_decode fail\n");
657 ceph_osdmap_destroy(map);
658 return ERR_PTR(err);
659}
660
661/*
662 * decode and apply an incremental map update.
663 */
664struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
665 struct ceph_osdmap *map,
666 struct ceph_messenger *msgr)
667{
668 struct crush_map *newcrush = NULL;
669 struct ceph_fsid fsid;
670 u32 epoch = 0;
671 struct ceph_timespec modified;
672 u32 len, pool;
673 __s32 new_pool_max, new_flags, max;
674 void *start = *p;
675 int err = -EINVAL;
676 u16 version;
677 struct rb_node *rbp;
678
679 ceph_decode_16_safe(p, end, version, bad);
680 if (version > CEPH_OSDMAP_INC_VERSION) {
681 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
682 CEPH_OSDMAP_INC_VERSION);
683 goto bad;
684 }
685
686 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
687 bad);
688 ceph_decode_copy(p, &fsid, sizeof(fsid));
689 epoch = ceph_decode_32(p);
690 BUG_ON(epoch != map->epoch+1);
691 ceph_decode_copy(p, &modified, sizeof(modified));
692 new_pool_max = ceph_decode_32(p);
693 new_flags = ceph_decode_32(p);
694
695 /* full map? */
696 ceph_decode_32_safe(p, end, len, bad);
697 if (len > 0) {
698 dout("apply_incremental full map len %d, %p to %p\n",
699 len, *p, end);
700 return osdmap_decode(p, min(*p+len, end));
701 }
702
703 /* new crush? */
704 ceph_decode_32_safe(p, end, len, bad);
705 if (len > 0) {
706 dout("apply_incremental new crush map len %d, %p to %p\n",
707 len, *p, end);
708 newcrush = crush_decode(*p, min(*p+len, end));
709 if (IS_ERR(newcrush))
710 return ERR_CAST(newcrush);
711 *p += len;
712 }
713
714 /* new flags? */
715 if (new_flags >= 0)
716 map->flags = new_flags;
717 if (new_pool_max >= 0)
718 map->pool_max = new_pool_max;
719
720 ceph_decode_need(p, end, 5*sizeof(u32), bad);
721
722 /* new max? */
723 max = ceph_decode_32(p);
724 if (max >= 0) {
725 err = osdmap_set_max_osd(map, max);
726 if (err < 0)
727 goto bad;
728 }
729
730 map->epoch++;
731	map->modified = modified;
732 if (newcrush) {
733 if (map->crush)
734 crush_destroy(map->crush);
735 map->crush = newcrush;
736 newcrush = NULL;
737 }
738
739 /* new_pool */
740 ceph_decode_32_safe(p, end, len, bad);
741 while (len--) {
742 __u8 ev;
743 struct ceph_pg_pool_info *pi;
744
745 ceph_decode_32_safe(p, end, pool, bad);
746 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
747 ev = ceph_decode_8(p); /* encoding version */
748 if (ev > CEPH_PG_POOL_VERSION) {
749 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
750 ev, CEPH_PG_POOL_VERSION);
751 goto bad;
752 }
753 pi = __lookup_pg_pool(&map->pg_pools, pool);
754 if (!pi) {
755 pi = kzalloc(sizeof(*pi), GFP_NOFS);
756 if (!pi) {
757 err = -ENOMEM;
758 goto bad;
759 }
760 pi->id = pool;
761 __insert_pg_pool(&map->pg_pools, pi);
762 }
763 __decode_pool(p, pi);
764 }
765 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
766 goto bad;
767
768 /* old_pool */
769 ceph_decode_32_safe(p, end, len, bad);
770 while (len--) {
771 struct ceph_pg_pool_info *pi;
772
773 ceph_decode_32_safe(p, end, pool, bad);
774 pi = __lookup_pg_pool(&map->pg_pools, pool);
775 if (pi)
776 __remove_pg_pool(&map->pg_pools, pi);
777 }
778
779 /* new_up */
780 err = -EINVAL;
781 ceph_decode_32_safe(p, end, len, bad);
782 while (len--) {
783 u32 osd;
784 struct ceph_entity_addr addr;
785 ceph_decode_32_safe(p, end, osd, bad);
786 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
787 ceph_decode_addr(&addr);
788 pr_info("osd%d up\n", osd);
789 BUG_ON(osd >= map->max_osd);
790 map->osd_state[osd] |= CEPH_OSD_UP;
791 map->osd_addr[osd] = addr;
792 }
793
794 /* new_down */
795 ceph_decode_32_safe(p, end, len, bad);
796 while (len--) {
797 u32 osd;
798 ceph_decode_32_safe(p, end, osd, bad);
799 (*p)++; /* clean flag */
800 pr_info("osd%d down\n", osd);
801 if (osd < map->max_osd)
802 map->osd_state[osd] &= ~CEPH_OSD_UP;
803 }
804
805 /* new_weight */
806 ceph_decode_32_safe(p, end, len, bad);
807 while (len--) {
808 u32 osd, off;
809 ceph_decode_need(p, end, sizeof(u32)*2, bad);
810 osd = ceph_decode_32(p);
811 off = ceph_decode_32(p);
812 pr_info("osd%d weight 0x%x %s\n", osd, off,
813 off == CEPH_OSD_IN ? "(in)" :
814 (off == CEPH_OSD_OUT ? "(out)" : ""));
815 if (osd < map->max_osd)
816 map->osd_weight[osd] = off;
817 }
818
819 /* new_pg_temp */
820 rbp = rb_first(&map->pg_temp);
821 ceph_decode_32_safe(p, end, len, bad);
822 while (len--) {
823 struct ceph_pg_mapping *pg;
824 int j;
825 struct ceph_pg pgid;
826 u32 pglen;
827 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
828 ceph_decode_copy(p, &pgid, sizeof(pgid));
829 pglen = ceph_decode_32(p);
830
831 /* remove any? */
832 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
833 node)->pgid, pgid) <= 0) {
834 struct ceph_pg_mapping *cur =
835 rb_entry(rbp, struct ceph_pg_mapping, node);
836
837 rbp = rb_next(rbp);
838 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
839 rb_erase(&cur->node, &map->pg_temp);
840 kfree(cur);
841 }
842
843 if (pglen) {
844 /* insert */
845 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
846 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
847 if (!pg) {
848 err = -ENOMEM;
849 goto bad;
850 }
851 pg->pgid = pgid;
852 pg->len = pglen;
853 for (j = 0; j < pglen; j++)
854 pg->osds[j] = ceph_decode_32(p);
855 err = __insert_pg_mapping(pg, &map->pg_temp);
856 if (err) {
857 kfree(pg);
858 goto bad;
859 }
860 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
861 pglen);
862 }
863 }
864 while (rbp) {
865 struct ceph_pg_mapping *cur =
866 rb_entry(rbp, struct ceph_pg_mapping, node);
867
868 rbp = rb_next(rbp);
869 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
870 rb_erase(&cur->node, &map->pg_temp);
871 kfree(cur);
872 }
873
874 /* ignore the rest */
875 *p = end;
876 return map;
877
878bad:
879 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
880 epoch, (int)(*p - start), *p, start, end);
881 print_hex_dump(KERN_DEBUG, "osdmap: ",
882 DUMP_PREFIX_OFFSET, 16, 1,
883 start, end - start, true);
884 if (newcrush)
885 crush_destroy(newcrush);
886 return ERR_PTR(err);
887}
888
889
890
891
892/*
893 * calculate file layout from given offset, length.
894 * fill in correct oid, logical length, and object extent
895 * offset, length.
896 *
897 * for now, we write only a single su, until we can
898 * pass a stride back to the caller.
899 */
900void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
901 u64 off, u64 *plen,
902 u64 *ono,
903 u64 *oxoff, u64 *oxlen)
904{
905 u32 osize = le32_to_cpu(layout->fl_object_size);
906 u32 su = le32_to_cpu(layout->fl_stripe_unit);
907 u32 sc = le32_to_cpu(layout->fl_stripe_count);
908 u32 bl, stripeno, stripepos, objsetno;
909 u32 su_per_object;
910 u64 t, su_offset;
911
912 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
913 osize, su);
914 su_per_object = osize / su;
915 dout("osize %u / su %u = su_per_object %u\n", osize, su,
916 su_per_object);
917
918 BUG_ON((su & ~PAGE_MASK) != 0);
919 /* bl = *off / su; */
920 t = off;
921 do_div(t, su);
922 bl = t;
923 dout("off %llu / su %u = bl %u\n", off, su, bl);
924
925 stripeno = bl / sc;
926 stripepos = bl % sc;
927 objsetno = stripeno / su_per_object;
928
929 *ono = objsetno * sc + stripepos;
930 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
931
932 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
933 t = off;
934 su_offset = do_div(t, su);
935 *oxoff = su_offset + (stripeno % su_per_object) * su;
936
937 /*
938 * Calculate the length of the extent being written to the selected
939 * object. This is the minimum of the full length requested (plen) or
940 * the remainder of the current stripe being written to.
941 */
942 *oxlen = min_t(u64, *plen, su - su_offset);
943 *plen = *oxlen;
944
945 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
946}
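
A worked example with hypothetical layout values makes the arithmetic concrete:

	object_size = 262144, stripe_unit = 65536, stripe_count = 3
	  => su_per_object = 262144 / 65536 = 4
	off = 1000000, *plen = 100000
	  bl        = 1000000 / 65536            = 15
	  stripeno  = 15 / 3                     = 5
	  stripepos = 15 % 3                     = 0
	  objsetno  = 5 / 4                      = 1
	  *ono      = 1 * 3 + 0                  = 3
	  su_offset = 1000000 % 65536            = 16960
	  *oxoff    = 16960 + (5 % 4) * 65536    = 82496
	  *oxlen    = min(100000, 65536 - 16960) = 48576

Only 48576 of the requested 100000 bytes land in this object; the caller maps again at off + 48576 for the remainder.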
947
948/*
949 * calculate an object layout (i.e. pgid) from an oid,
950 * file_layout, and osdmap
951 */
952int ceph_calc_object_layout(struct ceph_object_layout *ol,
953 const char *oid,
954 struct ceph_file_layout *fl,
955 struct ceph_osdmap *osdmap)
956{
957 unsigned num, num_mask;
958 struct ceph_pg pgid;
959 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
960 int poolid = le32_to_cpu(fl->fl_pg_pool);
961 struct ceph_pg_pool_info *pool;
962 unsigned ps;
963
964 BUG_ON(!osdmap);
965
966 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
967 if (!pool)
968 return -EIO;
969 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
970 if (preferred >= 0) {
971 ps += preferred;
972 num = le32_to_cpu(pool->v.lpg_num);
973 num_mask = pool->lpg_num_mask;
974 } else {
975 num = le32_to_cpu(pool->v.pg_num);
976 num_mask = pool->pg_num_mask;
977 }
978
979 pgid.ps = cpu_to_le16(ps);
980 pgid.preferred = cpu_to_le16(preferred);
981 pgid.pool = fl->fl_pg_pool;
982 if (preferred >= 0)
983 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
984 (int)preferred);
985 else
986 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
987
988 ol->ol_pgid = pgid;
989 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
990 return 0;
991}
992
993/*
994 * Calculate raw osd vector for the given pgid. Return pointer to osd
995 * array, or NULL on failure.
996 */
997static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
998 int *osds, int *num)
999{
1000 struct ceph_pg_mapping *pg;
1001 struct ceph_pg_pool_info *pool;
1002 int ruleno;
1003 unsigned poolid, ps, pps;
1004 int preferred;
1005
1006 /* pg_temp? */
1007 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1008 if (pg) {
1009 *num = pg->len;
1010 return pg->osds;
1011 }
1012
1013 /* crush */
1014 poolid = le32_to_cpu(pgid.pool);
1015 ps = le16_to_cpu(pgid.ps);
1016 preferred = (s16)le16_to_cpu(pgid.preferred);
1017
1018 /* don't forcefeed bad device ids to crush */
1019 if (preferred >= osdmap->max_osd ||
1020 preferred >= osdmap->crush->max_devices)
1021 preferred = -1;
1022
1023 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1024 if (!pool)
1025 return NULL;
1026 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1027 pool->v.type, pool->v.size);
1028 if (ruleno < 0) {
1029 pr_err("no crush rule pool %d type %d size %d\n",
1030 poolid, pool->v.type, pool->v.size);
1031 return NULL;
1032 }
1033
1034 if (preferred >= 0)
1035 pps = ceph_stable_mod(ps,
1036 le32_to_cpu(pool->v.lpgp_num),
1037 pool->lpgp_num_mask);
1038 else
1039 pps = ceph_stable_mod(ps,
1040 le32_to_cpu(pool->v.pgp_num),
1041 pool->pgp_num_mask);
1042 pps += poolid;
1043 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1044 min_t(int, pool->v.size, *num),
1045 preferred, osdmap->osd_weight);
1046 return osds;
1047}
1048
1049/*
1050 * Return acting set for given pgid.
1051 */
1052int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1053 int *acting)
1054{
1055 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1056 int i, o, num = CEPH_PG_MAX_SIZE;
1057
1058 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1059 if (!osds)
1060 return -1;
1061
1062 /* primary is first up osd */
1063 o = 0;
1064 for (i = 0; i < num; i++)
1065 if (ceph_osd_is_up(osdmap, osds[i]))
1066 acting[o++] = osds[i];
1067 return o;
1068}
1069
1070/*
1071 * Return primary osd for given pgid, or -1 if none.
1072 */
1073int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1074{
1075 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1076 int i, num = CEPH_PG_MAX_SIZE;
1077
1078 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1079 if (!osds)
1080 return -1;
1081
1082 /* primary is first up osd */
1083 for (i = 0; i < num; i++)
1084 if (ceph_osd_is_up(osdmap, osds[i]))
1085 return osds[i];
1086 return -1;
1087}
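
A caller sketch for the two helpers above (osdmap and pgid are assumed to be in scope):

	int acting[CEPH_PG_MAX_SIZE];
	int n, primary = -1;

	n = ceph_calc_pg_acting(osdmap, pgid, acting);
	if (n > 0)
		primary = acting[0];  /* first up osd, same as ceph_calc_pg_primary() */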
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
deleted file mode 100644
index 970b547e510..00000000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,128 +0,0 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before i start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
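
With the hypothetical layout used in the osdmap.c example above (stripe_unit 65536, stripe_count 3, object_size 262144), the stripe width is 65536 * 3 = 196608 bytes and the period is 262144 * 3 = 786432 bytes: after each 786432-byte period, writes move on to a fresh set of 3 objects.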
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid);
127
128#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index b6859f47d36..00000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,55 +0,0 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = __page_cache_alloc(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
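
A minimal usage sketch (hypothetical caller; the encode helpers are declared in pagelist.h below):

	struct ceph_pagelist pl;
	int err;

	ceph_pagelist_init(&pl);
	err = ceph_pagelist_encode_32(&pl, 42);  /* appends a __le32 */
	if (err)
		return err;  /* -ENOMEM: could not add a page */
	/* ... hand pl.head's pages to the messenger ... */
	ceph_pagelist_release(&pl);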
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
deleted file mode 100644
index e8a4187e108..00000000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
deleted file mode 100644
index 8fcc023056c..00000000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,396 +0,0 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
62
63/*
64 * placement group.
65 * we encode this into one __le64.
66 */
67struct ceph_pg {
68 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */
71} __attribute__ ((packed));
72
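The packed 2+2+4-byte layout is what lets osdmap.c compare two pgids by reinterpreting them as a single u64 (see pgid_cmp() there); a build-time check sketch, not part of the original header:

	static inline void ceph_pg_layout_check(void)
	{
		BUILD_BUG_ON(sizeof(struct ceph_pg) != sizeof(u64));
	}
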
73/*
74 * pg_pool is a set of pgs storing a pool of objects
75 *
76 * pg_num -- base number of pseudorandomly placed pgs
77 *
78 * pgp_num -- effective number when calculating pg placement. this
79 * is used for pg_num increases. new pgs result in data being "split"
80 * into new pgs. for this to proceed smoothly, new pgs are initially
81 * colocated with their parents; that is, pgp_num doesn't increase
82 * until the new pgs have successfully split. only _then_ are the new
83 * pgs placed independently.
84 *
85 * lpg_num -- localized pg count (per device). replicas are randomly
86 * selected.
87 *
88 * lpgp_num -- as above.
89 */
90#define CEPH_PG_TYPE_REP 1
91#define CEPH_PG_TYPE_RAID4 2
92#define CEPH_PG_POOL_VERSION 2
93struct ceph_pg_pool {
94 __u8 type; /* CEPH_PG_TYPE_* */
95 __u8 size; /* number of osds in each pg */
96 __u8 crush_ruleset; /* crush placement rule */
97 __u8 object_hash; /* hash mapping object name to ps */
98 __le32 pg_num, pgp_num; /* number of pg's */
99 __le32 lpg_num, lpgp_num; /* number of localized pg's */
100 __le32 last_change; /* most recent epoch changed */
101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed));
107
108/*
109 * stable_mod func is used to control number of placement groups.
110 * similar to straight-up modulo, but produces a stable mapping as b
111 * increases over time. b is the number of bins, and bmask is the
112 * containing power of 2 minus 1.
113 *
114 * b <= bmask and bmask=(2**n)-1
115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
116 */
117static inline int ceph_stable_mod(int x, int b, int bmask)
118{
119 if ((x & bmask) < b)
120 return x & bmask;
121 else
122 return x & (bmask >> 1);
123}
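
Worked example with b = 12 bins, so bmask = 15:

	x = 21: 21 & 15 = 5          -> bin 5        (5 < 12, first branch)
	x = 29: 29 & 15 = 13         -> 29 & 7 = 5   (13 >= 12, fold into low half)

Growing b from 12 to 13 only remaps inputs with (x & 15) == 12; every other input keeps its bin, which is the stability property described above.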
124
125/*
126 * object layout - how a given object should be stored.
127 */
128struct ceph_object_layout {
129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
130 __le32 ol_stripe_unit; /* for per-object parity, if any */
131} __attribute__ ((packed));
132
133/*
134 * compound epoch+version, used by storage layer to serialize mutations
135 */
136struct ceph_eversion {
137 __le32 epoch;
138 __le64 version;
139} __attribute__ ((packed));
140
141/*
142 * osd map bits
143 */
144
145/* status bits */
146#define CEPH_OSD_EXISTS 1
147#define CEPH_OSD_UP 2
148
149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
150#define CEPH_OSD_IN 0x10000
151#define CEPH_OSD_OUT 0
152
153
154/*
155 * osd map flag bits
156 */
157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
162
163/*
164 * osd ops
165 */
166#define CEPH_OSD_OP_MODE 0xf000
167#define CEPH_OSD_OP_MODE_RD 0x1000
168#define CEPH_OSD_OP_MODE_WR 0x2000
169#define CEPH_OSD_OP_MODE_RMW 0x3000
170#define CEPH_OSD_OP_MODE_SUB 0x4000
171
172#define CEPH_OSD_OP_TYPE 0x0f00
173#define CEPH_OSD_OP_TYPE_LOCK 0x0100
174#define CEPH_OSD_OP_TYPE_DATA 0x0200
175#define CEPH_OSD_OP_TYPE_ATTR 0x0300
176#define CEPH_OSD_OP_TYPE_EXEC 0x0400
177#define CEPH_OSD_OP_TYPE_PG 0x0500
178
179enum {
180 /** data **/
181 /* read */
182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
184
185 /* fancy read */
186 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
187
188 /* write */
189 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
190 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
191 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
192 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
193 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
194
195 /* fancy write */
196 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
197 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
198 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
199 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
200
201 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
202 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206
207 /** attrs **/
208 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
212
213 /* write */
214 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
215 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
216 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
217 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
218
219 /** subop **/
220 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
221 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
222 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
223 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
224 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
225
226 /** lock **/
227 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
228 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
229 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
230 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
231 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
232 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
233
234 /** exec **/
235 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
236
237 /** pg **/
238 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
239};
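
Each opcode above packs a mode, a type, and a small index into one value; for example CEPH_OSD_OP_READ = 0x1000 | 0x0200 | 1 = 0x1201. That is why the ceph_osd_op_mode_*() and ceph_osd_op_type_*() predicates below reduce to simple mask-and-compare operations.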
240
241static inline int ceph_osd_op_type_lock(int op)
242{
243 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
244}
245static inline int ceph_osd_op_type_data(int op)
246{
247 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
248}
249static inline int ceph_osd_op_type_attr(int op)
250{
251 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
252}
253static inline int ceph_osd_op_type_exec(int op)
254{
255 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
256}
257static inline int ceph_osd_op_type_pg(int op)
258{
259 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
260}
261
262static inline int ceph_osd_op_mode_subop(int op)
263{
264 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
265}
266static inline int ceph_osd_op_mode_read(int op)
267{
268 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
269}
270static inline int ceph_osd_op_mode_modify(int op)
271{
272 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
273}
274
275#define CEPH_OSD_TMAP_HDR 'h'
276#define CEPH_OSD_TMAP_SET 's'
277#define CEPH_OSD_TMAP_RM 'r'
278
279extern const char *ceph_osd_op_name(int op);
280
281
282/*
283 * osd op flags
284 *
285 * An op may be READ, WRITE, or READ|WRITE.
286 */
287enum {
288 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
289 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
290 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
291 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
292 CEPH_OSD_FLAG_READ = 16, /* op may read */
293 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
294 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
295 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
296 CEPH_OSD_FLAG_BALANCE_READS = 256,
297 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
298 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
299 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
300};
301
302enum {
303 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
304};
305
306#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
307#define EBLACKLISTED ESHUTDOWN /* blacklisted */
308
309/* xattr comparison */
310enum {
311 CEPH_OSD_CMPXATTR_OP_NOP = 0,
312 CEPH_OSD_CMPXATTR_OP_EQ = 1,
313 CEPH_OSD_CMPXATTR_OP_NE = 2,
314 CEPH_OSD_CMPXATTR_OP_GT = 3,
315 CEPH_OSD_CMPXATTR_OP_GTE = 4,
316 CEPH_OSD_CMPXATTR_OP_LT = 5,
317 CEPH_OSD_CMPXATTR_OP_LTE = 6
318};
319
320enum {
321 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
322 CEPH_OSD_CMPXATTR_MODE_U64 = 2
323};
324
325/*
326 * an individual object operation. each may be accompanied by some data
327 * payload
328 */
329struct ceph_osd_op {
330 __le16 op; /* CEPH_OSD_OP_* */
331 __le32 flags; /* CEPH_OSD_FLAG_* */
332 union {
333 struct {
334 __le64 offset, length;
335 __le64 truncate_size;
336 __le32 truncate_seq;
337 } __attribute__ ((packed)) extent;
338 struct {
339 __le32 name_len;
340 __le32 value_len;
341 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
342 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
343 } __attribute__ ((packed)) xattr;
344 struct {
345 __u8 class_len;
346 __u8 method_len;
347 __u8 argc;
348 __le32 indata_len;
349 } __attribute__ ((packed)) cls;
350 struct {
351 __le64 cookie, count;
352 } __attribute__ ((packed)) pgls;
353 };
354 __le32 payload_len;
355} __attribute__ ((packed));
356
357/*
358 * osd request message header. each request may include multiple
359 * ceph_osd_op object operations.
360 */
361struct ceph_osd_request_head {
362 __le32 client_inc; /* client incarnation */
363 struct ceph_object_layout layout; /* pgid */
364 __le32 osdmap_epoch; /* client's osdmap epoch */
365
366 __le32 flags;
367
368 struct ceph_timespec mtime; /* for mutations only */
369 struct ceph_eversion reassert_version; /* if we are replaying op */
370
371 __le32 object_len; /* length of object name */
372
373 __le64 snapid; /* snapid to read */
374 __le64 snap_seq; /* writer's snap context */
375 __le32 num_snaps;
376
377 __le16 num_ops;
378 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
379} __attribute__ ((packed));
380
381struct ceph_osd_reply_head {
382 __le32 client_inc; /* client incarnation */
383 __le32 flags;
384 struct ceph_object_layout layout;
385 __le32 osdmap_epoch;
386 struct ceph_eversion reassert_version; /* for replaying uncommitted */
387
388 __le32 result; /* result code */
389
390 __le32 object_len; /* length of object name */
391 __le32 num_ops;
392 struct ceph_osd_op ops[0]; /* ops[], object */
393} __attribute__ ((packed));
394
395
396#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index c0b26b6badb..39c243acd06 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/sort.h> 3#include <linux/sort.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "decode.h" 7#include "mds_client.h"
8
9#include <linux/ceph/decode.h>
8 10
9/* 11/*
10 * Snapshots in ceph are driven in large part by cooperation from the 12 * Snapshots in ceph are driven in large part by cooperation from the
@@ -119,6 +121,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
 	INIT_LIST_HEAD(&realm->children);
 	INIT_LIST_HEAD(&realm->child_item);
 	INIT_LIST_HEAD(&realm->empty_item);
+	INIT_LIST_HEAD(&realm->dirty_item);
 	INIT_LIST_HEAD(&realm->inodes_with_caps);
 	spin_lock_init(&realm->inodes_with_caps_lock);
 	__insert_snap_realm(&mdsc->snap_realms, realm);
@@ -435,7 +438,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap_snap *capsnap;
-	int used;
+	int used, dirty;
 
 	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
 	if (!capsnap) {
@@ -445,6 +448,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 
 	spin_lock(&inode->i_lock);
 	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
 	if (__ceph_have_pending_cap_snap(ci)) {
 		/* there is no point in queuing multiple "pending" cap_snaps,
 		   as no new writes are allowed to start when pending, so any
@@ -452,27 +456,37 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		   cap_snap. lucky us. */
 		dout("queue_cap_snap %p already pending\n", inode);
 		kfree(capsnap);
-	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
+		   (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+			     CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
 		struct ceph_snap_context *snapc = ci->i_head_snapc;
 
+		dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
+		     capsnap, snapc);
 		igrab(inode);
 
 		atomic_set(&capsnap->nref, 1);
 		capsnap->ci = ci;
 		INIT_LIST_HEAD(&capsnap->ci_item);
 		INIT_LIST_HEAD(&capsnap->flushing_item);
 
-		capsnap->follows = snapc->seq - 1;
+		capsnap->follows = snapc->seq;
 		capsnap->issued = __ceph_caps_issued(ci, NULL);
-		capsnap->dirty = __ceph_caps_dirty(ci);
+		capsnap->dirty = dirty;
 
 		capsnap->mode = inode->i_mode;
 		capsnap->uid = inode->i_uid;
 		capsnap->gid = inode->i_gid;
 
-		/* fixme? */
-		capsnap->xattr_blob = NULL;
-		capsnap->xattr_len = 0;
+		if (dirty & CEPH_CAP_XATTR_EXCL) {
+			__ceph_build_xattrs_blob(ci);
+			capsnap->xattr_blob =
+				ceph_buffer_get(ci->i_xattrs.blob);
+			capsnap->xattr_version = ci->i_xattrs.version;
+		} else {
+			capsnap->xattr_blob = NULL;
+			capsnap->xattr_version = 0;
+		}
 
 		/* dirty page count moved from _head to this cap_snap;
 		   all subsequent writes page dirties occur _after_ this
@@ -480,7 +494,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
 		ci->i_wrbuffer_ref_head = 0;
 		capsnap->context = snapc;
-		ci->i_head_snapc = NULL;
+		ci->i_head_snapc =
+			ceph_get_snap_context(ci->i_snap_realm->cached_context);
+		dout(" new snapc is %p\n", ci->i_head_snapc);
 		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
 
 		if (used & CEPH_CAP_FILE_WR) {
@@ -512,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 			    struct ceph_cap_snap *capsnap)
 {
 	struct inode *inode = &ci->vfs_inode;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 
 	BUG_ON(capsnap->writing);
 	capsnap->size = inode->i_size;
@@ -539,6 +555,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 	return 1;  /* caller may want to ceph_flush_snaps */
 }
 
+/*
+ * Queue cap_snaps for snap writeback for this realm and its children.
+ * Called under snap_rwsem, so realm topology won't change.
+ */
+static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+{
+	struct ceph_inode_info *ci;
+	struct inode *lastinode = NULL;
+	struct ceph_snap_realm *child;
+
+	dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+
+	spin_lock(&realm->inodes_with_caps_lock);
+	list_for_each_entry(ci, &realm->inodes_with_caps,
+			    i_snap_realm_item) {
+		struct inode *inode = igrab(&ci->vfs_inode);
+		if (!inode)
+			continue;
+		spin_unlock(&realm->inodes_with_caps_lock);
+		if (lastinode)
+			iput(lastinode);
+		lastinode = inode;
+		ceph_queue_cap_snap(ci);
+		spin_lock(&realm->inodes_with_caps_lock);
+	}
+	spin_unlock(&realm->inodes_with_caps_lock);
+	if (lastinode)
+		iput(lastinode);
+
+	dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
+	list_for_each_entry(child, &realm->children, child_item)
+		queue_realm_cap_snaps(child);
+
+	dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+}
 
 /*
  * Parse and apply a snapblob "snap trace" from the MDS. This specifies
@@ -556,6 +607,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 	struct ceph_snap_realm *realm;
 	int invalidate = 0;
 	int err = -ENOMEM;
+	LIST_HEAD(dirty_realms);
 
 	dout("update_snap_trace deletion=%d\n", deletion);
 more:
@@ -578,45 +630,6 @@ more:
 		}
 	}
 
-	if (le64_to_cpu(ri->seq) > realm->seq) {
-		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
-		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
-		/*
-		 * if the realm seq has changed, queue a cap_snap for every
-		 * inode with open caps.  we do this _before_ we update
-		 * the realm info so that we prepare for writeback under the
-		 * _previous_ snap context.
-		 *
-		 * ...unless it's a snap deletion!
-		 */
-		if (!deletion) {
-			struct ceph_inode_info *ci;
-			struct inode *lastinode = NULL;
-
-			spin_lock(&realm->inodes_with_caps_lock);
-			list_for_each_entry(ci, &realm->inodes_with_caps,
-					    i_snap_realm_item) {
-				struct inode *inode = igrab(&ci->vfs_inode);
-				if (!inode)
-					continue;
-				spin_unlock(&realm->inodes_with_caps_lock);
-				if (lastinode)
-					iput(lastinode);
-				lastinode = inode;
-				ceph_queue_cap_snap(ci);
-				spin_lock(&realm->inodes_with_caps_lock);
-			}
-			spin_unlock(&realm->inodes_with_caps_lock);
-			if (lastinode)
-				iput(lastinode);
-			dout("update_snap_trace cap_snaps queued\n");
-		}
-
-	} else {
-		dout("update_snap_trace %llx %p seq %lld unchanged\n",
-		     realm->ino, realm, realm->seq);
-	}
-
 	/* ensure the parent is correct */
 	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
 	if (err < 0)
@@ -624,6 +637,8 @@ more:
 	invalidate += err;
 
 	if (le64_to_cpu(ri->seq) > realm->seq) {
+		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
 		/* update realm parameters, snap lists */
 		realm->seq = le64_to_cpu(ri->seq);
 		realm->created = le64_to_cpu(ri->created);
@@ -641,9 +656,17 @@ more:
 		if (err < 0)
 			goto fail;
 
+		/* queue realm for cap_snap creation */
+		list_add(&realm->dirty_item, &dirty_realms);
+
 		invalidate = 1;
 	} else if (!realm->cached_context) {
+		dout("update_snap_trace %llx %p seq %lld new\n",
+		     realm->ino, realm, realm->seq);
 		invalidate = 1;
+	} else {
+		dout("update_snap_trace %llx %p seq %lld unchanged\n",
+		     realm->ino, realm, realm->seq);
 	}
 
 	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
@@ -656,6 +679,14 @@ more:
 	if (invalidate)
 		rebuild_snap_realms(realm);
 
+	/*
+	 * queue cap snaps _after_ we've built the new snap contexts,
+	 * so that i_head_snapc can be set appropriately.
+	 */
+	list_for_each_entry(realm, &dirty_realms, dirty_item) {
+		queue_realm_cap_snaps(realm);
+	}
+
 	__cleanup_empty_realms(mdsc);
 	return 0;
 
@@ -688,7 +719,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
 		igrab(inode);
 		spin_unlock(&mdsc->snap_flush_lock);
 		spin_lock(&inode->i_lock);
-		__ceph_flush_snaps(ci, &session);
+		__ceph_flush_snaps(ci, &session, 0);
 		spin_unlock(&inode->i_lock);
 		iput(inode);
 		spin_lock(&mdsc->snap_flush_lock);
@@ -718,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 		      struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	int mds = session->s_mds;
 	u64 split;
 	int op;
@@ -789,6 +820,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			};
 			struct inode *inode = ceph_find_inode(sb, vino);
 			struct ceph_inode_info *ci;
+			struct ceph_snap_realm *oldrealm;
 
 			if (!inode)
 				continue;
@@ -814,18 +846,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
814 dout(" will move %p to split realm %llx %p\n", 846 dout(" will move %p to split realm %llx %p\n",
815 inode, realm->ino, realm); 847 inode, realm->ino, realm);
816 /* 848 /*
817 * Remove the inode from the realm's inode 849 * Move the inode to the new realm
818 * list, but don't add it to the new realm
819 * yet. We don't want the cap_snap to be
820 * queued (again) by ceph_update_snap_trace()
821 * below. Queue it _now_, under the old context.
822 */ 850 */
823 spin_lock(&realm->inodes_with_caps_lock); 851 spin_lock(&realm->inodes_with_caps_lock);
824 list_del_init(&ci->i_snap_realm_item); 852 list_del_init(&ci->i_snap_realm_item);
853 list_add(&ci->i_snap_realm_item,
854 &realm->inodes_with_caps);
855 oldrealm = ci->i_snap_realm;
856 ci->i_snap_realm = realm;
825 spin_unlock(&realm->inodes_with_caps_lock); 857 spin_unlock(&realm->inodes_with_caps_lock);
826 spin_unlock(&inode->i_lock); 858 spin_unlock(&inode->i_lock);
827 859
828 ceph_queue_cap_snap(ci); 860 ceph_get_snap_realm(mdsc, realm);
861 ceph_put_snap_realm(mdsc, oldrealm);
829 862
830 iput(inode); 863 iput(inode);
831 continue; 864 continue;
@@ -853,43 +886,9 @@ skip_inode:
853 ceph_update_snap_trace(mdsc, p, e, 886 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY); 887 op == CEPH_SNAP_OP_DESTROY);
855 888
856 if (op == CEPH_SNAP_OP_SPLIT) { 889 if (op == CEPH_SNAP_OP_SPLIT)
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (list_empty(&ci->i_snap_realm_item)) {
873 struct ceph_snap_realm *oldrealm =
874 ci->i_snap_realm;
875
876 dout(" moving %p to split realm %llx %p\n",
877 inode, realm->ino, realm);
878 spin_lock(&realm->inodes_with_caps_lock);
879 list_add(&ci->i_snap_realm_item,
880 &realm->inodes_with_caps);
881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
886 spin_unlock(&inode->i_lock);
887 iput(inode);
888 }
889
890 /* we took a reference when we created the realm, above */ 890 /* we took a reference when we created the realm, above */
891 ceph_put_snap_realm(mdsc, realm); 891 ceph_put_snap_realm(mdsc, realm);
892 }
893 892
894 __cleanup_empty_realms(mdsc); 893 __cleanup_empty_realms(mdsc);
895 894
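
The split handling above now does the whole move in one pass: the inode is unhooked from the old realm and hooked onto the new one under inodes_with_caps_lock, and the realm refcounts are fixed up afterwards, taking the new reference before dropping the old so neither realm can hit zero mid-move. A hedged userspace sketch of that get-before-put ordering:

    #include <assert.h>
    #include <stdlib.h>

    struct realm { int refs; };

    static void realm_get(struct realm *r) { r->refs++; }

    static void realm_put(struct realm *r)
    {
        assert(r->refs > 0);
        if (--r->refs == 0)
            free(r);
    }

    /* repoint *slot at newrealm; never let either refcount touch zero */
    static void move_realm(struct realm **slot, struct realm *newrealm)
    {
        struct realm *oldrealm = *slot;

        *slot = newrealm;       /* in the kernel this happens under the lock */
        realm_get(newrealm);    /* pin the new realm first ... */
        realm_put(oldrealm);    /* ... then let go of the old one */
    }

    int main(void)
    {
        struct realm *a = calloc(1, sizeof(*a));
        struct realm *b = calloc(1, sizeof(*b));
        struct realm *cur;

        a->refs = b->refs = 1;  /* creation references */
        cur = a;
        realm_get(cur);         /* the "inode's" reference */
        move_realm(&cur, b);    /* cur == b; a back to 1 ref, b at 2 */
        realm_put(cur);         /* inode drops its reference */
        realm_put(a);           /* drop creation refs; both freed */
        realm_put(b);
        return 0;
    }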
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index 7503aee828c..cd5097d7c80 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,70 +1,9 @@
1/* 1/*
2 * Ceph string constants 2 * Ceph fs string constants
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
14 default: return "unknown";
15 }
16}
17
18const char *ceph_osd_op_name(int op)
19{
20 switch (op) {
21 case CEPH_OSD_OP_READ: return "read";
22 case CEPH_OSD_OP_STAT: return "stat";
23
24 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
25
26 case CEPH_OSD_OP_WRITE: return "write";
27 case CEPH_OSD_OP_DELETE: return "delete";
28 case CEPH_OSD_OP_TRUNCATE: return "truncate";
29 case CEPH_OSD_OP_ZERO: return "zero";
30 case CEPH_OSD_OP_WRITEFULL: return "writefull";
31
32 case CEPH_OSD_OP_APPEND: return "append";
33 case CEPH_OSD_OP_STARTSYNC: return "startsync";
34 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
35 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
36
37 case CEPH_OSD_OP_TMAPUP: return "tmapup";
38 case CEPH_OSD_OP_TMAPGET: return "tmapget";
39 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
40
41 case CEPH_OSD_OP_GETXATTR: return "getxattr";
42 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
43 case CEPH_OSD_OP_SETXATTR: return "setxattr";
44 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
45 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
46 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
47 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68 7
69const char *ceph_mds_state_name(int s) 8const char *ceph_mds_state_name(int s)
70{ 9{
@@ -129,6 +68,8 @@ const char *ceph_mds_op_name(int op)
129 case CEPH_MDS_OP_LSSNAP: return "lssnap"; 68 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap"; 69 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap"; 70 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
71 case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
72 case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
132 } 73 }
133 return "???"; 74 return "???";
134} 75}
@@ -174,17 +115,3 @@ const char *ceph_snap_op_name(int o)
174 } 115 }
175 return "???"; 116 return "???";
176} 117}
177
178const char *ceph_pool_op_name(int op)
179{
180 switch (op) {
181 case POOL_OP_CREATE: return "create";
182 case POOL_OP_DELETE: return "delete";
183 case POOL_OP_AUID_CHANGE: return "auid change";
184 case POOL_OP_CREATE_SNAP: return "create snap";
185 case POOL_OP_DELETE_SNAP: return "delete snap";
186 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
187 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
188 }
189 return "???";
190}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fa87f51e38e..08b460ae053 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,7 +1,8 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/backing-dev.h> 4#include <linux/backing-dev.h>
5#include <linux/ctype.h>
5#include <linux/fs.h> 6#include <linux/fs.h>
6#include <linux/inet.h> 7#include <linux/inet.h>
7#include <linux/in6.h> 8#include <linux/in6.h>
@@ -14,10 +15,13 @@
14#include <linux/statfs.h> 15#include <linux/statfs.h>
15#include <linux/string.h> 16#include <linux/string.h>
16 17
17#include "decode.h"
18#include "super.h" 18#include "super.h"
19#include "mon_client.h" 19#include "mds_client.h"
20#include "auth.h" 20
21#include <linux/ceph/decode.h>
22#include <linux/ceph/mon_client.h>
23#include <linux/ceph/auth.h>
24#include <linux/ceph/debugfs.h>
21 25
22/* 26/*
23 * Ceph superblock operations 27 * Ceph superblock operations
@@ -25,36 +29,22 @@
25 * Handle the basics of mounting, unmounting. 29 * Handle the basics of mounting, unmounting.
26 */ 30 */
27 31
28
29/*
30 * find filename portion of a path (/foo/bar/baz -> baz)
31 */
32const char *ceph_file_part(const char *s, int len)
33{
34 const char *e = s + len;
35
36 while (e != s && *(e-1) != '/')
37 e--;
38 return e;
39}
40
41
42/* 32/*
43 * super ops 33 * super ops
44 */ 34 */
45static void ceph_put_super(struct super_block *s) 35static void ceph_put_super(struct super_block *s)
46{ 36{
47 struct ceph_client *client = ceph_sb_to_client(s); 37 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
48 38
49 dout("put_super\n"); 39 dout("put_super\n");
50 ceph_mdsc_close_sessions(&client->mdsc); 40 ceph_mdsc_close_sessions(fsc->mdsc);
51 41
52 /* 42 /*
53 * ensure we release the bdi before put_anon_super releases 43 * ensure we release the bdi before put_anon_super releases
54 * the device name. 44 * the device name.
55 */ 45 */
56 if (s->s_bdi == &client->backing_dev_info) { 46 if (s->s_bdi == &fsc->backing_dev_info) {
57 bdi_unregister(&client->backing_dev_info); 47 bdi_unregister(&fsc->backing_dev_info);
58 s->s_bdi = NULL; 48 s->s_bdi = NULL;
59 } 49 }
60 50
@@ -63,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
63 53
64static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 54static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
65{ 55{
66 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); 56 struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
67 struct ceph_monmap *monmap = client->monc.monmap; 57 struct ceph_monmap *monmap = fsc->client->monc.monmap;
68 struct ceph_statfs st; 58 struct ceph_statfs st;
69 u64 fsid; 59 u64 fsid;
70 int err; 60 int err;
71 61
72 dout("statfs\n"); 62 dout("statfs\n");
73 err = ceph_monc_do_statfs(&client->monc, &st); 63 err = ceph_monc_do_statfs(&fsc->client->monc, &st);
74 if (err < 0) 64 if (err < 0)
75 return err; 65 return err;
76 66
@@ -101,236 +91,30 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
101} 91}
102 92
103 93
104static int ceph_syncfs(struct super_block *sb, int wait) 94static int ceph_sync_fs(struct super_block *sb, int wait)
105{
106 dout("sync_fs %d\n", wait);
107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
109 dout("sync_fs %d done\n", wait);
110 return 0;
111}
112
113static int default_congestion_kb(void)
114{
115 int congestion_kb;
116
117 /*
118 * Copied from NFS
119 *
120 * congestion size, scale with available memory.
121 *
122 * 64MB: 8192k
123 * 128MB: 11585k
124 * 256MB: 16384k
125 * 512MB: 23170k
126 * 1GB: 32768k
127 * 2GB: 46340k
128 * 4GB: 65536k
129 * 8GB: 92681k
130 * 16GB: 131072k
131 *
132 * This allows larger machines to have larger/more transfers.
133 * Limit the default to 256M
134 */
135 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
136 if (congestion_kb > 256*1024)
137 congestion_kb = 256*1024;
138
139 return congestion_kb;
140}
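
default_congestion_kb() is dropped from this spot but still called by the new parse_mount_options() below; it scales the writeback-congestion default with the square root of total memory, capped at 256 MB: congestion_kb = 16 * int_sqrt(totalram_pages) << (PAGE_SHIFT - 10). A quick userspace check that reproduces the comment's table for 4 KB pages (compile with -lm; the odd table entries differ by a kilobyte or two because int_sqrt truncates):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        int page_shift = 12;                    /* assume 4 KB pages */

        for (long mb = 64; mb <= 16384; mb *= 2) {
            long pages = mb * 1024 / 4;         /* totalram_pages */
            long kb = (16 * (long)sqrt(pages)) << (page_shift - 10);

            if (kb > 256 * 1024)                /* cap at 256 MB */
                kb = 256 * 1024;
            printf("%6ldMB: %ldk\n", mb, kb);   /* e.g. 1GB -> 32768k */
        }
        return 0;
    }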
141
142/**
143 * ceph_show_options - Show mount options in /proc/mounts
144 * @m: seq_file to write to
145 * @mnt: mount descriptor
146 */
147static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
148{ 95{
149 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb); 96 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
150 struct ceph_mount_args *args = client->mount_args;
151
152 if (args->flags & CEPH_OPT_FSID)
153 seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
154 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
155 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
156 if (args->flags & CEPH_OPT_NOSHARE)
157 seq_puts(m, ",noshare");
158 if (args->flags & CEPH_OPT_DIRSTAT)
159 seq_puts(m, ",dirstat");
160 if ((args->flags & CEPH_OPT_RBYTES) == 0)
161 seq_puts(m, ",norbytes");
162 if (args->flags & CEPH_OPT_NOCRC)
163 seq_puts(m, ",nocrc");
164 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
165 seq_puts(m, ",noasyncreaddir");
166 97
167 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) 98 if (!wait) {
168 seq_printf(m, ",mount_timeout=%d", args->mount_timeout); 99 dout("sync_fs (non-blocking)\n");
169 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) 100 ceph_flush_dirty_caps(fsc->mdsc);
170 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); 101 dout("sync_fs (non-blocking) done\n");
171 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) 102 return 0;
172 seq_printf(m, ",osdtimeout=%d", args->osd_timeout); 103 }
173 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
174 seq_printf(m, ",osdkeepalivetimeout=%d",
175 args->osd_keepalive_timeout);
176 if (args->wsize)
177 seq_printf(m, ",wsize=%d", args->wsize);
178 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
179 seq_printf(m, ",rsize=%d", args->rsize);
180 if (args->congestion_kb != default_congestion_kb())
181 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
182 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
183 seq_printf(m, ",caps_wanted_delay_min=%d",
184 args->caps_wanted_delay_min);
185 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
186 seq_printf(m, ",caps_wanted_delay_max=%d",
187 args->caps_wanted_delay_max);
188 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
189 seq_printf(m, ",cap_release_safety=%d",
190 args->cap_release_safety);
191 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
192 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
193 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
194 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
195 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
196 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
197 if (args->name)
198 seq_printf(m, ",name=%s", args->name);
199 if (args->secret)
200 seq_puts(m, ",secret=<hidden>");
201 return 0;
202}
203
204/*
205 * caches
206 */
207struct kmem_cache *ceph_inode_cachep;
208struct kmem_cache *ceph_cap_cachep;
209struct kmem_cache *ceph_dentry_cachep;
210struct kmem_cache *ceph_file_cachep;
211
212static void ceph_inode_init_once(void *foo)
213{
214 struct ceph_inode_info *ci = foo;
215 inode_init_once(&ci->vfs_inode);
216}
217
218static int __init init_caches(void)
219{
220 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
221 sizeof(struct ceph_inode_info),
222 __alignof__(struct ceph_inode_info),
223 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
224 ceph_inode_init_once);
225 if (ceph_inode_cachep == NULL)
226 return -ENOMEM;
227
228 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
229 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
230 if (ceph_cap_cachep == NULL)
231 goto bad_cap;
232
233 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
234 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
235 if (ceph_dentry_cachep == NULL)
236 goto bad_dentry;
237
238 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
239 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
240 if (ceph_file_cachep == NULL)
241 goto bad_file;
242 104
105 dout("sync_fs (blocking)\n");
106 ceph_osdc_sync(&fsc->client->osdc);
107 ceph_mdsc_sync(fsc->mdsc);
108 dout("sync_fs (blocking) done\n");
243 return 0; 109 return 0;
244
245bad_file:
246 kmem_cache_destroy(ceph_dentry_cachep);
247bad_dentry:
248 kmem_cache_destroy(ceph_cap_cachep);
249bad_cap:
250 kmem_cache_destroy(ceph_inode_cachep);
251 return -ENOMEM;
252}
253
254static void destroy_caches(void)
255{
256 kmem_cache_destroy(ceph_inode_cachep);
257 kmem_cache_destroy(ceph_cap_cachep);
258 kmem_cache_destroy(ceph_dentry_cachep);
259 kmem_cache_destroy(ceph_file_cachep);
260}
261
262
263/*
264 * ceph_umount_begin - initiate forced umount. Tear down the
265 * mount, skipping steps that may hang while waiting for server(s).
266 */
267static void ceph_umount_begin(struct super_block *sb)
268{
269 struct ceph_client *client = ceph_sb_to_client(sb);
270
271 dout("ceph_umount_begin - starting forced umount\n");
272 if (!client)
273 return;
274 client->mount_state = CEPH_MOUNT_SHUTDOWN;
275 return;
276} 110}
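
The new ceph_sync_fs() above honors the ->sync_fs(sb, wait) contract: wait == 0 is an opportunistic kick, so it only starts dirty-cap writeback, while wait == 1 must block, so it waits out both the OSD and MDS queues. A toy userspace analog of that contract (illustrative only):

    #include <stdio.h>

    static void start_flush(void) { printf("writeback kicked off\n"); }
    static int  wait_flush(void)  { printf("writeback complete\n"); return 0; }

    /* userspace stand-in for the ->sync_fs(sb, wait) contract */
    static int example_sync_fs(int wait)
    {
        start_flush();          /* both cases start the flush */
        if (!wait)
            return 0;           /* wait == 0: opportunistic, return at once */
        return wait_flush();    /* wait == 1: block until data is stable */
    }

    int main(void)
    {
        example_sync_fs(0);     /* like sync_fs(sb, 0) */
        example_sync_fs(1);     /* like sync_fs(sb, 1) */
        return 0;
    }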
277 111
278static const struct super_operations ceph_super_ops = {
279 .alloc_inode = ceph_alloc_inode,
280 .destroy_inode = ceph_destroy_inode,
281 .write_inode = ceph_write_inode,
282 .sync_fs = ceph_syncfs,
283 .put_super = ceph_put_super,
284 .show_options = ceph_show_options,
285 .statfs = ceph_statfs,
286 .umount_begin = ceph_umount_begin,
287};
288
289
290const char *ceph_msg_type_name(int type)
291{
292 switch (type) {
293 case CEPH_MSG_SHUTDOWN: return "shutdown";
294 case CEPH_MSG_PING: return "ping";
295 case CEPH_MSG_AUTH: return "auth";
296 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
297 case CEPH_MSG_MON_MAP: return "mon_map";
298 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
299 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
300 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
301 case CEPH_MSG_STATFS: return "statfs";
302 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
303 case CEPH_MSG_MDS_MAP: return "mds_map";
304 case CEPH_MSG_CLIENT_SESSION: return "client_session";
305 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
306 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
307 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
308 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
309 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
310 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
311 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
312 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
313 case CEPH_MSG_OSD_MAP: return "osd_map";
314 case CEPH_MSG_OSD_OP: return "osd_op";
315 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
316 default: return "unknown";
317 }
318}
319
320
321/* 112/*
322 * mount options 113 * mount options
323 */ 114 */
324enum { 115enum {
325 Opt_fsidmajor,
326 Opt_fsidminor,
327 Opt_monport,
328 Opt_wsize, 116 Opt_wsize,
329 Opt_rsize, 117 Opt_rsize,
330 Opt_osdtimeout,
331 Opt_osdkeepalivetimeout,
332 Opt_mount_timeout,
333 Opt_osd_idle_ttl,
334 Opt_caps_wanted_delay_min, 118 Opt_caps_wanted_delay_min,
335 Opt_caps_wanted_delay_max, 119 Opt_caps_wanted_delay_max,
336 Opt_cap_release_safety, 120 Opt_cap_release_safety,
@@ -340,30 +124,18 @@ enum {
340 Opt_last_int, 124 Opt_last_int,
341 /* int args above */ 125 /* int args above */
342 Opt_snapdirname, 126 Opt_snapdirname,
343 Opt_name,
344 Opt_secret,
345 Opt_last_string, 127 Opt_last_string,
346 /* string args above */ 128 /* string args above */
347 Opt_ip,
348 Opt_noshare,
349 Opt_dirstat, 129 Opt_dirstat,
350 Opt_nodirstat, 130 Opt_nodirstat,
351 Opt_rbytes, 131 Opt_rbytes,
352 Opt_norbytes, 132 Opt_norbytes,
353 Opt_nocrc,
354 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
355}; 134};
356 135
357static match_table_t arg_tokens = { 136static match_table_t fsopt_tokens = {
358 {Opt_fsidmajor, "fsidmajor=%ld"},
359 {Opt_fsidminor, "fsidminor=%ld"},
360 {Opt_monport, "monport=%d"},
361 {Opt_wsize, "wsize=%d"}, 137 {Opt_wsize, "wsize=%d"},
362 {Opt_rsize, "rsize=%d"}, 138 {Opt_rsize, "rsize=%d"},
363 {Opt_osdtimeout, "osdtimeout=%d"},
364 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
365 {Opt_mount_timeout, "mount_timeout=%d"},
366 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
367 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 139 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
368 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 140 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
369 {Opt_cap_release_safety, "cap_release_safety=%d"}, 141 {Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -372,372 +144,458 @@ static match_table_t arg_tokens = {
372 {Opt_congestion_kb, "write_congestion_kb=%d"}, 144 {Opt_congestion_kb, "write_congestion_kb=%d"},
373 /* int args above */ 145 /* int args above */
374 {Opt_snapdirname, "snapdirname=%s"}, 146 {Opt_snapdirname, "snapdirname=%s"},
375 {Opt_name, "name=%s"},
376 {Opt_secret, "secret=%s"},
377 /* string args above */ 147 /* string args above */
378 {Opt_ip, "ip=%s"},
379 {Opt_noshare, "noshare"},
380 {Opt_dirstat, "dirstat"}, 148 {Opt_dirstat, "dirstat"},
381 {Opt_nodirstat, "nodirstat"}, 149 {Opt_nodirstat, "nodirstat"},
382 {Opt_rbytes, "rbytes"}, 150 {Opt_rbytes, "rbytes"},
383 {Opt_norbytes, "norbytes"}, 151 {Opt_norbytes, "norbytes"},
384 {Opt_nocrc, "nocrc"},
385 {Opt_noasyncreaddir, "noasyncreaddir"}, 152 {Opt_noasyncreaddir, "noasyncreaddir"},
386 {-1, NULL} 153 {-1, NULL}
387}; 154};
388 155
389 156static int parse_fsopt_token(char *c, void *private)
390static struct ceph_mount_args *parse_mount_args(int flags, char *options,
391 const char *dev_name,
392 const char **path)
393{ 157{
394 struct ceph_mount_args *args; 158 struct ceph_mount_options *fsopt = private;
395 const char *c;
396 int err = -ENOMEM;
397 substring_t argstr[MAX_OPT_ARGS]; 159 substring_t argstr[MAX_OPT_ARGS];
160 int token, intval, ret;
161
162 token = match_token((char *)c, fsopt_tokens, argstr);
163 if (token < 0)
164 return -EINVAL;
165
166 if (token < Opt_last_int) {
167 ret = match_int(&argstr[0], &intval);
168 if (ret < 0) {
169 pr_err("bad mount option arg (not int) "
170 "at '%s'\n", c);
171 return ret;
172 }
173 dout("got int token %d val %d\n", token, intval);
174 } else if (token > Opt_last_int && token < Opt_last_string) {
175 dout("got string token %d val %s\n", token,
176 argstr[0].from);
177 } else {
178 dout("got token %d\n", token);
179 }
398 180
399 args = kzalloc(sizeof(*args), GFP_KERNEL); 181 switch (token) {
400 if (!args) 182 case Opt_snapdirname:
401 return ERR_PTR(-ENOMEM); 183 kfree(fsopt->snapdir_name);
402 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), 184 fsopt->snapdir_name = kstrndup(argstr[0].from,
403 GFP_KERNEL); 185 argstr[0].to-argstr[0].from,
404 if (!args->mon_addr) 186 GFP_KERNEL);
405 goto out; 187 if (!fsopt->snapdir_name)
406 188 return -ENOMEM;
407 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); 189 break;
408 190
409 /* start with defaults */ 191 /* misc */
410 args->sb_flags = flags; 192 case Opt_wsize:
411 args->flags = CEPH_OPT_DEFAULT; 193 fsopt->wsize = intval;
412 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; 194 break;
413 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 195 case Opt_rsize:
414 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 196 fsopt->rsize = intval;
415 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ 197 break;
416 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; 198 case Opt_caps_wanted_delay_min:
417 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 199 fsopt->caps_wanted_delay_min = intval;
418 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 200 break;
419 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 201 case Opt_caps_wanted_delay_max:
420 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 202 fsopt->caps_wanted_delay_max = intval;
421 args->max_readdir = CEPH_MAX_READDIR_DEFAULT; 203 break;
422 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 204 case Opt_readdir_max_entries:
423 args->congestion_kb = default_congestion_kb(); 205 fsopt->max_readdir = intval;
424 206 break;
425 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 207 case Opt_readdir_max_bytes:
426 err = -EINVAL; 208 fsopt->max_readdir_bytes = intval;
427 if (!dev_name) 209 break;
428 goto out; 210 case Opt_congestion_kb:
429 *path = strstr(dev_name, ":/"); 211 fsopt->congestion_kb = intval;
430 if (*path == NULL) { 212 break;
431 pr_err("device name is missing path (no :/ in %s)\n", 213 case Opt_dirstat:
432 dev_name); 214 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
433 goto out; 215 break;
216 case Opt_nodirstat:
217 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
218 break;
219 case Opt_rbytes:
220 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
221 break;
222 case Opt_norbytes:
223 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
224 break;
225 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break;
228 default:
229 BUG_ON(token);
434 } 230 }
231 return 0;
232}
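
parse_fsopt_token() above dispatches on token ranges: tokens below Opt_last_int carry an int argument (fetched with match_int()), tokens between Opt_last_int and Opt_last_string carry a string, and the rest are bare flags. A userspace sketch of the same range-style dispatch, with sscanf standing in for the kernel's match_token()/match_int():

    #include <stdio.h>
    #include <string.h>

    enum { Opt_wsize, Opt_last_int,          /* int args above */
           Opt_snapdirname, Opt_last_string, /* string args above */
           Opt_dirstat };                    /* bare flags below */

    static int parse(const char *c)
    {
        int intval;
        char strval[64];

        if (sscanf(c, "wsize=%d", &intval) == 1) {
            printf("int token wsize = %d\n", intval);
            return 0;
        }
        if (sscanf(c, "snapdirname=%63s", strval) == 1) {
            printf("string token snapdirname = %s\n", strval);
            return 0;
        }
        if (!strcmp(c, "dirstat")) {
            printf("flag token dirstat\n");
            return 0;
        }
        return -1;                           /* unknown option */
    }

    int main(void)
    {
        parse("wsize=65536");
        parse("snapdirname=.snap");
        parse("dirstat");
        return 0;
    }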
435 233
436 /* get mon ip(s) */ 234static void destroy_mount_options(struct ceph_mount_options *args)
437 err = ceph_parse_ips(dev_name, *path, args->mon_addr, 235{
438 CEPH_MAX_MON, &args->num_mon); 236 dout("destroy_mount_options %p\n", args);
439 if (err < 0) 237 kfree(args->snapdir_name);
440 goto out; 238 kfree(args);
239}
240
241static int strcmp_null(const char *s1, const char *s2)
242{
243 if (!s1 && !s2)
244 return 0;
245 if (s1 && !s2)
246 return -1;
247 if (!s1 && s2)
248 return 1;
249 return strcmp(s1, s2);
250}
251
252static int compare_mount_options(struct ceph_mount_options *new_fsopt,
253 struct ceph_options *new_opt,
254 struct ceph_fs_client *fsc)
255{
256 struct ceph_mount_options *fsopt1 = new_fsopt;
257 struct ceph_mount_options *fsopt2 = fsc->mount_options;
258 int ofs = offsetof(struct ceph_mount_options, snapdir_name);
259 int ret;
260
261 ret = memcmp(fsopt1, fsopt2, ofs);
262 if (ret)
263 return ret;
264
265 ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
266 if (ret)
267 return ret;
268
269 return ceph_compare_options(new_opt, fsc->client);
270}
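
compare_mount_options() leans on the struct layout: every field before snapdir_name is a plain int, so one memcmp over offsetof(struct ceph_mount_options, snapdir_name) bytes covers them all (safe here because the structs are kzalloc'ed, so padding is zeroed), and the pointer member gets the NULL-safe strcmp_null(). An illustrative userspace version of the trick:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    struct opts {
        int flags, wsize, rsize, congestion_kb; /* flat fields: memcmp-able */
        char *snapdir_name;                     /* pointer: compared separately */
    };

    static int strcmp_null(const char *s1, const char *s2)
    {
        if (!s1 && !s2) return 0;
        if (s1 && !s2)  return -1;
        if (!s1 && s2)  return 1;
        return strcmp(s1, s2);
    }

    static int opts_equal(const struct opts *a, const struct opts *b)
    {
        size_t flat = offsetof(struct opts, snapdir_name);

        return !memcmp(a, b, flat) &&
               !strcmp_null(a->snapdir_name, b->snapdir_name);
    }

    int main(void)
    {
        struct opts a = { 1, 0, 0, 0, ".snap" };
        struct opts b = a;

        printf("%d\n", opts_equal(&a, &b));  /* 1: identical */
        b.snapdir_name = NULL;
        printf("%d\n", opts_equal(&a, &b));  /* 0: snapdir differs */
        return 0;
    }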
271
272static int parse_mount_options(struct ceph_mount_options **pfsopt,
273 struct ceph_options **popt,
274 int flags, char *options,
275 const char *dev_name,
276 const char **path)
277{
278 struct ceph_mount_options *fsopt;
279 const char *dev_name_end;
280 int err = -ENOMEM;
281
282 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
283 if (!fsopt)
284 return -ENOMEM;
285
286 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
287
288 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
294 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
295 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
296 fsopt->congestion_kb = default_congestion_kb();
297
298 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
299 err = -EINVAL;
300 if (!dev_name)
301 goto out;
302 *path = strstr(dev_name, ":/");
303 if (*path == NULL) {
304 pr_err("device name is missing path (no :/ in %s)\n",
305 dev_name);
306 goto out;
307 }
308 dev_name_end = *path;
309 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
441 310
442 /* path on server */ 311 /* path on server */
443 *path += 2; 312 *path += 2;
444 dout("server path '%s'\n", *path); 313 dout("server path '%s'\n", *path);
445 314
446 /* parse mount options */ 315 err = ceph_parse_options(popt, options, dev_name, dev_name_end,
447 while ((c = strsep(&options, ",")) != NULL) { 316 parse_fsopt_token, (void *)fsopt);
448 int token, intval, ret; 317 if (err)
449 if (!*c) 318 goto out;
450 continue; 319
451 err = -EINVAL; 320 /* success */
452 token = match_token((char *)c, arg_tokens, argstr); 321 *pfsopt = fsopt;
453 if (token < 0) { 322 return 0;
454 pr_err("bad mount option at '%s'\n", c);
455 goto out;
456 }
457 if (token < Opt_last_int) {
458 ret = match_int(&argstr[0], &intval);
459 if (ret < 0) {
460 pr_err("bad mount option arg (not int) "
461 "at '%s'\n", c);
462 continue;
463 }
464 dout("got int token %d val %d\n", token, intval);
465 } else if (token > Opt_last_int && token < Opt_last_string) {
466 dout("got string token %d val %s\n", token,
467 argstr[0].from);
468 } else {
469 dout("got token %d\n", token);
470 }
471 switch (token) {
472 case Opt_fsidmajor:
473 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
474 break;
475 case Opt_fsidminor:
476 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
477 break;
478 case Opt_ip:
479 err = ceph_parse_ips(argstr[0].from,
480 argstr[0].to,
481 &args->my_addr,
482 1, NULL);
483 if (err < 0)
484 goto out;
485 args->flags |= CEPH_OPT_MYIP;
486 break;
487
488 case Opt_snapdirname:
489 kfree(args->snapdir_name);
490 args->snapdir_name = kstrndup(argstr[0].from,
491 argstr[0].to-argstr[0].from,
492 GFP_KERNEL);
493 break;
494 case Opt_name:
495 args->name = kstrndup(argstr[0].from,
496 argstr[0].to-argstr[0].from,
497 GFP_KERNEL);
498 break;
499 case Opt_secret:
500 args->secret = kstrndup(argstr[0].from,
501 argstr[0].to-argstr[0].from,
502 GFP_KERNEL);
503 break;
504
505 /* misc */
506 case Opt_wsize:
507 args->wsize = intval;
508 break;
509 case Opt_rsize:
510 args->rsize = intval;
511 break;
512 case Opt_osdtimeout:
513 args->osd_timeout = intval;
514 break;
515 case Opt_osdkeepalivetimeout:
516 args->osd_keepalive_timeout = intval;
517 break;
518 case Opt_mount_timeout:
519 args->mount_timeout = intval;
520 break;
521 case Opt_caps_wanted_delay_min:
522 args->caps_wanted_delay_min = intval;
523 break;
524 case Opt_caps_wanted_delay_max:
525 args->caps_wanted_delay_max = intval;
526 break;
527 case Opt_readdir_max_entries:
528 args->max_readdir = intval;
529 break;
530 case Opt_readdir_max_bytes:
531 args->max_readdir_bytes = intval;
532 break;
533 case Opt_congestion_kb:
534 args->congestion_kb = intval;
535 break;
536
537 case Opt_noshare:
538 args->flags |= CEPH_OPT_NOSHARE;
539 break;
540
541 case Opt_dirstat:
542 args->flags |= CEPH_OPT_DIRSTAT;
543 break;
544 case Opt_nodirstat:
545 args->flags &= ~CEPH_OPT_DIRSTAT;
546 break;
547 case Opt_rbytes:
548 args->flags |= CEPH_OPT_RBYTES;
549 break;
550 case Opt_norbytes:
551 args->flags &= ~CEPH_OPT_RBYTES;
552 break;
553 case Opt_nocrc:
554 args->flags |= CEPH_OPT_NOCRC;
555 break;
556 case Opt_noasyncreaddir:
557 args->flags |= CEPH_OPT_NOASYNCREADDIR;
558 break;
559
560 default:
561 BUG_ON(token);
562 }
563 }
564 return args;
565 323
566out: 324out:
567 kfree(args->mon_addr); 325 destroy_mount_options(fsopt);
568 kfree(args); 326 return err;
569 return ERR_PTR(err);
570} 327}
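
The device string has the shape ip1[:port1][,ip2[:port2]...]:/subdir/in/fs; parse_mount_options() locates the first ":/" with strstr(), hands everything before it to ceph_parse_options() as the monitor list, and keeps what follows as the path on the server. The split itself, in a standalone sketch:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *dev_name = "1.2.3.4:6789,5.6.7.8:/home/export";
        const char *path = strstr(dev_name, ":/");

        if (!path) {
            fprintf(stderr, "device name is missing path (no :/ in %s)\n",
                    dev_name);
            return 1;
        }
        printf("monitors: '%.*s'\n", (int)(path - dev_name), dev_name);
        printf("server path: '%s'\n", path + 2);   /* skip the ":/" */
        return 0;
    }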
571 328
572static void destroy_mount_args(struct ceph_mount_args *args) 329/**
330 * ceph_show_options - Show mount options in /proc/mounts
331 * @m: seq_file to write to
332 * @mnt: mount descriptor
333 */
334static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
573{ 335{
574 dout("destroy_mount_args %p\n", args); 336 struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
575 kfree(args->snapdir_name); 337 struct ceph_mount_options *fsopt = fsc->mount_options;
576 args->snapdir_name = NULL; 338 struct ceph_options *opt = fsc->client->options;
577 kfree(args->name); 339
578 args->name = NULL; 340 if (opt->flags & CEPH_OPT_FSID)
579 kfree(args->secret); 341 seq_printf(m, ",fsid=%pU", &opt->fsid);
580 args->secret = NULL; 342 if (opt->flags & CEPH_OPT_NOSHARE)
581 kfree(args); 343 seq_puts(m, ",noshare");
344 if (opt->flags & CEPH_OPT_NOCRC)
345 seq_puts(m, ",nocrc");
346
347 if (opt->name)
348 seq_printf(m, ",name=%s", opt->name);
349 if (opt->secret)
350 seq_puts(m, ",secret=<hidden>");
351
352 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
353 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
354 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
355 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
356 if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
357 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
358 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
359 seq_printf(m, ",osdkeepalivetimeout=%d",
360 opt->osd_keepalive_timeout);
361
362 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
363 seq_puts(m, ",dirstat");
364 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
365 seq_puts(m, ",norbytes");
366 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
367 seq_puts(m, ",noasyncreaddir");
368
369 if (fsopt->wsize)
370 seq_printf(m, ",wsize=%d", fsopt->wsize);
371 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
372 seq_printf(m, ",rsize=%d", fsopt->rsize);
373 if (fsopt->congestion_kb != default_congestion_kb())
374 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
375 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
376 seq_printf(m, ",caps_wanted_delay_min=%d",
377 fsopt->caps_wanted_delay_min);
378 if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
379 seq_printf(m, ",caps_wanted_delay_max=%d",
380 fsopt->caps_wanted_delay_max);
381 if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
382 seq_printf(m, ",cap_release_safety=%d",
383 fsopt->cap_release_safety);
384 if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
385 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
386 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
387 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
388 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
389 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
390 return 0;
582} 391}
583 392
584/* 393/*
585 * create a fresh client instance 394 * handle any mon messages the standard library doesn't understand.
 395 * return an error if we don't understand it either.
586 */ 396 */
587static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) 397static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
588{ 398{
589 struct ceph_client *client; 399 struct ceph_fs_client *fsc = client->private;
400 int type = le16_to_cpu(msg->hdr.type);
401
402 switch (type) {
403 case CEPH_MSG_MDS_MAP:
404 ceph_mdsc_handle_map(fsc->mdsc, msg);
405 return 0;
406
407 default:
408 return -1;
409 }
410}
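
extra_mon_dispatch() is registered as a fallback hook on the shared libceph client: the core handles the monitor message types it knows, offers the rest to the hook, and treats a -1 return as an unknown type. A toy version of that dispatch chain (the message type values here are illustrative, not the protocol's):

    #include <stdio.h>

    #define MSG_MON_MAP 1   /* illustrative type values */
    #define MSG_MDS_MAP 2

    static int (*extra_dispatch)(int type);    /* filesystem's hook */

    static int fs_extra_dispatch(int type)
    {
        if (type == MSG_MDS_MAP) {
            printf("fs layer: handling mdsmap\n");
            return 0;
        }
        return -1;                              /* not ours either */
    }

    static void dispatch(int type)
    {
        if (type == MSG_MON_MAP) {              /* core library types */
            printf("core: handling monmap\n");
            return;
        }
        if (extra_dispatch && extra_dispatch(type) == 0)
            return;                             /* hook consumed it */
        printf("unknown message type %d\n", type);
    }

    int main(void)
    {
        extra_dispatch = fs_extra_dispatch;
        dispatch(MSG_MON_MAP);
        dispatch(MSG_MDS_MAP);
        dispatch(99);
        return 0;
    }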
411
412/*
413 * create a new fs client
414 */
415struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
416 struct ceph_options *opt)
417{
418 struct ceph_fs_client *fsc;
590 int err = -ENOMEM; 419 int err = -ENOMEM;
591 420
592 client = kzalloc(sizeof(*client), GFP_KERNEL); 421 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
593 if (client == NULL) 422 if (!fsc)
594 return ERR_PTR(-ENOMEM); 423 return ERR_PTR(-ENOMEM);
595 424
596 mutex_init(&client->mount_mutex); 425 fsc->client = ceph_create_client(opt, fsc);
597 426 if (IS_ERR(fsc->client)) {
598 init_waitqueue_head(&client->auth_wq); 427 err = PTR_ERR(fsc->client);
428 goto fail;
429 }
430 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
432 fsc->client->monc.want_mdsmap = 1;
599 433
600 client->sb = NULL; 434 fsc->mount_options = fsopt;
601 client->mount_state = CEPH_MOUNT_MOUNTING;
602 client->mount_args = args;
603 435
604 client->msgr = NULL; 436 fsc->sb = NULL;
437 fsc->mount_state = CEPH_MOUNT_MOUNTING;
605 438
606 client->auth_err = 0; 439 atomic_long_set(&fsc->writeback_count, 0);
607 atomic_long_set(&client->writeback_count, 0);
608 440
609 err = bdi_init(&client->backing_dev_info); 441 err = bdi_init(&fsc->backing_dev_info);
610 if (err < 0) 442 if (err < 0)
611 goto fail; 443 goto fail_client;
612 444
613 err = -ENOMEM; 445 err = -ENOMEM;
614 client->wb_wq = create_workqueue("ceph-writeback"); 446 fsc->wb_wq = create_workqueue("ceph-writeback");
615 if (client->wb_wq == NULL) 447 if (fsc->wb_wq == NULL)
616 goto fail_bdi; 448 goto fail_bdi;
617 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 449 fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
618 if (client->pg_inv_wq == NULL) 450 if (fsc->pg_inv_wq == NULL)
619 goto fail_wb_wq; 451 goto fail_wb_wq;
620 client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 452 fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
621 if (client->trunc_wq == NULL) 453 if (fsc->trunc_wq == NULL)
622 goto fail_pg_inv_wq; 454 goto fail_pg_inv_wq;
623 455
624 /* set up mempools */ 456 /* set up mempools */
625 err = -ENOMEM; 457 err = -ENOMEM;
626 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, 458 fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
627 client->mount_args->wsize >> PAGE_CACHE_SHIFT); 459 fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
628 if (!client->wb_pagevec_pool) 460 if (!fsc->wb_pagevec_pool)
629 goto fail_trunc_wq; 461 goto fail_trunc_wq;
630 462
631 /* caps */ 463 /* caps */
632 client->min_caps = args->max_readdir; 464 fsc->min_caps = fsopt->max_readdir;
633 ceph_adjust_min_caps(client->min_caps); 465
466 return fsc;
634 467
635 /* subsystems */
636 err = ceph_monc_init(&client->monc, client);
637 if (err < 0)
638 goto fail_mempool;
639 err = ceph_osdc_init(&client->osdc, client);
640 if (err < 0)
641 goto fail_monc;
642 err = ceph_mdsc_init(&client->mdsc, client);
643 if (err < 0)
644 goto fail_osdc;
645 return client;
646
647fail_osdc:
648 ceph_osdc_stop(&client->osdc);
649fail_monc:
650 ceph_monc_stop(&client->monc);
651fail_mempool:
652 mempool_destroy(client->wb_pagevec_pool);
653fail_trunc_wq: 468fail_trunc_wq:
654 destroy_workqueue(client->trunc_wq); 469 destroy_workqueue(fsc->trunc_wq);
655fail_pg_inv_wq: 470fail_pg_inv_wq:
656 destroy_workqueue(client->pg_inv_wq); 471 destroy_workqueue(fsc->pg_inv_wq);
657fail_wb_wq: 472fail_wb_wq:
658 destroy_workqueue(client->wb_wq); 473 destroy_workqueue(fsc->wb_wq);
659fail_bdi: 474fail_bdi:
660 bdi_destroy(&client->backing_dev_info); 475 bdi_destroy(&fsc->backing_dev_info);
476fail_client:
477 ceph_destroy_client(fsc->client);
661fail: 478fail:
662 kfree(client); 479 kfree(fsc);
663 return ERR_PTR(err); 480 return ERR_PTR(err);
664} 481}
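
create_fs_client() uses the usual kernel unwind idiom: each acquisition gets a matching fail_* label, and an error jumps to the label that releases everything obtained so far, in reverse order. The shape, reduced to a standalone sketch:

    #include <stdlib.h>

    struct fsc { void *a, *b, *c; };

    static struct fsc *create(void)
    {
        struct fsc *f = calloc(1, sizeof(*f));

        if (!f)
            return NULL;
        f->a = malloc(16);
        if (!f->a)
            goto fail;
        f->b = malloc(16);
        if (!f->b)
            goto fail_a;        /* unwind in reverse order of acquisition */
        f->c = malloc(16);
        if (!f->c)
            goto fail_b;
        return f;

    fail_b:
        free(f->b);
    fail_a:
        free(f->a);
    fail:
        free(f);
        return NULL;
    }

    int main(void)
    {
        struct fsc *f = create();

        if (f) {
            free(f->c); free(f->b); free(f->a); free(f);
        }
        return 0;
    }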
665 482
666static void ceph_destroy_client(struct ceph_client *client) 483void destroy_fs_client(struct ceph_fs_client *fsc)
667{ 484{
668 dout("destroy_client %p\n", client); 485 dout("destroy_fs_client %p\n", fsc);
669 486
670 /* unmount */ 487 destroy_workqueue(fsc->wb_wq);
671 ceph_mdsc_stop(&client->mdsc); 488 destroy_workqueue(fsc->pg_inv_wq);
672 ceph_osdc_stop(&client->osdc); 489 destroy_workqueue(fsc->trunc_wq);
673 490
674 /* 491 bdi_destroy(&fsc->backing_dev_info);
675 * make sure mds and osd connections close out before destroying
676 * the auth module, which is needed to free those connections'
677 * ceph_authorizers.
678 */
679 ceph_msgr_flush();
680
681 ceph_monc_stop(&client->monc);
682 492
683 ceph_adjust_min_caps(-client->min_caps); 493 mempool_destroy(fsc->wb_pagevec_pool);
684 494
685 ceph_debugfs_client_cleanup(client); 495 destroy_mount_options(fsc->mount_options);
686 destroy_workqueue(client->wb_wq);
687 destroy_workqueue(client->pg_inv_wq);
688 destroy_workqueue(client->trunc_wq);
689 496
690 bdi_destroy(&client->backing_dev_info); 497 ceph_fs_debugfs_cleanup(fsc);
691 498
692 if (client->msgr) 499 ceph_destroy_client(fsc->client);
693 ceph_messenger_destroy(client->msgr);
694 mempool_destroy(client->wb_pagevec_pool);
695 500
696 destroy_mount_args(client->mount_args); 501 kfree(fsc);
697 502 dout("destroy_fs_client %p done\n", fsc);
698 kfree(client);
699 dout("destroy_client %p done\n", client);
700} 503}
701 504
702/* 505/*
703 * Initially learn our fsid, or verify an fsid matches. 506 * caches
704 */ 507 */
705int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) 508struct kmem_cache *ceph_inode_cachep;
509struct kmem_cache *ceph_cap_cachep;
510struct kmem_cache *ceph_dentry_cachep;
511struct kmem_cache *ceph_file_cachep;
512
513static void ceph_inode_init_once(void *foo)
706{ 514{
707 if (client->have_fsid) { 515 struct ceph_inode_info *ci = foo;
708 if (ceph_fsid_compare(&client->fsid, fsid)) { 516 inode_init_once(&ci->vfs_inode);
709 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, 517}
710 PR_FSID(&client->fsid), PR_FSID(fsid)); 518
711 return -1; 519static int __init init_caches(void)
712 } 520{
713 } else { 521 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
714 pr_info("client%lld fsid " FSID_FORMAT "\n", 522 sizeof(struct ceph_inode_info),
715 client->monc.auth->global_id, PR_FSID(fsid)); 523 __alignof__(struct ceph_inode_info),
716 memcpy(&client->fsid, fsid, sizeof(*fsid)); 524 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
717 ceph_debugfs_client_init(client); 525 ceph_inode_init_once);
718 client->have_fsid = true; 526 if (ceph_inode_cachep == NULL)
719 } 527 return -ENOMEM;
528
529 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
530 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
531 if (ceph_cap_cachep == NULL)
532 goto bad_cap;
533
534 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
535 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
536 if (ceph_dentry_cachep == NULL)
537 goto bad_dentry;
538
539 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
540 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
541 if (ceph_file_cachep == NULL)
542 goto bad_file;
543
720 return 0; 544 return 0;
545
546bad_file:
547 kmem_cache_destroy(ceph_dentry_cachep);
548bad_dentry:
549 kmem_cache_destroy(ceph_cap_cachep);
550bad_cap:
551 kmem_cache_destroy(ceph_inode_cachep);
552 return -ENOMEM;
553}
554
555static void destroy_caches(void)
556{
557 kmem_cache_destroy(ceph_inode_cachep);
558 kmem_cache_destroy(ceph_cap_cachep);
559 kmem_cache_destroy(ceph_dentry_cachep);
560 kmem_cache_destroy(ceph_file_cachep);
721} 561}
722 562
563
723/* 564/*
724 * true if we have the mon map (and have thus joined the cluster) 565 * ceph_umount_begin - initiate forced umount. Tear down the
566 * mount, skipping steps that may hang while waiting for server(s).
725 */ 567 */
726static int have_mon_and_osd_map(struct ceph_client *client) 568static void ceph_umount_begin(struct super_block *sb)
727{ 569{
728 return client->monc.monmap && client->monc.monmap->epoch && 570 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
729 client->osdc.osdmap && client->osdc.osdmap->epoch; 571
572 dout("ceph_umount_begin - starting forced umount\n");
573 if (!fsc)
574 return;
575 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
576 return;
730} 577}
731 578
579static const struct super_operations ceph_super_ops = {
580 .alloc_inode = ceph_alloc_inode,
581 .destroy_inode = ceph_destroy_inode,
582 .write_inode = ceph_write_inode,
583 .sync_fs = ceph_sync_fs,
584 .put_super = ceph_put_super,
585 .show_options = ceph_show_options,
586 .statfs = ceph_statfs,
587 .umount_begin = ceph_umount_begin,
588};
589
732/* 590/*
733 * Bootstrap mount by opening the root directory. Note the mount 591 * Bootstrap mount by opening the root directory. Note the mount
734 * @started time from caller, and time out if this takes too long. 592 * @started time from caller, and time out if this takes too long.
735 */ 593 */
736static struct dentry *open_root_dentry(struct ceph_client *client, 594static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
737 const char *path, 595 const char *path,
738 unsigned long started) 596 unsigned long started)
739{ 597{
740 struct ceph_mds_client *mdsc = &client->mdsc; 598 struct ceph_mds_client *mdsc = fsc->mdsc;
741 struct ceph_mds_request *req = NULL; 599 struct ceph_mds_request *req = NULL;
742 int err; 600 int err;
743 struct dentry *root; 601 struct dentry *root;
@@ -751,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
751 req->r_ino1.ino = CEPH_INO_ROOT; 609 req->r_ino1.ino = CEPH_INO_ROOT;
752 req->r_ino1.snap = CEPH_NOSNAP; 610 req->r_ino1.snap = CEPH_NOSNAP;
753 req->r_started = started; 611 req->r_started = started;
754 req->r_timeout = client->mount_args->mount_timeout * HZ; 612 req->r_timeout = fsc->client->options->mount_timeout * HZ;
755 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 613 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
756 req->r_num_caps = 2; 614 req->r_num_caps = 2;
757 err = ceph_mdsc_do_request(mdsc, NULL, req); 615 err = ceph_mdsc_do_request(mdsc, NULL, req);
758 if (err == 0) { 616 if (err == 0) {
759 dout("open_root_inode success\n"); 617 dout("open_root_inode success\n");
760 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && 618 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
761 client->sb->s_root == NULL) 619 fsc->sb->s_root == NULL)
762 root = d_alloc_root(req->r_target_inode); 620 root = d_alloc_root(req->r_target_inode);
763 else 621 else
764 root = d_obtain_alias(req->r_target_inode); 622 root = d_obtain_alias(req->r_target_inode);
@@ -771,105 +629,84 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
771 return root; 629 return root;
772} 630}
773 631
632
633
634
774/* 635/*
775 * mount: join the ceph cluster, and open root directory. 636 * mount: join the ceph cluster, and open root directory.
776 */ 637 */
777static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, 638static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
778 const char *path) 639 const char *path)
779{ 640{
780 struct ceph_entity_addr *myaddr = NULL;
781 int err; 641 int err;
782 unsigned long timeout = client->mount_args->mount_timeout * HZ;
783 unsigned long started = jiffies; /* note the start time */ 642 unsigned long started = jiffies; /* note the start time */
784 struct dentry *root; 643 struct dentry *root;
644 int first = 0; /* first vfsmount for this super_block */
785 645
786 dout("mount start\n"); 646 dout("mount start\n");
787 mutex_lock(&client->mount_mutex); 647 mutex_lock(&fsc->client->mount_mutex);
788
789 /* initialize the messenger */
790 if (client->msgr == NULL) {
791 if (ceph_test_opt(client, MYIP))
792 myaddr = &client->mount_args->my_addr;
793 client->msgr = ceph_messenger_create(myaddr);
794 if (IS_ERR(client->msgr)) {
795 err = PTR_ERR(client->msgr);
796 client->msgr = NULL;
797 goto out;
798 }
799 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
800 }
801 648
802 /* open session, and wait for mon, mds, and osd maps */ 649 err = __ceph_open_session(fsc->client, started);
803 err = ceph_monc_open_session(&client->monc);
804 if (err < 0) 650 if (err < 0)
805 goto out; 651 goto out;
806 652
807 while (!have_mon_and_osd_map(client)) {
808 err = -EIO;
809 if (timeout && time_after_eq(jiffies, started + timeout))
810 goto out;
811
812 /* wait */
813 dout("mount waiting for mon_map\n");
814 err = wait_event_interruptible_timeout(client->auth_wq,
815 have_mon_and_osd_map(client) || (client->auth_err < 0),
816 timeout);
817 if (err == -EINTR || err == -ERESTARTSYS)
818 goto out;
819 if (client->auth_err < 0) {
820 err = client->auth_err;
821 goto out;
822 }
823 }
824
825 dout("mount opening root\n"); 653 dout("mount opening root\n");
826 root = open_root_dentry(client, "", started); 654 root = open_root_dentry(fsc, "", started);
827 if (IS_ERR(root)) { 655 if (IS_ERR(root)) {
828 err = PTR_ERR(root); 656 err = PTR_ERR(root);
829 goto out; 657 goto out;
830 } 658 }
831 if (client->sb->s_root) 659 if (fsc->sb->s_root) {
832 dput(root); 660 dput(root);
833 else 661 } else {
834 client->sb->s_root = root; 662 fsc->sb->s_root = root;
663 first = 1;
664
665 err = ceph_fs_debugfs_init(fsc);
666 if (err < 0)
667 goto fail;
668 }
835 669
836 if (path[0] == 0) { 670 if (path[0] == 0) {
837 dget(root); 671 dget(root);
838 } else { 672 } else {
839 dout("mount opening base mountpoint\n"); 673 dout("mount opening base mountpoint\n");
840 root = open_root_dentry(client, path, started); 674 root = open_root_dentry(fsc, path, started);
841 if (IS_ERR(root)) { 675 if (IS_ERR(root)) {
842 err = PTR_ERR(root); 676 err = PTR_ERR(root);
843 dput(client->sb->s_root); 677 goto fail;
844 client->sb->s_root = NULL;
845 goto out;
846 } 678 }
847 } 679 }
848 680
849 mnt->mnt_root = root; 681 fsc->mount_state = CEPH_MOUNT_MOUNTED;
850 mnt->mnt_sb = client->sb;
851
852 client->mount_state = CEPH_MOUNT_MOUNTED;
853 dout("mount success\n"); 682 dout("mount success\n");
854 err = 0; 683 mutex_unlock(&fsc->client->mount_mutex);
684 return root;
855 685
856out: 686out:
857 mutex_unlock(&client->mount_mutex); 687 mutex_unlock(&fsc->client->mount_mutex);
858 return err; 688 return ERR_PTR(err);
689
690fail:
691 if (first) {
692 dput(fsc->sb->s_root);
693 fsc->sb->s_root = NULL;
694 }
695 goto out;
859} 696}
860 697
861static int ceph_set_super(struct super_block *s, void *data) 698static int ceph_set_super(struct super_block *s, void *data)
862{ 699{
863 struct ceph_client *client = data; 700 struct ceph_fs_client *fsc = data;
864 int ret; 701 int ret;
865 702
866 dout("set_super %p data %p\n", s, data); 703 dout("set_super %p data %p\n", s, data);
867 704
868 s->s_flags = client->mount_args->sb_flags; 705 s->s_flags = fsc->mount_options->sb_flags;
869 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 706 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
870 707
871 s->s_fs_info = client; 708 s->s_fs_info = fsc;
872 client->sb = s; 709 fsc->sb = s;
873 710
874 s->s_op = &ceph_super_ops; 711 s->s_op = &ceph_super_ops;
875 s->s_export_op = &ceph_export_ops; 712 s->s_export_op = &ceph_export_ops;
@@ -884,7 +721,7 @@ static int ceph_set_super(struct super_block *s, void *data)
884 721
885fail: 722fail:
886 s->s_fs_info = NULL; 723 s->s_fs_info = NULL;
887 client->sb = NULL; 724 fsc->sb = NULL;
888 return ret; 725 return ret;
889} 726}
890 727
@@ -893,30 +730,23 @@ fail:
893 */ 730 */
894static int ceph_compare_super(struct super_block *sb, void *data) 731static int ceph_compare_super(struct super_block *sb, void *data)
895{ 732{
896 struct ceph_client *new = data; 733 struct ceph_fs_client *new = data;
897 struct ceph_mount_args *args = new->mount_args; 734 struct ceph_mount_options *fsopt = new->mount_options;
898 struct ceph_client *other = ceph_sb_to_client(sb); 735 struct ceph_options *opt = new->client->options;
899 int i; 736 struct ceph_fs_client *other = ceph_sb_to_client(sb);
900 737
901 dout("ceph_compare_super %p\n", sb); 738 dout("ceph_compare_super %p\n", sb);
902 if (args->flags & CEPH_OPT_FSID) { 739
903 if (ceph_fsid_compare(&args->fsid, &other->fsid)) { 740 if (compare_mount_options(fsopt, opt, other)) {
904 dout("fsid doesn't match\n"); 741 dout("monitor(s)/mount options don't match\n");
905 return 0; 742 return 0;
906 } 743 }
907 } else { 744 if ((opt->flags & CEPH_OPT_FSID) &&
908 /* do we share (a) monitor? */ 745 ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
909 for (i = 0; i < new->monc.monmap->num_mon; i++) 746 dout("fsid doesn't match\n");
910 if (ceph_monmap_contains(other->monc.monmap, 747 return 0;
911 &new->monc.monmap->mon_inst[i].addr))
912 break;
913 if (i == new->monc.monmap->num_mon) {
914 dout("mon ip not part of monmap\n");
915 return 0;
916 }
917 dout("mon ip matches existing sb %p\n", sb);
918 } 748 }
919 if (args->sb_flags != other->mount_args->sb_flags) { 749 if (fsopt->sb_flags != other->mount_options->sb_flags) {
920 dout("flags differ\n"); 750 dout("flags differ\n");
921 return 0; 751 return 0;
922 } 752 }
@@ -928,98 +758,113 @@ static int ceph_compare_super(struct super_block *sb, void *data)
928 */ 758 */
929static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 759static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
930 760
931static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 761static int ceph_register_bdi(struct super_block *sb,
762 struct ceph_fs_client *fsc)
932{ 763{
933 int err; 764 int err;
934 765
935 /* set ra_pages based on rsize mount option? */ 766 /* set ra_pages based on rsize mount option? */
936 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 767 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
937 client->backing_dev_info.ra_pages = 768 fsc->backing_dev_info.ra_pages =
938 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 769 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
939 >> PAGE_SHIFT; 770 >> PAGE_SHIFT;
940 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", 771 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
941 atomic_long_inc_return(&bdi_seq)); 772 atomic_long_inc_return(&bdi_seq));
942 if (!err) 773 if (!err)
943 sb->s_bdi = &client->backing_dev_info; 774 sb->s_bdi = &fsc->backing_dev_info;
944 return err; 775 return err;
945} 776}
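
ceph_register_bdi() sizes the readahead window from the rsize mount option, rounding up to whole pages: ra_pages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT. A one-line check, assuming 4 KB pages:

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096, page_shift = 12; /* 4 KB pages assumed */
        unsigned long rsize = 524288;                    /* rsize=512k option */
        unsigned long ra_pages = (rsize + page_size - 1) >> page_shift;

        printf("ra_pages = %lu\n", ra_pages);            /* 128 */
        return 0;
    }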
946 777
947static int ceph_get_sb(struct file_system_type *fs_type, 778static struct dentry *ceph_mount(struct file_system_type *fs_type,
948 int flags, const char *dev_name, void *data, 779 int flags, const char *dev_name, void *data)
949 struct vfsmount *mnt)
950{ 780{
951 struct super_block *sb; 781 struct super_block *sb;
952 struct ceph_client *client; 782 struct ceph_fs_client *fsc;
783 struct dentry *res;
953 int err; 784 int err;
954 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 785 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
955 const char *path = NULL; 786 const char *path = NULL;
956 struct ceph_mount_args *args; 787 struct ceph_mount_options *fsopt = NULL;
788 struct ceph_options *opt = NULL;
957 789
958 dout("ceph_get_sb\n"); 790 dout("ceph_mount\n");
959 args = parse_mount_args(flags, data, dev_name, &path); 791 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
960 if (IS_ERR(args)) { 792 if (err < 0) {
961 err = PTR_ERR(args); 793 res = ERR_PTR(err);
962 goto out_final; 794 goto out_final;
963 } 795 }
964 796
965 /* create client (which we may/may not use) */ 797 /* create client (which we may/may not use) */
966 client = ceph_create_client(args); 798 fsc = create_fs_client(fsopt, opt);
967 if (IS_ERR(client)) { 799 if (IS_ERR(fsc)) {
968 err = PTR_ERR(client); 800 res = ERR_CAST(fsc);
801 kfree(fsopt);
802 kfree(opt);
969 goto out_final; 803 goto out_final;
970 } 804 }
971 805
972 if (client->mount_args->flags & CEPH_OPT_NOSHARE) 806 err = ceph_mdsc_init(fsc);
807 if (err < 0) {
808 res = ERR_PTR(err);
809 goto out;
810 }
811
812 if (ceph_test_opt(fsc->client, NOSHARE))
973 compare_super = NULL; 813 compare_super = NULL;
974 sb = sget(fs_type, compare_super, ceph_set_super, client); 814 sb = sget(fs_type, compare_super, ceph_set_super, fsc);
975 if (IS_ERR(sb)) { 815 if (IS_ERR(sb)) {
976 err = PTR_ERR(sb); 816 res = ERR_CAST(sb);
977 goto out; 817 goto out;
978 } 818 }
979 819
980 if (ceph_sb_to_client(sb) != client) { 820 if (ceph_sb_to_client(sb) != fsc) {
981 ceph_destroy_client(client); 821 ceph_mdsc_destroy(fsc);
982 client = ceph_sb_to_client(sb); 822 destroy_fs_client(fsc);
983 dout("get_sb got existing client %p\n", client); 823 fsc = ceph_sb_to_client(sb);
824 dout("get_sb got existing client %p\n", fsc);
984 } else { 825 } else {
985 dout("get_sb using new client %p\n", client); 826 dout("get_sb using new client %p\n", fsc);
986 err = ceph_register_bdi(sb, client); 827 err = ceph_register_bdi(sb, fsc);
987 if (err < 0) 828 if (err < 0) {
829 res = ERR_PTR(err);
988 goto out_splat; 830 goto out_splat;
831 }
989 } 832 }
990 833
991 err = ceph_mount(client, mnt, path); 834 res = ceph_real_mount(fsc, path);
992 if (err < 0) 835 if (IS_ERR(res))
993 goto out_splat; 836 goto out_splat;
994 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, 837 dout("root %p inode %p ino %llx.%llx\n", res,
995 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode)); 838 res->d_inode, ceph_vinop(res->d_inode));
996 return 0; 839 return res;
997 840
998out_splat: 841out_splat:
999 ceph_mdsc_close_sessions(&client->mdsc); 842 ceph_mdsc_close_sessions(fsc->mdsc);
1000 deactivate_locked_super(sb); 843 deactivate_locked_super(sb);
1001 goto out_final; 844 goto out_final;
1002 845
1003out: 846out:
1004 ceph_destroy_client(client); 847 ceph_mdsc_destroy(fsc);
848 destroy_fs_client(fsc);
1005out_final: 849out_final:
1006 dout("ceph_get_sb fail %d\n", err); 850 dout("ceph_mount fail %ld\n", PTR_ERR(res));
1007 return err; 851 return res;
1008} 852}
1009 853
1010static void ceph_kill_sb(struct super_block *s) 854static void ceph_kill_sb(struct super_block *s)
1011{ 855{
1012 struct ceph_client *client = ceph_sb_to_client(s); 856 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1013 dout("kill_sb %p\n", s); 857 dout("kill_sb %p\n", s);
1014 ceph_mdsc_pre_umount(&client->mdsc); 858 ceph_mdsc_pre_umount(fsc->mdsc);
1015 kill_anon_super(s); /* will call put_super after sb is r/o */ 859 kill_anon_super(s); /* will call put_super after sb is r/o */
1016 ceph_destroy_client(client); 860 ceph_mdsc_destroy(fsc);
861 destroy_fs_client(fsc);
1017} 862}
1018 863
1019static struct file_system_type ceph_fs_type = { 864static struct file_system_type ceph_fs_type = {
1020 .owner = THIS_MODULE, 865 .owner = THIS_MODULE,
1021 .name = "ceph", 866 .name = "ceph",
1022 .get_sb = ceph_get_sb, 867 .mount = ceph_mount,
1023 .kill_sb = ceph_kill_sb, 868 .kill_sb = ceph_kill_sb,
1024 .fs_flags = FS_RENAME_DOES_D_MOVE, 869 .fs_flags = FS_RENAME_DOES_D_MOVE,
1025}; 870};
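
The .get_sb to .mount switch above is the generic VFS interface change of this period: instead of filling in a caller-provided vfsmount and returning an int, the filesystem hands back its root dentry (or an ERR_PTR) and the VFS builds the mount. The two signatures side by side, as a compile-only sketch with forward-declared types:

    struct file_system_type;
    struct vfsmount;
    struct dentry;

    /* old interface: fill in the caller's vfsmount, return 0 or -errno */
    extern int example_get_sb(struct file_system_type *fst, int flags,
                              const char *dev_name, void *data,
                              struct vfsmount *mnt);

    /* new interface: return the root dentry, or ERR_PTR(-errno) */
    extern struct dentry *example_mount(struct file_system_type *fst, int flags,
                                        const char *dev_name, void *data);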
@@ -1029,38 +874,20 @@ static struct file_system_type ceph_fs_type = {
1029 874
1030static int __init init_ceph(void) 875static int __init init_ceph(void)
1031{ 876{
1032 int ret = 0; 877 int ret = init_caches();
1033
1034 ret = ceph_debugfs_init();
1035 if (ret < 0)
1036 goto out;
1037
1038 ret = ceph_msgr_init();
1039 if (ret < 0)
1040 goto out_debugfs;
1041
1042 ret = init_caches();
1043 if (ret) 878 if (ret)
1044 goto out_msgr; 879 goto out;
1045
1046 ceph_caps_init();
1047 880
1048 ret = register_filesystem(&ceph_fs_type); 881 ret = register_filesystem(&ceph_fs_type);
1049 if (ret) 882 if (ret)
1050 goto out_icache; 883 goto out_icache;
1051 884
1052 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", 885 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1053 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, 886
1054 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1055 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1056 return 0; 887 return 0;
1057 888
1058out_icache: 889out_icache:
1059 destroy_caches(); 890 destroy_caches();
1060out_msgr:
1061 ceph_msgr_exit();
1062out_debugfs:
1063 ceph_debugfs_cleanup();
1064out: 891out:
1065 return ret; 892 return ret;
1066} 893}
@@ -1069,10 +896,7 @@ static void __exit exit_ceph(void)
1069{ 896{
1070 dout("exit_ceph\n"); 897 dout("exit_ceph\n");
1071 unregister_filesystem(&ceph_fs_type); 898 unregister_filesystem(&ceph_fs_type);
1072 ceph_caps_finalize();
1073 destroy_caches(); 899 destroy_caches();
1074 ceph_msgr_exit();
1075 ceph_debugfs_cleanup();
1076} 900}
1077 901
1078module_init(init_ceph); 902module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 10a4a406e88..1886294e12f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
1#ifndef _FS_CEPH_SUPER_H 1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H 2#define _FS_CEPH_SUPER_H
3 3
4#include "ceph_debug.h" 4#include <linux/ceph/ceph_debug.h>
5 5
6#include <asm/unaligned.h> 6#include <asm/unaligned.h>
7#include <linux/backing-dev.h> 7#include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include "types.h" 17#include <linux/ceph/libceph.h>
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24 18
25/* f_type in struct statfs */ 19/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400 20#define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,36 +24,25 @@
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ 24#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32 26
33/* 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
34 * mount options 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
35 */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
36#define CEPH_OPT_FSID (1<<0)
37#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
38#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
39#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
40#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
41#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
42#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
43 30
44#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) 31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
45 32
46#define ceph_set_opt(client, opt) \ 33#define ceph_set_mount_opt(fsc, opt) \
47 (client)->mount_args->flags |= CEPH_OPT_##opt; 34 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
48#define ceph_test_opt(client, opt) \ 35#define ceph_test_mount_opt(fsc, opt) \
49 (!!((client)->mount_args->flags & CEPH_OPT_##opt)) 36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
50 37
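
(Editor's illustration, not part of the patch: how a caller might use the
renamed option macros above; "sb" is a hypothetical superblock pointer.)

	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);

	ceph_set_mount_opt(fsc, DIRSTAT);
	if (ceph_test_mount_opt(fsc, RBYTES))
		dout("dir st_size will report recursive rbytes\n");
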
38#define CEPH_MAX_READDIR_DEFAULT 1024
39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
40#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
51 41
52struct ceph_mount_args { 42struct ceph_mount_options {
53 int sb_flags;
54 int flags; 43 int flags;
55 struct ceph_fsid fsid; 44 int sb_flags;
56 struct ceph_entity_addr my_addr; 45
57 int num_mon;
58 struct ceph_entity_addr *mon_addr;
59 int mount_timeout;
60 int osd_idle_ttl;
61 int osd_timeout;
62 int osd_keepalive_timeout;
63 int wsize; 46 int wsize;
64 int rsize; /* max readahead */ 47 int rsize; /* max readahead */
65 int congestion_kb; /* max writeback in flight */ 48 int congestion_kb; /* max writeback in flight */
@@ -67,82 +50,25 @@ struct ceph_mount_args {
67 int cap_release_safety; 50 int cap_release_safety;
 68 int max_readdir; /* max readdir result (entries) */ 51 int max_readdir; /* max readdir result (entries) */
69 int max_readdir_bytes; /* max readdir result (bytes) */ 52 int max_readdir_bytes; /* max readdir result (bytes) */
70 char *snapdir_name; /* default ".snap" */
71 char *name;
72 char *secret;
73};
74
75/*
76 * defaults
77 */
78#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
79#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
83#define CEPH_MAX_READDIR_DEFAULT 1024
84#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
85
86#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
87#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
88
89#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
90#define CEPH_AUTH_NAME_DEFAULT "guest"
91/*
92 * Delay telling the MDS we no longer want caps, in case we reopen
93 * the file. Delay a minimum amount of time, even if we send a cap
 94 * message for some other reason. Otherwise, take the opportunity to
95 * update the mds to avoid sending another message later.
96 */
97#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
98#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
99
100#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
101
102/* mount state */
103enum {
104 CEPH_MOUNT_MOUNTING,
105 CEPH_MOUNT_MOUNTED,
106 CEPH_MOUNT_UNMOUNTING,
107 CEPH_MOUNT_UNMOUNTED,
108 CEPH_MOUNT_SHUTDOWN,
109};
110 53
111/* 54 /*
112 * subtract jiffies 55 * everything above this point can be memcmp'd; everything below
113 */ 56 * is handled in compare_mount_options()
114static inline unsigned long time_sub(unsigned long a, unsigned long b) 57 */
115{
116 BUG_ON(time_after(b, a));
117 return (long)a - (long)b;
118}
119
120/*
121 * per-filesystem client state
122 *
123 * possibly shared by multiple mount points, if they are
124 * mounting the same ceph filesystem/cluster.
125 */
126struct ceph_client {
127 struct ceph_fsid fsid;
128 bool have_fsid;
129 58
130 struct mutex mount_mutex; /* serialize mount attempts */ 59 char *snapdir_name; /* default ".snap" */
131 struct ceph_mount_args *mount_args; 60};
132 61
62struct ceph_fs_client {
133 struct super_block *sb; 63 struct super_block *sb;
134 64
135 unsigned long mount_state; 65 struct ceph_mount_options *mount_options;
136 wait_queue_head_t auth_wq; 66 struct ceph_client *client;
137
138 int auth_err;
139 67
68 unsigned long mount_state;
140 int min_caps; /* min caps i added */ 69 int min_caps; /* min caps i added */
141 70
142 struct ceph_messenger *msgr; /* messenger instance */ 71 struct ceph_mds_client *mdsc;
143 struct ceph_mon_client monc;
144 struct ceph_mds_client mdsc;
145 struct ceph_osd_client osdc;
146 72
147 /* writeback */ 73 /* writeback */
148 mempool_t *wb_pagevec_pool; 74 mempool_t *wb_pagevec_pool;
@@ -154,14 +80,14 @@ struct ceph_client {
154 struct backing_dev_info backing_dev_info; 80 struct backing_dev_info backing_dev_info;
155 81
156#ifdef CONFIG_DEBUG_FS 82#ifdef CONFIG_DEBUG_FS
157 struct dentry *debugfs_monmap; 83 struct dentry *debugfs_dentry_lru, *debugfs_caps;
158 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
159 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
160 struct dentry *debugfs_congestion_kb; 84 struct dentry *debugfs_congestion_kb;
161 struct dentry *debugfs_bdi; 85 struct dentry *debugfs_bdi;
86 struct dentry *debugfs_mdsc, *debugfs_mdsmap;
162#endif 87#endif
163}; 88};
164 89
90
165/* 91/*
166 * File i/o capability. This tracks shared state with the metadata 92 * File i/o capability. This tracks shared state with the metadata
167 * server that allows us to cache or writeback attributes or to read 93 * server that allows us to cache or writeback attributes or to read
@@ -210,8 +136,7 @@ struct ceph_cap_snap {
210 uid_t uid; 136 uid_t uid;
211 gid_t gid; 137 gid_t gid;
212 138
213 void *xattr_blob; 139 struct ceph_buffer *xattr_blob;
214 int xattr_len;
215 u64 xattr_version; 140 u64 xattr_version;
216 141
217 u64 size; 142 u64 size;
@@ -223,8 +148,11 @@ struct ceph_cap_snap {
223 148
224static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) 149static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
225{ 150{
226 if (atomic_dec_and_test(&capsnap->nref)) 151 if (atomic_dec_and_test(&capsnap->nref)) {
152 if (capsnap->xattr_blob)
153 ceph_buffer_put(capsnap->xattr_blob);
227 kfree(capsnap); 154 kfree(capsnap);
155 }
228} 156}
229 157
230/* 158/*
@@ -267,6 +195,20 @@ struct ceph_inode_xattr {
267 int should_free_val; 195 int should_free_val;
268}; 196};
269 197
198/*
199 * Ceph dentry state
200 */
201struct ceph_dentry_info {
202 struct ceph_mds_session *lease_session;
203 u32 lease_gen, lease_shared_gen;
204 u32 lease_seq;
205 unsigned long lease_renew_after, lease_renew_from;
206 struct list_head lru;
207 struct dentry *dentry;
208 u64 time;
209 u64 offset;
210};
211
270struct ceph_inode_xattrs_info { 212struct ceph_inode_xattrs_info {
271 /* 213 /*
272 * (still encoded) xattr blob. we avoid the overhead of parsing 214 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -288,11 +230,6 @@ struct ceph_inode_xattrs_info {
288/* 230/*
289 * Ceph inode. 231 * Ceph inode.
290 */ 232 */
291#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
292#define CEPH_I_NODELAY 4 /* do not delay cap release */
293#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
294#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
295
296struct ceph_inode_info { 233struct ceph_inode_info {
297 struct ceph_vino i_vino; /* ceph ino + snap */ 234 struct ceph_vino i_vino; /* ceph ino + snap */
298 235
@@ -336,7 +273,8 @@ struct ceph_inode_info {
336 unsigned i_cap_exporting_issued; 273 unsigned i_cap_exporting_issued;
337 struct ceph_cap_reservation i_cap_migration_resv; 274 struct ceph_cap_reservation i_cap_migration_resv;
338 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 275 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
339 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ 276 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
277 dirty|flushing caps */
340 unsigned i_snap_caps; /* cap bits for snapped files */ 278 unsigned i_snap_caps; /* cap bits for snapped files */
341 279
342 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 280 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
@@ -382,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
382 return container_of(inode, struct ceph_inode_info, vfs_inode); 320 return container_of(inode, struct ceph_inode_info, vfs_inode);
383} 321}
384 322
323static inline struct ceph_vino ceph_vino(struct inode *inode)
324{
325 return ceph_inode(inode)->i_vino;
326}
327
328/*
329 * ino_t is <64 bits on many architectures, blech.
330 *
331 * don't include snap in ino hash, at least for now.
332 */
333static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
334{
335 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
336#if BITS_PER_LONG == 32
337 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
338 if (!ino)
339 ino = 1;
340#endif
341 return ino;
342}
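
(Editor's worked example, assuming a 32-bit ino_t: vino.ino ==
0x0000000123456789 yields ino = 0x23456789 ^ 0x1 == 0x23456788; the
XOR folds the truncated high 32 bits into the result, and the !ino
check keeps 0 from escaping as an inode number.)
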
343
344/* for printf-style formatting */
345#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
346
347static inline u64 ceph_ino(struct inode *inode)
348{
349 return ceph_inode(inode)->i_vino.ino;
350}
351static inline u64 ceph_snap(struct inode *inode)
352{
353 return ceph_inode(inode)->i_vino.snap;
354}
355
356static inline int ceph_ino_compare(struct inode *inode, void *data)
357{
358 struct ceph_vino *pvino = (struct ceph_vino *)data;
359 struct ceph_inode_info *ci = ceph_inode(inode);
360 return ci->i_vino.ino == pvino->ino &&
361 ci->i_vino.snap == pvino->snap;
362}
363
364static inline struct inode *ceph_find_inode(struct super_block *sb,
365 struct ceph_vino vino)
366{
367 ino_t t = ceph_vino_to_ino(vino);
368 return ilookup5(sb, t, ceph_ino_compare, &vino);
369}
370
371
372/*
373 * Ceph inode.
374 */
375#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
376#define CEPH_I_NODELAY 4 /* do not delay cap release */
377#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
378#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
379
385static inline void ceph_i_clear(struct inode *inode, unsigned mask) 380static inline void ceph_i_clear(struct inode *inode, unsigned mask)
386{ 381{
387 struct ceph_inode_info *ci = ceph_inode(inode); 382 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -405,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
405 struct ceph_inode_info *ci = ceph_inode(inode); 400 struct ceph_inode_info *ci = ceph_inode(inode);
406 bool r; 401 bool r;
407 402
408 smp_mb(); 403 spin_lock(&inode->i_lock);
409 r = (ci->i_ceph_flags & mask) == mask; 404 r = (ci->i_ceph_flags & mask) == mask;
405 spin_unlock(&inode->i_lock);
410 return r; 406 return r;
411} 407}
412 408
@@ -423,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
423 struct ceph_inode_frag *pfrag, 419 struct ceph_inode_frag *pfrag,
424 int *found); 420 int *found);
425 421
426/*
427 * Ceph dentry state
428 */
429struct ceph_dentry_info {
430 struct ceph_mds_session *lease_session;
431 u32 lease_gen, lease_shared_gen;
432 u32 lease_seq;
433 unsigned long lease_renew_after, lease_renew_from;
434 struct list_head lru;
435 struct dentry *dentry;
436 u64 time;
437 u64 offset;
438};
439
440static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) 422static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
441{ 423{
442 return (struct ceph_dentry_info *)dentry->d_fsdata; 424 return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -447,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
447 return ((loff_t)frag << 32) | (loff_t)off; 429 return ((loff_t)frag << 32) | (loff_t)off;
448} 430}
449 431
450/*
451 * ino_t is <64 bits on many architectures, blech.
452 *
453 * don't include snap in ino hash, at least for now.
454 */
455static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
456{
457 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
458#if BITS_PER_LONG == 32
459 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
460 if (!ino)
461 ino = 1;
462#endif
463 return ino;
464}
465
466static inline int ceph_set_ino_cb(struct inode *inode, void *data) 432static inline int ceph_set_ino_cb(struct inode *inode, void *data)
467{ 433{
468 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; 434 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -470,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
470 return 0; 436 return 0;
471} 437}
472 438
473static inline struct ceph_vino ceph_vino(struct inode *inode)
474{
475 return ceph_inode(inode)->i_vino;
476}
477
478/* for printf-style formatting */
479#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
480
481static inline u64 ceph_ino(struct inode *inode)
482{
483 return ceph_inode(inode)->i_vino.ino;
484}
485static inline u64 ceph_snap(struct inode *inode)
486{
487 return ceph_inode(inode)->i_vino.snap;
488}
489
490static inline int ceph_ino_compare(struct inode *inode, void *data)
491{
492 struct ceph_vino *pvino = (struct ceph_vino *)data;
493 struct ceph_inode_info *ci = ceph_inode(inode);
494 return ci->i_vino.ino == pvino->ino &&
495 ci->i_vino.snap == pvino->snap;
496}
497
498static inline struct inode *ceph_find_inode(struct super_block *sb,
499 struct ceph_vino vino)
500{
501 ino_t t = ceph_vino_to_ino(vino);
502 return ilookup5(sb, t, ceph_ino_compare, &vino);
503}
504
505
506/* 439/*
507 * caps helpers 440 * caps helpers
508 */ 441 */
@@ -560,23 +493,25 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
560/* what the mds thinks we want */ 493/* what the mds thinks we want */
561extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); 494extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
562 495
563extern void ceph_caps_init(void); 496extern void ceph_caps_init(struct ceph_mds_client *mdsc);
564extern void ceph_caps_finalize(void); 497extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
565extern void ceph_adjust_min_caps(int delta); 498extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
566extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); 499extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
567extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); 500 struct ceph_cap_reservation *ctx, int need);
568extern void ceph_reservation_status(struct ceph_client *client, 501extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
502 struct ceph_cap_reservation *ctx);
503extern void ceph_reservation_status(struct ceph_fs_client *client,
569 int *total, int *avail, int *used, 504 int *total, int *avail, int *used,
570 int *reserved, int *min); 505 int *reserved, int *min);
571 506
572static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) 507static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
573{ 508{
574 return (struct ceph_client *)inode->i_sb->s_fs_info; 509 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
575} 510}
576 511
577static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) 512static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
578{ 513{
579 return (struct ceph_client *)sb->s_fs_info; 514 return (struct ceph_fs_client *)sb->s_fs_info;
580} 515}
581 516
582 517
@@ -606,51 +541,6 @@ struct ceph_file_info {
606 541
607 542
608/* 543/*
609 * snapshots
610 */
611
612/*
613 * A "snap context" is the set of existing snapshots when we
614 * write data. It is used by the OSD to guide its COW behavior.
615 *
616 * The ceph_snap_context is refcounted, and attached to each dirty
617 * page, indicating which context the dirty data belonged when it was
618 * dirtied.
619 */
620struct ceph_snap_context {
621 atomic_t nref;
622 u64 seq;
623 int num_snaps;
624 u64 snaps[];
625};
626
627static inline struct ceph_snap_context *
628ceph_get_snap_context(struct ceph_snap_context *sc)
629{
630 /*
631 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
632 atomic_read(&sc->nref)+1);
633 */
634 if (sc)
635 atomic_inc(&sc->nref);
636 return sc;
637}
638
639static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
640{
641 if (!sc)
642 return;
643 /*
644 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
645 atomic_read(&sc->nref)-1);
646 */
647 if (atomic_dec_and_test(&sc->nref)) {
648 /*printk(" deleting snap_context %p\n", sc);*/
649 kfree(sc);
650 }
651}
652
653/*
654 * A "snap realm" describes a subset of the file hierarchy sharing 544 * A "snap realm" describes a subset of the file hierarchy sharing
655 * the same set of snapshots that apply to it. The realms themselves 545 * the same set of snapshots that apply to it. The realms themselves
656 * are organized into a hierarchy, such that children inherit (some of) 546 * are organized into a hierarchy, such that children inherit (some of)
@@ -679,6 +569,8 @@ struct ceph_snap_realm {
679 569
680 struct list_head empty_item; /* if i have ref==0 */ 570 struct list_head empty_item; /* if i have ref==0 */
681 571
572 struct list_head dirty_item; /* if realm needs new context */
573
682 /* the current set of snaps for this realm */ 574 /* the current set of snaps for this realm */
683 struct ceph_snap_context *cached_context; 575 struct ceph_snap_context *cached_context;
684 576
@@ -686,16 +578,33 @@ struct ceph_snap_realm {
686 spinlock_t inodes_with_caps_lock; 578 spinlock_t inodes_with_caps_lock;
687}; 579};
688 580
689 581static inline int default_congestion_kb(void)
690
691/*
692 * calculate the number of pages a given length and offset map onto,
693 * if we align the data.
694 */
695static inline int calc_pages_for(u64 off, u64 len)
696{ 582{
697 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - 583 int congestion_kb;
698 (off >> PAGE_CACHE_SHIFT); 584
585 /*
586 * Copied from NFS
587 *
588 * congestion size, scale with available memory.
589 *
590 * 64MB: 8192k
591 * 128MB: 11585k
592 * 256MB: 16384k
593 * 512MB: 23170k
594 * 1GB: 32768k
595 * 2GB: 46340k
596 * 4GB: 65536k
597 * 8GB: 92681k
598 * 16GB: 131072k
599 *
600 * This allows larger machines to have larger/more transfers.
601 * Limit the default to 256M
602 */
603 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
604 if (congestion_kb > 256*1024)
605 congestion_kb = 256*1024;
606
607 return congestion_kb;
699} 608}
700 609
701 610
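
(Editor's sanity check on the table above, assuming 4 KiB pages so
PAGE_SHIFT - 10 == 2: a 1 GiB machine has totalram_pages == 262144,
int_sqrt() returns 512, and (16 * 512) << 2 == 32768k, matching the
1GB row; the 256M clamp only takes effect above 64 GiB of RAM.)
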
@@ -728,23 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
728 ci_item)->writing; 637 ci_item)->writing;
729} 638}
730 639
731
732/* super.c */
733extern struct kmem_cache *ceph_inode_cachep;
734extern struct kmem_cache *ceph_cap_cachep;
735extern struct kmem_cache *ceph_dentry_cachep;
736extern struct kmem_cache *ceph_file_cachep;
737
738extern const char *ceph_msg_type_name(int type);
739extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
740
741#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
742 "%02x%02x%02x%02x%02x%02x"
743#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
744 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
745 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
746 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
747
748/* inode.c */ 640/* inode.c */
749extern const struct inode_operations ceph_file_iops; 641extern const struct inode_operations ceph_file_iops;
750 642
@@ -806,20 +698,24 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
806 __ceph_remove_cap(cap); 698 __ceph_remove_cap(cap);
807 spin_unlock(&inode->i_lock); 699 spin_unlock(&inode->i_lock);
808} 700}
809extern void ceph_put_cap(struct ceph_cap *cap); 701extern void ceph_put_cap(struct ceph_mds_client *mdsc,
702 struct ceph_cap *cap);
810 703
811extern void ceph_queue_caps_release(struct inode *inode); 704extern void ceph_queue_caps_release(struct inode *inode);
812extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 705extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
813extern int ceph_fsync(struct file *file, int datasync); 706extern int ceph_fsync(struct file *file, int datasync);
814extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 707extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
815 struct ceph_mds_session *session); 708 struct ceph_mds_session *session);
709extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
710 int mds);
816extern int ceph_get_cap_mds(struct inode *inode); 711extern int ceph_get_cap_mds(struct inode *inode);
817extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); 712extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
818extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); 713extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
819extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 714extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
820 struct ceph_snap_context *snapc); 715 struct ceph_snap_context *snapc);
821extern void __ceph_flush_snaps(struct ceph_inode_info *ci, 716extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
822 struct ceph_mds_session **psession); 717 struct ceph_mds_session **psession,
718 int again);
823extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, 719extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
824 struct ceph_mds_session *session); 720 struct ceph_mds_session *session);
825extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); 721extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -847,17 +743,23 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
847/* file.c */ 743/* file.c */
848extern const struct file_operations ceph_file_fops; 744extern const struct file_operations ceph_file_fops;
849extern const struct address_space_operations ceph_aops; 745extern const struct address_space_operations ceph_aops;
746extern int ceph_copy_to_page_vector(struct page **pages,
747 const char *data,
748 loff_t off, size_t len);
749extern int ceph_copy_from_page_vector(struct page **pages,
750 char *data,
751 loff_t off, size_t len);
752extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
850extern int ceph_open(struct inode *inode, struct file *file); 753extern int ceph_open(struct inode *inode, struct file *file);
851extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, 754extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
852 struct nameidata *nd, int mode, 755 struct nameidata *nd, int mode,
853 int locked_dir); 756 int locked_dir);
854extern int ceph_release(struct inode *inode, struct file *filp); 757extern int ceph_release(struct inode *inode, struct file *filp);
855extern void ceph_release_page_vector(struct page **pages, int num_pages);
856 758
857/* dir.c */ 759/* dir.c */
858extern const struct file_operations ceph_dir_fops; 760extern const struct file_operations ceph_dir_fops;
859extern const struct inode_operations ceph_dir_iops; 761extern const struct inode_operations ceph_dir_iops;
860extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 762extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
861 ceph_snapdir_dentry_ops; 763 ceph_snapdir_dentry_ops;
862 764
863extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 765extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
@@ -882,11 +784,13 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
882/* export.c */ 784/* export.c */
883extern const struct export_operations ceph_export_ops; 785extern const struct export_operations ceph_export_ops;
884 786
885/* debugfs.c */ 787/* locks.c */
886extern int ceph_debugfs_init(void); 788extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
887extern void ceph_debugfs_cleanup(void); 789extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
888extern int ceph_debugfs_client_init(struct ceph_client *client); 790extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
889extern void ceph_debugfs_client_cleanup(struct ceph_client *client); 791extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
792 int p_locks, int f_locks);
793extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
890 794
891static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) 795static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
892{ 796{
@@ -896,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
896 return NULL; 800 return NULL;
897} 801}
898 802
803/* debugfs.c */
804extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
805extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
806
899#endif /* _FS_CEPH_SUPER_H */ 807#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
deleted file mode 100644
index 28b35a005ec..00000000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 68aeebc6968..6e12a6ba5f7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2
2#include "super.h" 3#include "super.h"
3#include "decode.h" 4#include "mds_client.h"
5
6#include <linux/ceph/decode.h>
4 7
5#include <linux/xattr.h> 8#include <linux/xattr.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
@@ -337,6 +340,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
337} 340}
338 341
339static int __build_xattrs(struct inode *inode) 342static int __build_xattrs(struct inode *inode)
343 __releases(inode->i_lock)
344 __acquires(inode->i_lock)
340{ 345{
341 u32 namelen; 346 u32 namelen;
342 u32 numattr = 0; 347 u32 numattr = 0;
@@ -483,6 +488,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
483 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; 488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
484 ci->i_xattrs.prealloc_blob = NULL; 489 ci->i_xattrs.prealloc_blob = NULL;
485 ci->i_xattrs.dirty = false; 490 ci->i_xattrs.dirty = false;
491 ci->i_xattrs.version++;
486 } 492 }
487} 493}
488 494
@@ -617,12 +623,12 @@ out:
617static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 623static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
618 const char *value, size_t size, int flags) 624 const char *value, size_t size, int flags)
619{ 625{
620 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 626 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
621 struct inode *inode = dentry->d_inode; 627 struct inode *inode = dentry->d_inode;
622 struct ceph_inode_info *ci = ceph_inode(inode); 628 struct ceph_inode_info *ci = ceph_inode(inode);
623 struct inode *parent_inode = dentry->d_parent->d_inode; 629 struct inode *parent_inode = dentry->d_parent->d_inode;
624 struct ceph_mds_request *req; 630 struct ceph_mds_request *req;
625 struct ceph_mds_client *mdsc = &client->mdsc; 631 struct ceph_mds_client *mdsc = fsc->mdsc;
626 int err; 632 int err;
627 int i, nr_pages; 633 int i, nr_pages;
628 struct page **pages = NULL; 634 struct page **pages = NULL;
@@ -710,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
710 716
711 /* preallocate memory for xattr name, value, index node */ 717 /* preallocate memory for xattr name, value, index node */
712 err = -ENOMEM; 718 err = -ENOMEM;
713 newname = kmalloc(name_len + 1, GFP_NOFS); 719 newname = kmemdup(name, name_len + 1, GFP_NOFS);
714 if (!newname) 720 if (!newname)
715 goto out; 721 goto out;
716 memcpy(newname, name, name_len + 1);
717 722
718 if (val_len) { 723 if (val_len) {
719 newval = kmalloc(val_len + 1, GFP_NOFS); 724 newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -774,8 +779,8 @@ out:
774 779
775static int ceph_send_removexattr(struct dentry *dentry, const char *name) 780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
776{ 781{
777 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 782 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
778 struct ceph_mds_client *mdsc = &client->mdsc; 783 struct ceph_mds_client *mdsc = fsc->mdsc;
779 struct inode *inode = dentry->d_inode; 784 struct inode *inode = dentry->d_inode;
780 struct inode *parent_inode = dentry->d_parent->d_inode; 785 struct inode *parent_inode = dentry->d_parent->d_inode;
781 struct ceph_mds_request *req; 786 struct ceph_mds_request *req;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index d6db933df2b..e5b9df993b9 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -20,6 +20,7 @@
20#include <linux/cdev.h> 20#include <linux/cdev.h>
21#include <linux/mutex.h> 21#include <linux/mutex.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/tty.h>
23 24
24#include "internal.h" 25#include "internal.h"
25 26
@@ -39,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = {
39#endif 40#endif
40 /* permit direct mmap, for read, write or exec */ 41 /* permit direct mmap, for read, write or exec */
41 BDI_CAP_MAP_DIRECT | 42 BDI_CAP_MAP_DIRECT |
42 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), 43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
44 /* no writeback happens */
45 BDI_CAP_NO_ACCT_AND_WRITEBACK),
43}; 46};
44 47
45static struct kobj_map *cdev_map; 48static struct kobj_map *cdev_map;
@@ -453,6 +456,7 @@ static void cdev_purge(struct cdev *cdev)
453 */ 456 */
454const struct file_operations def_chr_fops = { 457const struct file_operations def_chr_fops = {
455 .open = chrdev_open, 458 .open = chrdev_open,
459 .llseek = noop_llseek,
456}; 460};
457 461
458static struct kobject *exact_match(dev_t dev, int *part, void *data) 462static struct kobject *exact_match(dev_t dev, int *part, void *data)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 80f35259680..0ed213970ce 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,7 +2,9 @@ config CIFS
2 tristate "CIFS support (advanced network filesystem, SMBFS successor)" 2 tristate "CIFS support (advanced network filesystem, SMBFS successor)"
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select SLOW_WORK 5 select CRYPTO
6 select CRYPTO_MD5
7 select CRYPTO_ARC4
6 help 8 help
7 This is the client VFS module for the Common Internet File System 9 This is the client VFS module for the Common Internet File System
8 (CIFS) protocol which is the successor to the Server Message Block 10 (CIFS) protocol which is the successor to the Server Message Block
@@ -71,14 +73,14 @@ config CIFS_WEAK_PW_HASH
71 If unsure, say N. 73 If unsure, say N.
72 74
73config CIFS_UPCALL 75config CIFS_UPCALL
74 bool "Kerberos/SPNEGO advanced session setup" 76 bool "Kerberos/SPNEGO advanced session setup"
75 depends on CIFS && KEYS 77 depends on CIFS && KEYS
76 help 78 select DNS_RESOLVER
77 Enables an upcall mechanism for CIFS which accesses 79 help
78 userspace helper utilities to provide SPNEGO packaged (RFC 4178) 80 Enables an upcall mechanism for CIFS which accesses userspace helper
79 Kerberos tickets which are needed to mount to certain secure servers 81 utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
80 (for which more secure Kerberos authentication is required). If 82 which are needed to mount to certain secure servers (for which more
81 unsure, say N. 83 secure Kerberos authentication is required). If unsure, say N.
82 84
83config CIFS_XATTR 85config CIFS_XATTR
84 bool "CIFS extended attributes" 86 bool "CIFS extended attributes"
@@ -122,6 +124,7 @@ config CIFS_DEBUG2
122config CIFS_DFS_UPCALL 124config CIFS_DFS_UPCALL
123 bool "DFS feature support" 125 bool "DFS feature support"
124 depends on CIFS && KEYS 126 depends on CIFS && KEYS
127 select DNS_RESOLVER
125 help 128 help
126 Distributed File System (DFS) support is used to access shares 129 Distributed File System (DFS) support is used to access shares
127 transparently in an enterprise name space, even if the share 130 transparently in an enterprise name space, even if the share
@@ -131,6 +134,15 @@ config CIFS_DFS_UPCALL
131 IP addresses) which is needed for implicit mounts of DFS junction 134 IP addresses) which is needed for implicit mounts of DFS junction
132 points. If unsure, say N. 135 points. If unsure, say N.
133 136
137config CIFS_FSCACHE
138 bool "Provide CIFS client caching support (EXPERIMENTAL)"
139 depends on EXPERIMENTAL
140 depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
141 help
142 Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
143 to be cached locally on disk through the general filesystem cache
144 manager. If unsure, say N.
145
134config CIFS_EXPERIMENTAL 146config CIFS_EXPERIMENTAL
135 bool "CIFS Experimental Features (EXPERIMENTAL)" 147 bool "CIFS Experimental Features (EXPERIMENTAL)"
136 depends on CIFS && EXPERIMENTAL 148 depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 9948c0030e8..adefa60a9bd 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -11,3 +11,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
12 12
13cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o 13cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
14
15cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
diff --git a/fs/cifs/README b/fs/cifs/README
index a727b7cb075..ee68d103654 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -301,6 +301,16 @@ A partial list of the supported mount options follows:
301 gid Set the default gid for inodes (similar to above). 301 gid Set the default gid for inodes (similar to above).
302 file_mode If CIFS Unix extensions are not supported by the server 302 file_mode If CIFS Unix extensions are not supported by the server
303 this overrides the default mode for file inodes. 303 this overrides the default mode for file inodes.
304 fsc Enable local disk caching using FS-Cache (off by default). This
305 option could be useful to improve performance on a slow link,
306 heavily loaded server and/or network where reading from the
307 disk is faster than reading from the server (over the network).
308 This could also impact scalability positively as the
 309 number of calls to the server is reduced. However, local
 310 caching is not suitable for all workloads, e.g. read-once
 311 type workloads, so consider your workload/scenario carefully
 312 before using this option. Currently, local
313 disk caching is functional for CIFS files opened as read-only.
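 An illustrative invocation (hypothetical server and share names;
 FS-Cache is typically backed by a running cachefilesd daemon):
 mount -t cifs //server/share /mnt/cifs -o user=myname,fsc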
304 dir_mode If CIFS Unix extensions are not supported by the server 314 dir_mode If CIFS Unix extensions are not supported by the server
305 this overrides the default mode for directory inodes. 315 this overrides the default mode for directory inodes.
306 port attempt to contact the server on this tcp port, before 316 port attempt to contact the server on this tcp port, before
@@ -517,6 +527,11 @@ A partial list of the supported mount options follows:
517 SFU does). In the future the bottom 9 bits of the 527 SFU does). In the future the bottom 9 bits of the
518 mode also will be emulated using queries of the security 528 mode also will be emulated using queries of the security
519 descriptor (ACL). 529 descriptor (ACL).
530 mfsymlinks Enable support for Minshall+French symlinks
531 (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
532 This option is ignored when specified together with the
533 'sfu' option. Minshall+French symlinks are used even if
534 the server supports the CIFS Unix Extensions.
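 (Illustrative: appending ",mfsymlinks" to the -o option string of an
 otherwise normal cifs mount enables this behavior.)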
520 sign Must use packet signing (helps avoid unwanted data modification 535 sign Must use packet signing (helps avoid unwanted data modification
521 by intermediate systems in the route). Note that signing 536 by intermediate systems in the route). Note that signing
522 does not work with lanman or plaintext authentication. 537 does not work with lanman or plaintext authentication.
@@ -568,8 +583,9 @@ module can be displayed via modinfo.
568Misc /proc/fs/cifs Flags and Debug Info 583Misc /proc/fs/cifs Flags and Debug Info
569======================================= 584=======================================
570Informational pseudo-files: 585Informational pseudo-files:
571DebugData Displays information about active CIFS sessions 586DebugData Displays information about active CIFS sessions and
 572 and shares, as well as the cifs.ko version. 587 shares, features enabled, as well as the cifs.ko
588 version.
573Stats Lists summary resource usage information as well as per 589Stats Lists summary resource usage information as well as per
 574 share statistics, if CONFIG_CIFS_STATS is enabled 590 share statistics, if CONFIG_CIFS_STATS is enabled
575 in the kernel configuration. 591 in the kernel configuration.
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
new file mode 100644
index 00000000000..224d7bbd1fc
--- /dev/null
+++ b/fs/cifs/cache.c
@@ -0,0 +1,331 @@
1/*
2 * fs/cifs/cache.c - CIFS filesystem cache index structure definitions
3 *
4 * Copyright (c) 2010 Novell, Inc.
 5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21#include "fscache.h"
22#include "cifs_debug.h"
23
24/*
25 * CIFS filesystem definition for FS-Cache
26 */
27struct fscache_netfs cifs_fscache_netfs = {
28 .name = "cifs",
29 .version = 0,
30};
31
32/*
33 * Register CIFS for caching with FS-Cache
34 */
35int cifs_fscache_register(void)
36{
37 return fscache_register_netfs(&cifs_fscache_netfs);
38}
39
40/*
41 * Unregister CIFS for caching
42 */
43void cifs_fscache_unregister(void)
44{
45 fscache_unregister_netfs(&cifs_fscache_netfs);
46}
47
48/*
49 * Key layout of CIFS server cache index object
50 */
51struct cifs_server_key {
52 uint16_t family; /* address family */
53 uint16_t port; /* IP port */
54 union {
55 struct in_addr ipv4_addr;
56 struct in6_addr ipv6_addr;
57 } addr[0];
58};
59
60/*
61 * Server object keyed by {IPaddress,port,family} tuple
62 */
63static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
64 void *buffer, uint16_t maxbuf)
65{
66 const struct TCP_Server_Info *server = cookie_netfs_data;
67 const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr;
68 struct cifs_server_key *key = buffer;
69 uint16_t key_len = sizeof(struct cifs_server_key);
70
71 memset(key, 0, key_len);
72
73 /*
74 * Should not be a problem as sin_family/sin6_family overlays
75 * sa_family field
76 */
77 switch (sa->sa_family) {
78 case AF_INET:
79 key->family = server->addr.sockAddr.sin_family;
80 key->port = server->addr.sockAddr.sin_port;
81 key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr;
82 key_len += sizeof(key->addr[0].ipv4_addr);
83 break;
84
85 case AF_INET6:
86 key->family = server->addr.sockAddr6.sin6_family;
87 key->port = server->addr.sockAddr6.sin6_port;
88 key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr;
89 key_len += sizeof(key->addr[0].ipv6_addr);
90 break;
91
92 default:
93 cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family);
94 key_len = 0;
95 break;
96 }
97
98 return key_len;
99}
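
(Editor's note on key sizes, assuming no structure padding: the
zero-length addr[] array makes the base struct two u16 fields, so an
AF_INET key is 4 + 4 = 8 bytes and an AF_INET6 key 4 + 16 = 20 bytes.)
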
100
101/*
102 * Server object for FS-Cache
103 */
104const struct fscache_cookie_def cifs_fscache_server_index_def = {
105 .name = "CIFS.server",
106 .type = FSCACHE_COOKIE_TYPE_INDEX,
107 .get_key = cifs_server_get_key,
108};
109
110/*
111 * Auxiliary data attached to CIFS superblock within the cache
112 */
113struct cifs_fscache_super_auxdata {
114 u64 resource_id; /* unique server resource id */
115};
116
117static char *extract_sharename(const char *treename)
118{
119 const char *src;
120 char *delim, *dst;
121 int len;
122
 123 /* skip the leading double backslash */
124 src = treename + 2;
125
126 /* share name is always preceded by '\\' now */
127 delim = strchr(src, '\\');
128 if (!delim)
129 return ERR_PTR(-EINVAL);
130 delim++;
131 len = strlen(delim);
132
133 /* caller has to free the memory */
134 dst = kstrndup(delim, len, GFP_KERNEL);
135 if (!dst)
136 return ERR_PTR(-ENOMEM);
137
138 return dst;
139}
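
(Illustrative walk-through: for a treeName of \\server\share, src skips
the leading double backslash, delim is advanced past the '\' separating
server from share, and a freshly kstrndup()'d "share" is returned,
which the caller must kfree().)
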
140
141/*
142 * Superblock object currently keyed by share name
143 */
144static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
145 uint16_t maxbuf)
146{
147 const struct cifsTconInfo *tcon = cookie_netfs_data;
148 char *sharename;
149 uint16_t len;
150
151 sharename = extract_sharename(tcon->treeName);
152 if (IS_ERR(sharename)) {
153 cFYI(1, "CIFS: couldn't extract sharename\n");
154 sharename = NULL;
155 return 0;
156 }
157
158 len = strlen(sharename);
159 if (len > maxbuf)
160 return 0;
161
162 memcpy(buffer, sharename, len);
163
164 kfree(sharename);
165
166 return len;
167}
168
169static uint16_t
170cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
171 uint16_t maxbuf)
172{
173 struct cifs_fscache_super_auxdata auxdata;
174 const struct cifsTconInfo *tcon = cookie_netfs_data;
175
176 memset(&auxdata, 0, sizeof(auxdata));
177 auxdata.resource_id = tcon->resource_id;
178
179 if (maxbuf > sizeof(auxdata))
180 maxbuf = sizeof(auxdata);
181
182 memcpy(buffer, &auxdata, maxbuf);
183
184 return maxbuf;
185}
186
187static enum
188fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
189 const void *data,
190 uint16_t datalen)
191{
192 struct cifs_fscache_super_auxdata auxdata;
193 const struct cifsTconInfo *tcon = cookie_netfs_data;
194
195 if (datalen != sizeof(auxdata))
196 return FSCACHE_CHECKAUX_OBSOLETE;
197
198 memset(&auxdata, 0, sizeof(auxdata));
199 auxdata.resource_id = tcon->resource_id;
200
201 if (memcmp(data, &auxdata, datalen) != 0)
202 return FSCACHE_CHECKAUX_OBSOLETE;
203
204 return FSCACHE_CHECKAUX_OKAY;
205}
206
207/*
208 * Superblock object for FS-Cache
209 */
210const struct fscache_cookie_def cifs_fscache_super_index_def = {
211 .name = "CIFS.super",
212 .type = FSCACHE_COOKIE_TYPE_INDEX,
213 .get_key = cifs_super_get_key,
214 .get_aux = cifs_fscache_super_get_aux,
215 .check_aux = cifs_fscache_super_check_aux,
216};
217
218/*
219 * Auxiliary data attached to CIFS inode within the cache
220 */
221struct cifs_fscache_inode_auxdata {
222 struct timespec last_write_time;
223 struct timespec last_change_time;
224 u64 eof;
225};
226
227static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
228 void *buffer, uint16_t maxbuf)
229{
230 const struct cifsInodeInfo *cifsi = cookie_netfs_data;
231 uint16_t keylen;
232
233 /* use the UniqueId as the key */
234 keylen = sizeof(cifsi->uniqueid);
235 if (keylen > maxbuf)
236 keylen = 0;
237 else
238 memcpy(buffer, &cifsi->uniqueid, keylen);
239
240 return keylen;
241}
242
243static void
244cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size)
245{
246 const struct cifsInodeInfo *cifsi = cookie_netfs_data;
247
248 *size = cifsi->vfs_inode.i_size;
249}
250
251static uint16_t
252cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer,
253 uint16_t maxbuf)
254{
255 struct cifs_fscache_inode_auxdata auxdata;
256 const struct cifsInodeInfo *cifsi = cookie_netfs_data;
257
258 memset(&auxdata, 0, sizeof(auxdata));
259 auxdata.eof = cifsi->server_eof;
260 auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
261 auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
262
263 if (maxbuf > sizeof(auxdata))
264 maxbuf = sizeof(auxdata);
265
266 memcpy(buffer, &auxdata, maxbuf);
267
268 return maxbuf;
269}
270
271static enum
272fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
273 const void *data,
274 uint16_t datalen)
275{
276 struct cifs_fscache_inode_auxdata auxdata;
277 struct cifsInodeInfo *cifsi = cookie_netfs_data;
278
279 if (datalen != sizeof(auxdata))
280 return FSCACHE_CHECKAUX_OBSOLETE;
281
282 memset(&auxdata, 0, sizeof(auxdata));
283 auxdata.eof = cifsi->server_eof;
284 auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
285 auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
286
287 if (memcmp(data, &auxdata, datalen) != 0)
288 return FSCACHE_CHECKAUX_OBSOLETE;
289
290 return FSCACHE_CHECKAUX_OKAY;
291}
292
293static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
294{
295 struct cifsInodeInfo *cifsi = cookie_netfs_data;
296 struct pagevec pvec;
297 pgoff_t first;
298 int loop, nr_pages;
299
300 pagevec_init(&pvec, 0);
301 first = 0;
302
303 cFYI(1, "cifs inode 0x%p now uncached", cifsi);
304
305 for (;;) {
306 nr_pages = pagevec_lookup(&pvec,
307 cifsi->vfs_inode.i_mapping, first,
308 PAGEVEC_SIZE - pagevec_count(&pvec));
309 if (!nr_pages)
310 break;
311
312 for (loop = 0; loop < nr_pages; loop++)
313 ClearPageFsCache(pvec.pages[loop]);
314
315 first = pvec.pages[nr_pages - 1]->index + 1;
316
317 pvec.nr = nr_pages;
318 pagevec_release(&pvec);
319 cond_resched();
320 }
321}
322
323const struct fscache_cookie_def cifs_fscache_inode_object_def = {
324 .name = "CIFS.uniqueid",
325 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
326 .get_key = cifs_fscache_inode_get_key,
327 .get_attr = cifs_fscache_inode_get_attr,
328 .get_aux = cifs_fscache_inode_get_aux,
329 .check_aux = cifs_fscache_inode_check_aux,
330 .now_uncached = cifs_fscache_inode_now_uncached,
331};
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 4fce6e61b34..103ab8b605b 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -119,11 +119,36 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
119 "Display Internal CIFS Data Structures for Debugging\n" 119 "Display Internal CIFS Data Structures for Debugging\n"
120 "---------------------------------------------------\n"); 120 "---------------------------------------------------\n");
121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); 121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
122 seq_printf(m, "Features: ");
123#ifdef CONFIG_CIFS_DFS_UPCALL
124 seq_printf(m, "dfs");
125 seq_putc(m, ' ');
126#endif
127#ifdef CONFIG_CIFS_FSCACHE
128 seq_printf(m, "fscache");
129 seq_putc(m, ' ');
130#endif
131#ifdef CONFIG_CIFS_WEAK_PW_HASH
132 seq_printf(m, "lanman");
133 seq_putc(m, ' ');
134#endif
135#ifdef CONFIG_CIFS_POSIX
136 seq_printf(m, "posix");
137 seq_putc(m, ' ');
138#endif
139#ifdef CONFIG_CIFS_UPCALL
140 seq_printf(m, "spnego");
141 seq_putc(m, ' ');
142#endif
143#ifdef CONFIG_CIFS_XATTR
144 seq_printf(m, "xattr");
145#endif
146 seq_putc(m, '\n');
122 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); 147 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
123 seq_printf(m, "Servers:"); 148 seq_printf(m, "Servers:");
124 149
125 i = 0; 150 i = 0;
126 read_lock(&cifs_tcp_ses_lock); 151 spin_lock(&cifs_tcp_ses_lock);
127 list_for_each(tmp1, &cifs_tcp_ses_list) { 152 list_for_each(tmp1, &cifs_tcp_ses_list) {
128 server = list_entry(tmp1, struct TCP_Server_Info, 153 server = list_entry(tmp1, struct TCP_Server_Info,
129 tcp_ses_list); 154 tcp_ses_list);
@@ -205,7 +230,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
205 spin_unlock(&GlobalMid_Lock); 230 spin_unlock(&GlobalMid_Lock);
206 } 231 }
207 } 232 }
208 read_unlock(&cifs_tcp_ses_lock); 233 spin_unlock(&cifs_tcp_ses_lock);
209 seq_putc(m, '\n'); 234 seq_putc(m, '\n');
210 235
211 /* BB add code to dump additional info such as TCP session info now */ 236 /* BB add code to dump additional info such as TCP session info now */
@@ -245,7 +270,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
245 atomic_set(&totBufAllocCount, 0); 270 atomic_set(&totBufAllocCount, 0);
246 atomic_set(&totSmBufAllocCount, 0); 271 atomic_set(&totSmBufAllocCount, 0);
247#endif /* CONFIG_CIFS_STATS2 */ 272#endif /* CONFIG_CIFS_STATS2 */
248 read_lock(&cifs_tcp_ses_lock); 273 spin_lock(&cifs_tcp_ses_lock);
249 list_for_each(tmp1, &cifs_tcp_ses_list) { 274 list_for_each(tmp1, &cifs_tcp_ses_list) {
250 server = list_entry(tmp1, struct TCP_Server_Info, 275 server = list_entry(tmp1, struct TCP_Server_Info,
251 tcp_ses_list); 276 tcp_ses_list);
@@ -278,7 +303,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
278 } 303 }
279 } 304 }
280 } 305 }
281 read_unlock(&cifs_tcp_ses_lock); 306 spin_unlock(&cifs_tcp_ses_lock);
282 } 307 }
283 308
284 return count; 309 return count;
@@ -318,7 +343,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
318 GlobalCurrentXid, GlobalMaxActiveXid); 343 GlobalCurrentXid, GlobalMaxActiveXid);
319 344
320 i = 0; 345 i = 0;
321 read_lock(&cifs_tcp_ses_lock); 346 spin_lock(&cifs_tcp_ses_lock);
322 list_for_each(tmp1, &cifs_tcp_ses_list) { 347 list_for_each(tmp1, &cifs_tcp_ses_list) {
323 server = list_entry(tmp1, struct TCP_Server_Info, 348 server = list_entry(tmp1, struct TCP_Server_Info,
324 tcp_ses_list); 349 tcp_ses_list);
@@ -372,7 +397,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
372 } 397 }
373 } 398 }
374 } 399 }
375 read_unlock(&cifs_tcp_ses_lock); 400 spin_unlock(&cifs_tcp_ses_lock);
376 401
377 seq_putc(m, '\n'); 402 seq_putc(m, '\n');
378 return 0; 403 return 0;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index aa316891ac0..8942b28cf80 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -34,7 +34,7 @@ void cifs_dump_mids(struct TCP_Server_Info *);
34extern int traceSMB; /* flag which enables the function below */ 34extern int traceSMB; /* flag which enables the function below */
35void dump_smb(struct smb_hdr *, int); 35void dump_smb(struct smb_hdr *, int);
36#define CIFS_INFO 0x01 36#define CIFS_INFO 0x01
37#define CIFS_RC 0x02 37#define CIFS_RC 0x02
38#define CIFS_TIMER 0x04 38#define CIFS_TIMER 0x04
39 39
40/* 40/*
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ac19a6f3dae..c68a056f27f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -44,8 +44,7 @@ static void cifs_dfs_expire_automounts(struct work_struct *work)
44void cifs_dfs_release_automount_timer(void) 44void cifs_dfs_release_automount_timer(void)
45{ 45{
46 BUG_ON(!list_empty(&cifs_dfs_automount_list)); 46 BUG_ON(!list_empty(&cifs_dfs_automount_list));
47 cancel_delayed_work(&cifs_dfs_automount_task); 47 cancel_delayed_work_sync(&cifs_dfs_automount_task);
48 flush_scheduled_work();
49} 48}
50 49
51/** 50/**
@@ -141,7 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 } 140 }
142 141
143 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
144 if (rc != 0) { 143 if (rc < 0) {
145 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", 144 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
146 __func__, *devname, rc); 145 __func__, *devname, rc);
147 goto compose_mount_options_err; 146 goto compose_mount_options_err;
@@ -150,8 +149,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
150 * assuming that we have 'unc=' and 'ip=' in 149 * assuming that we have 'unc=' and 'ip=' in
151 * the original sb_mountdata 150 * the original sb_mountdata
152 */ 151 */
153 md_len = strlen(sb_mountdata) + strlen(srvIP) + 152 md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
154 strlen(ref->node_name) + 12;
155 mountdata = kzalloc(md_len+1, GFP_KERNEL); 153 mountdata = kzalloc(md_len+1, GFP_KERNEL);
156 if (mountdata == NULL) { 154 if (mountdata == NULL) {
157 rc = -ENOMEM; 155 rc = -ENOMEM;
@@ -230,28 +228,22 @@ compose_mount_options_err:
230 goto compose_mount_options_out; 228 goto compose_mount_options_out;
231} 229}
232 230
233 231/**
234static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent, 232 * cifs_dfs_do_refmount - mounts specified path using provided referral
235 struct dentry *dentry, const struct dfs_info3_param *ref) 233 * @cifs_sb: parent/root superblock
234 * @fullpath: full path in UNC format
235 * @ref: server's referral
236 */
237static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
238 const char *fullpath, const struct dfs_info3_param *ref)
236{ 239{
237 struct cifs_sb_info *cifs_sb;
238 struct vfsmount *mnt; 240 struct vfsmount *mnt;
239 char *mountdata; 241 char *mountdata;
240 char *devname = NULL; 242 char *devname = NULL;
241 char *fullpath;
242
243 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
244 /*
245 * this function gives us a path with a double backslash prefix. We
246 * require a single backslash for DFS.
247 */
248 fullpath = build_path_from_dentry(dentry);
249 if (!fullpath)
250 return ERR_PTR(-ENOMEM);
251 243
244 /* strip first '\' from fullpath */
252 mountdata = cifs_compose_mount_options(cifs_sb->mountdata, 245 mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
253 fullpath + 1, ref, &devname); 246 fullpath + 1, ref, &devname);
254 kfree(fullpath);
255 247
256 if (IS_ERR(mountdata)) 248 if (IS_ERR(mountdata))
257 return (struct vfsmount *)mountdata; 249 return (struct vfsmount *)mountdata;
@@ -313,6 +305,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
313 int xid, i; 305 int xid, i;
314 int rc = 0; 306 int rc = 0;
315 struct vfsmount *mnt = ERR_PTR(-ENOENT); 307 struct vfsmount *mnt = ERR_PTR(-ENOENT);
308 struct tcon_link *tlink;
316 309
317 cFYI(1, "in %s", __func__); 310 cFYI(1, "in %s", __func__);
318 BUG_ON(IS_ROOT(dentry)); 311 BUG_ON(IS_ROOT(dentry));
@@ -322,14 +315,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
322 dput(nd->path.dentry); 315 dput(nd->path.dentry);
323 nd->path.dentry = dget(dentry); 316 nd->path.dentry = dget(dentry);
324 317
325 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
326 ses = cifs_sb->tcon->ses;
327
328 if (!ses) {
329 rc = -EINVAL;
330 goto out_err;
331 }
332
333 /* 318 /*
334 * The MSDFS spec states that paths in DFS referral requests and 319 * The MSDFS spec states that paths in DFS referral requests and
335 * responses must be prefixed by a single '\' character instead of 320 * responses must be prefixed by a single '\' character instead of
@@ -342,10 +327,20 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		goto out_err;
 	}
 
-	rc = get_dfs_path(xid, ses , full_path + 1, cifs_sb->local_nls,
+	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto out_err;
+	}
+	ses = tlink_tcon(tlink)->ses;
+
+	rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
 			&num_referrals, &referrals,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 
+	cifs_put_tlink(tlink);
+
 	for (i = 0; i < num_referrals; i++) {
 		int len;
 		dump_referral(referrals+i);
@@ -357,8 +352,8 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 			rc = -EINVAL;
 			goto out_err;
 		}
-		mnt = cifs_dfs_do_refmount(nd->path.mnt,
-				nd->path.dentry, referrals + i);
+		mnt = cifs_dfs_do_refmount(cifs_sb,
+				full_path, referrals + i);
 		cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
 				referrals[i].node_name, mnt);
 
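
The hunks above all reduce to one pattern: a caller takes a reference on a tcon link, borrows the tcon/session through it, and drops the reference when done. A minimal sketch of that usage, assuming only the helpers this series introduces (cifs_sb_tlink, tlink_tcon, cifs_put_tlink); do_dfs_query() is a hypothetical stand-in for a call such as get_dfs_path():

	/* Sketch: borrow a tcon via the superblock's tcon link. */
	static int example_tlink_user(struct cifs_sb_info *cifs_sb)
	{
		struct tcon_link *tlink;
		struct cifsSesInfo *ses;
		int rc;

		tlink = cifs_sb_tlink(cifs_sb);	/* takes a reference, may fail */
		if (IS_ERR(tlink))
			return PTR_ERR(tlink);

		ses = tlink_tcon(tlink)->ses;	/* borrowed, not owned */
		rc = do_dfs_query(ses);		/* hypothetical helper */

		cifs_put_tlink(tlink);		/* always drop the reference */
		return rc;
	}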
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 246a167cb91..525ba59a410 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,6 +15,8 @@
  * the GNU Lesser General Public License for more details.
  *
  */
+#include <linux/radix-tree.h>
+
 #ifndef _CIFS_FS_SB_H
 #define _CIFS_FS_SB_H
 
@@ -35,23 +37,29 @@
 #define CIFS_MOUNT_DYNPERM	0x1000 /* allow in-memory only mode setting */
 #define CIFS_MOUNT_NOPOSIXBRL	0x2000 /* mandatory not posix byte range lock */
 #define CIFS_MOUNT_NOSSYNC	0x4000 /* don't do slow SMBflush on every sync*/
+#define CIFS_MOUNT_FSCACHE	0x8000 /* local caching enabled */
+#define CIFS_MOUNT_MF_SYMLINKS	0x10000 /* Minshall+French Symlinks enabled */
+#define CIFS_MOUNT_MULTIUSER	0x20000 /* multiuser mount */
 
 struct cifs_sb_info {
-	struct cifsTconInfo *tcon;	/* primary mount */
-	struct list_head nested_tcon_q;
+	struct radix_tree_root tlink_tree;
+#define CIFS_TLINK_MASTER_TAG		0	/* is "master" (mount) tcon */
+	spinlock_t tlink_tree_lock;
 	struct nls_table *local_nls;
 	unsigned int rsize;
 	unsigned int wsize;
+	atomic_t active;
 	uid_t	mnt_uid;
 	gid_t	mnt_gid;
 	mode_t	mnt_file_mode;
 	mode_t	mnt_dir_mode;
-	int	mnt_cifs_flags;
+	unsigned int mnt_cifs_flags;
 	int	prepathlen;
 	char	*prepath; /* relative path under the share to mount to */
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	char	*mountdata; /* mount options received at mount time */
 #endif
 	struct backing_dev_info bdi;
+	struct delayed_work prune_tlinks;
 };
 #endif				/* _CIFS_FS_SB_H */
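
The struct change above replaces the single cifs_sb->tcon pointer with a radix tree of tcon links, keyed per user and protected by tlink_tree_lock. A hypothetical lookup keyed by fsuid, using only the fields declared above (the real lookup and insert helpers live elsewhere in this series and are not part of this hunk):

	static struct tcon_link *example_lookup_tlink(struct cifs_sb_info *cifs_sb,
						      uid_t fsuid)
	{
		struct tcon_link *tlink;

		spin_lock(&cifs_sb->tlink_tree_lock);
		tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
		spin_unlock(&cifs_sb->tlink_tree_lock);

		return tlink;	/* NULL if no tcon exists for this user yet */
	}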
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 379bd7d9c05..87044906cd1 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -84,6 +84,9 @@ struct key_type cifs_spnego_key_type = {
 /* strlen of ";uid=0x" */
 #define UID_KEY_LEN		7
 
+/* strlen of ";creduid=0x" */
+#define CREDUID_KEY_LEN		11
+
 /* strlen of ";user=" */
 #define USER_KEY_LEN		6
 
@@ -107,6 +110,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 		   IP_KEY_LEN + INET6_ADDRSTRLEN +
 		   MAX_MECH_STR_LEN +
 		   UID_KEY_LEN + (sizeof(uid_t) * 2) +
+		   CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
 		   USER_KEY_LEN + strlen(sesInfo->userName) +
 		   PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
 
@@ -144,6 +148,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
 
 	dp = description + strlen(description);
+	sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
+
+	dp = description + strlen(description);
 	sprintf(dp, ";user=%s", sesInfo->userName);
 
 	dp = description + strlen(description);
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6..7fe6b52df50 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
  * This is a compressed table of upper and lower case conversion.
  *
  */
+#ifndef _CIFS_UNICODE_H
+#define _CIFS_UNICODE_H
 
 #include <asm/byteorder.h>
 #include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
 #endif				/* UNIUPR_NOUPPER */
 
 #ifndef UNIUPR_NOLOWER
-extern signed char UniLowerTable[512];
-extern struct UniCaseRange UniLowerRange[];
+extern signed char CifsUniLowerTable[512];
+extern const struct UniCaseRange CifsUniLowerRange[];
 #endif				/* UNIUPR_NOLOWER */
 
 #ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
  * UniTolower:  Convert a unicode character to lower case
  */
 static inline wchar_t
-UniTolower(wchar_t uc)
+UniTolower(register wchar_t uc)
 {
-	register struct UniCaseRange *rp;
+	register const struct UniCaseRange *rp;
 
-	if (uc < sizeof(UniLowerTable)) {
+	if (uc < sizeof(CifsUniLowerTable)) {
 		/* Latin characters */
-		return uc + UniLowerTable[uc];	/* Use base tables */
+		return uc + CifsUniLowerTable[uc];	/* Use base tables */
 	} else {
-		rp = UniLowerRange;	/* Use range tables */
+		rp = CifsUniLowerRange;	/* Use range tables */
 		while (rp->start) {
 			if (uc < rp->start)	/* Before start of range */
 				return uc;	/* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
 }
 
 #endif
+
+#endif /* _CIFS_UNICODE_H */
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e51..0ac7c5a8633 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
 /*
  * Latin lower case
  */
-static signed char CifsUniLowerTable[512] = {
+signed char CifsUniLowerTable[512] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 000-00f */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 010-01f */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
 /*
  * Lower Case Range
  */
-static const struct UniCaseRange CifsUniLowerRange[] = {
-	0x0380, 0x03ab, UniCaseRangeL0380,
-	0x0400, 0x042f, UniCaseRangeL0400,
-	0x0490, 0x04cb, UniCaseRangeL0490,
-	0x1e00, 0x1ff7, UniCaseRangeL1e00,
-	0xff20, 0xff3a, UniCaseRangeLff20,
-	0, 0, 0
+const struct UniCaseRange CifsUniLowerRange[] = {
+	{0x0380, 0x03ab, UniCaseRangeL0380},
+	{0x0400, 0x042f, UniCaseRangeL0400},
+	{0x0490, 0x04cb, UniCaseRangeL0490},
+	{0x1e00, 0x1ff7, UniCaseRangeL1e00},
+	{0xff20, 0xff3a, UniCaseRangeLff20},
+	{0}
 };
 #endif
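
The table rewrite above is a correctness fix as much as a rename: each row of CifsUniLowerRange initializes a three-member structure, so every row needs its own braces and the terminator collapses to {0}. Assuming the layout declared in cifs_unicode.h, the shape being initialized is:

	struct UniCaseRange {
		wchar_t start;		/* first code point in the range */
		wchar_t end;		/* last code point in the range */
		signed char *table;	/* per-range case-delta table */
	};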
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 85d7cf7ff2c..c9b4792ae82 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -557,11 +557,16 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 {
 	struct cifs_ntsd *pntsd = NULL;
 	int xid, rc;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return NULL;
 
 	xid = GetXid();
-	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
 	FreeXid(xid);
 
+	cifs_put_tlink(tlink);
 
 	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 	return pntsd;
@@ -574,10 +579,16 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 	int oplock = 0;
 	int xid, rc;
 	__u16 fid;
+	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return NULL;
 
+	tcon = tlink_tcon(tlink);
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
@@ -585,11 +596,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 		goto out;
 	}
 
-	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
 	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 
-	CIFSSMBClose(xid, cifs_sb->tcon, fid);
+	CIFSSMBClose(xid, tcon, fid);
  out:
+	cifs_put_tlink(tlink);
 	FreeXid(xid);
 	return pntsd;
 }
@@ -603,7 +615,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 	struct cifsFileInfo *open_file = NULL;
 
 	if (inode)
-		open_file = find_readable_file(CIFS_I(inode));
+		open_file = find_readable_file(CIFS_I(inode), true);
 	if (!open_file)
 		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
 
@@ -616,10 +628,15 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
 				struct cifs_ntsd *pnntsd, u32 acllen)
 {
 	int xid, rc;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
 
 	xid = GetXid();
-	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+	rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 
 	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 	return rc;
@@ -631,10 +648,16 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 	int oplock = 0;
 	int xid, rc;
 	__u16 fid;
+	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
 
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+
+	tcon = tlink_tcon(tlink);
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
@@ -642,12 +665,13 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 		goto out;
 	}
 
-	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+	rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
 	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 
-	CIFSSMBClose(xid, cifs_sb->tcon, fid);
- out:
+	CIFSSMBClose(xid, tcon, fid);
+out:
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -661,7 +685,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 
 	cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
 
-	open_file = find_readable_file(CIFS_I(inode));
+	open_file = find_readable_file(CIFS_I(inode), true);
 	if (!open_file)
 		return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
 
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 847628dfdc4..f856732161a 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
 #include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
+#include "ntlmssp.h"
 #include <linux/ctype.h>
 #include <linux/random.h>
 
@@ -42,18 +43,32 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 			unsigned char *p24);
 
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
-			const struct mac_key *key, char *signature)
+			struct TCP_Server_Info *server, char *signature)
 {
-	struct MD5Context context;
+	int rc;
 
-	if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
+	if (cifs_pdu == NULL || signature == NULL || server == NULL)
 		return -EINVAL;
 
-	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, (char *)&key->data, key->len);
-	cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+	if (!server->secmech.sdescmd5) {
+		cERROR(1, "%s: Can't generate signature\n", __func__);
+		return -1;
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Oould not init md5\n", __func__);
+		return rc;
+	}
+
+	crypto_shash_update(&server->secmech.sdescmd5->shash,
+		server->session_key.response, server->session_key.len);
+
+	crypto_shash_update(&server->secmech.sdescmd5->shash,
+		cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+
+	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
 
-	cifs_MD5_final(signature, &context);
 	return 0;
 }
 
@@ -78,8 +93,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key,
-		smb_signature);
+	rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
 	else
@@ -89,16 +103,28 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 }
 
 static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
-			const struct mac_key *key, char *signature)
+			struct TCP_Server_Info *server, char *signature)
 {
-	struct MD5Context context;
 	int i;
+	int rc;
 
-	if ((iov == NULL) || (signature == NULL) || (key == NULL))
+	if (iov == NULL || signature == NULL || server == NULL)
 		return -EINVAL;
 
-	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, (char *)&key->data, key->len);
+	if (!server->secmech.sdescmd5) {
+		cERROR(1, "%s: Can't generate signature\n", __func__);
+		return -1;
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Oould not init md5\n", __func__);
+		return rc;
+	}
+
+	crypto_shash_update(&server->secmech.sdescmd5->shash,
+		server->session_key.response, server->session_key.len);
+
 	for (i = 0; i < n_vec; i++) {
 		if (iov[i].iov_len == 0)
 			continue;
@@ -111,18 +137,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (i == 0) {
 			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
 				break; /* nothing to sign or corrupt header */
-			cifs_MD5_update(&context, iov[0].iov_base+4,
-				iov[0].iov_len-4);
+			crypto_shash_update(&server->secmech.sdescmd5->shash,
+				iov[i].iov_base + 4, iov[i].iov_len - 4);
 		} else
-			cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
+			crypto_shash_update(&server->secmech.sdescmd5->shash,
+				iov[i].iov_base, iov[i].iov_len);
 	}
 
-	cifs_MD5_final(signature, &context);
+	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
 
-	return 0;
+	return rc;
 }
 
-
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 		   __u32 *pexpected_response_sequence_number)
 {
@@ -145,8 +171,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key,
-		smb_signature);
+	rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
 	else
@@ -156,14 +181,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 }
 
 int cifs_verify_signature(struct smb_hdr *cifs_pdu,
-			  const struct mac_key *mac_key,
+			  struct TCP_Server_Info *server,
 			  __u32 expected_sequence_number)
 {
 	unsigned int rc;
 	char server_response_sig[8];
 	char what_we_think_sig_should_be[20];
 
-	if ((cifs_pdu == NULL) || (mac_key == NULL))
+	if (cifs_pdu == NULL || server == NULL)
 		return -EINVAL;
 
 	if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +217,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 					cpu_to_le32(expected_sequence_number);
 	cifs_pdu->Signature.Sequence.Reserved = 0;
 
-	rc = cifs_calculate_signature(cifs_pdu, mac_key,
+	rc = cifs_calculate_signature(cifs_pdu, server,
 		what_we_think_sig_should_be);
 
 	if (rc)
@@ -208,75 +233,28 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 
 }
 
-/* We fill in key by putting in 40 byte array which was allocated by caller */
-int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
-			   const char *password)
+/* first calculate 24 bytes ntlm response and then 16 byte session key */
+int setup_ntlm_response(struct cifsSesInfo *ses)
 {
-	char temp_key[16];
-	if ((key == NULL) || (rn == NULL))
-		return -EINVAL;
+	unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
+	char temp_key[CIFS_SESS_KEY_SIZE];
 
-	E_md4hash(password, temp_key);
-	mdfour(key->data.ntlm, temp_key, 16);
-	memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE);
-	key->len = 40;
-	return 0;
-}
-
-int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
-			       const struct nls_table *nls_info)
-{
-	char temp_hash[16];
-	struct HMACMD5Context ctx;
-	char *ucase_buf;
-	__le16 *unicode_buf;
-	unsigned int i, user_name_len, dom_name_len;
-
-	if (ses == NULL)
-		return -EINVAL;
-
-	E_md4hash(ses->password, temp_hash);
-
-	hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
-	user_name_len = strlen(ses->userName);
-	if (user_name_len > MAX_USERNAME_SIZE)
-		return -EINVAL;
-	if (ses->domainName == NULL)
-		return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
-	dom_name_len = strlen(ses->domainName);
-	if (dom_name_len > MAX_USERNAME_SIZE)
+	if (!ses)
 		return -EINVAL;
 
-	ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
-	if (ucase_buf == NULL)
-		return -ENOMEM;
-	unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
-	if (unicode_buf == NULL) {
-		kfree(ucase_buf);
+	ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
 		return -ENOMEM;
 	}
+	ses->auth_key.len = temp_len;
+
+	SMBNTencrypt(ses->password, ses->server->cryptkey,
+			ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+
+	E_md4hash(ses->password, temp_key);
+	mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
 
-	for (i = 0; i < user_name_len; i++)
-		ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
-	ucase_buf[i] = 0;
-	user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
-				      MAX_USERNAME_SIZE*2, nls_info);
-	unicode_buf[user_name_len] = 0;
-	user_name_len++;
-
-	for (i = 0; i < dom_name_len; i++)
-		ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
-	ucase_buf[i] = 0;
-	dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
-				     MAX_USERNAME_SIZE*2, nls_info);
-
-	unicode_buf[user_name_len + dom_name_len] = 0;
-	hmac_md5_update((const unsigned char *) unicode_buf,
-		(user_name_len+dom_name_len)*2, &ctx);
-
-	hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
-	kfree(ucase_buf);
-	kfree(unicode_buf);
 	return 0;
 }
 
@@ -319,109 +297,457 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 }
 #endif /* CIFS_WEAK_PW_HASH */
 
-static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
+/* Build a proper attribute value/target info pairs blob.
+ * Fill in netbios and dns domain name and workstation name
+ * and client time (total five av pairs and + one end of fields indicator.
+ * Allocate domain name which gets freed when session struct is deallocated.
+ */
+static int
+build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+{
+	unsigned int dlen;
+	unsigned int wlen;
+	unsigned int size = 6 * sizeof(struct ntlmssp2_name);
+	__le64 curtime;
+	char *defdmname = "WORKGROUP";
+	unsigned char *blobptr;
+	struct ntlmssp2_name *attrptr;
+
+	if (!ses->domainName) {
+		ses->domainName = kstrdup(defdmname, GFP_KERNEL);
+		if (!ses->domainName)
+			return -ENOMEM;
+	}
+
+	dlen = strlen(ses->domainName);
+	wlen = strlen(ses->server->hostname);
+
+	/* The length of this blob is a size which is
+	 * six times the size of a structure which holds name/size +
+	 * two times the unicode length of a domain name +
+	 * two times the unicode length of a server name +
+	 * size of a timestamp (which is 8 bytes).
+	 */
+	ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
+	ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		ses->auth_key.len = 0;
+		cERROR(1, "Challenge target info allocation failure");
+		return -ENOMEM;
+	}
+
+	blobptr = ses->auth_key.response;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
+	attrptr->length = cpu_to_le16(2 * dlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+
+	blobptr += 2 * dlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
+	attrptr->length = cpu_to_le16(2 * wlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
+
+	blobptr += 2 * wlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
+	attrptr->length = cpu_to_le16(2 * dlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+
+	blobptr += 2 * dlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
+	attrptr->length = cpu_to_le16(2 * wlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
+
+	blobptr += 2 * wlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
+	attrptr->length = cpu_to_le16(sizeof(__le64));
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+	memcpy(blobptr, &curtime, sizeof(__le64));
+
+	return 0;
+}
+
+/* Server has provided av pairs/target info in the type 2 challenge
+ * packet and we have plucked it and stored within smb session.
+ * We parse that blob here to find netbios domain name to be used
+ * as part of ntlmv2 authentication (in Target String), if not already
+ * specified on the command line.
+ * If this function returns without any error but without fetching
+ * domain name, authentication may fail against some server but
+ * may not fail against other (those who are not very particular
+ * about target string i.e. for some, just user name might suffice.
+ */
+static int
+find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+{
+	unsigned int attrsize;
+	unsigned int type;
+	unsigned int onesize = sizeof(struct ntlmssp2_name);
+	unsigned char *blobptr;
+	unsigned char *blobend;
+	struct ntlmssp2_name *attrptr;
+
+	if (!ses->auth_key.len || !ses->auth_key.response)
+		return 0;
+
+	blobptr = ses->auth_key.response;
+	blobend = blobptr + ses->auth_key.len;
+
+	while (blobptr + onesize < blobend) {
+		attrptr = (struct ntlmssp2_name *) blobptr;
+		type = le16_to_cpu(attrptr->type);
+		if (type == NTLMSSP_AV_EOL)
+			break;
+		blobptr += 2; /* advance attr type */
+		attrsize = le16_to_cpu(attrptr->length);
+		blobptr += 2; /* advance attr size */
+		if (blobptr + attrsize > blobend)
+			break;
+		if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
+			if (!attrsize)
+				break;
+			if (!ses->domainName) {
+				ses->domainName =
+					kmalloc(attrsize + 1, GFP_KERNEL);
+				if (!ses->domainName)
+					return -ENOMEM;
+				cifs_from_ucs2(ses->domainName,
+					(__le16 *)blobptr, attrsize, attrsize,
+					nls_cp, false);
+				break;
+			}
+		}
+		blobptr += attrsize; /* advance attr value */
+	}
+
+	return 0;
+}
+
+static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
 			const struct nls_table *nls_cp)
 {
 	int rc = 0;
 	int len;
-	char nt_hash[16];
-	struct HMACMD5Context *pctxt;
+	char nt_hash[CIFS_NTHASH_SIZE];
 	wchar_t *user;
 	wchar_t *domain;
+	wchar_t *server;
 
-	pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
-
-	if (pctxt == NULL)
-		return -ENOMEM;
+	if (!ses->server->secmech.sdeschmacmd5) {
+		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+		return -1;
+	}
 
 	/* calculate md4 hash of password */
 	E_md4hash(ses->password, nt_hash);
 
-	/* convert Domainname to unicode and uppercase */
-	hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+	crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
+				CIFS_NTHASH_SIZE);
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
+		return rc;
+	}
 
 	/* convert ses->userName to unicode and uppercase */
 	len = strlen(ses->userName);
 	user = kmalloc(2 + (len * 2), GFP_KERNEL);
-	if (user == NULL)
+	if (user == NULL) {
+		cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
+		rc = -ENOMEM;
 		goto calc_exit_2;
+	}
 	len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
 	UniStrupr(user);
-	hmac_md5_update((char *)user, 2*len, pctxt);
+
+	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+				(char *)user, 2 * len);
 
 	/* convert ses->domainName to unicode and uppercase */
 	if (ses->domainName) {
 		len = strlen(ses->domainName);
 
 		domain = kmalloc(2 + (len * 2), GFP_KERNEL);
-		if (domain == NULL)
+		if (domain == NULL) {
+			cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
+			rc = -ENOMEM;
 			goto calc_exit_1;
+		}
 		len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
 					nls_cp);
-		/* the following line was removed since it didn't work well
-		   with lower cased domain name that passed as an option.
-		   Maybe converting the domain name earlier makes sense */
-		/* UniStrupr(domain); */
-
-		hmac_md5_update((char *)domain, 2*len, pctxt);
-
+		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+					(char *)domain, 2 * len);
 		kfree(domain);
+	} else if (ses->serverName) {
+		len = strlen(ses->serverName);
+
+		server = kmalloc(2 + (len * 2), GFP_KERNEL);
+		if (server == NULL) {
+			cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
+			rc = -ENOMEM;
+			goto calc_exit_1;
+		}
+		len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+					nls_cp);
+		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+					(char *)server, 2 * len);
+		kfree(server);
 	}
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+					ntlmv2_hash);
+
 calc_exit_1:
 	kfree(user);
 calc_exit_2:
-	/* BB FIXME what about bytes 24 through 40 of the signing key?
-	   compare with the NTLM example */
-	hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
+	return rc;
+}
+
+static int
+CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
+{
+	int rc;
+	unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
+
+	if (!ses->server->secmech.sdeschmacmd5) {
+		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+		return -1;
+	}
+
+	crypto_shash_setkey(ses->server->secmech.hmacmd5,
+				ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+		return rc;
+	}
+
+	if (ses->server->secType == RawNTLMSSP)
+		memcpy(ses->auth_key.response + offset,
+			ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+	else
+		memcpy(ses->auth_key.response + offset,
+			ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response + offset, ses->auth_key.len - offset);
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response + CIFS_SESS_KEY_SIZE);
 
-	kfree(pctxt);
 	return rc;
 }
 
-void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
-		      const struct nls_table *nls_cp)
+
+int
+setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 {
 	int rc;
-	struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
-	struct HMACMD5Context context;
+	int baselen;
+	unsigned int tilen;
+	struct ntlmv2_resp *buf;
+	char ntlmv2_hash[16];
+	unsigned char *tiblob = NULL; /* target info blob */
+
+	if (ses->server->secType == RawNTLMSSP) {
+		if (!ses->domainName) {
+			rc = find_domain_name(ses, nls_cp);
+			if (rc) {
+				cERROR(1, "error %d finding domain name", rc);
+				goto setup_ntlmv2_rsp_ret;
+			}
+		}
+	} else {
+		rc = build_avpair_blob(ses, nls_cp);
+		if (rc) {
+			cERROR(1, "error %d building av pair blob", rc);
+			goto setup_ntlmv2_rsp_ret;
+		}
+	}
 
+	baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
+	tilen = ses->auth_key.len;
+	tiblob = ses->auth_key.response;
+
+	ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		rc = ENOMEM;
+		ses->auth_key.len = 0;
+		cERROR(1, "%s: Can't allocate auth blob", __func__);
+		goto setup_ntlmv2_rsp_ret;
+	}
+	ses->auth_key.len += baselen;
+
+	buf = (struct ntlmv2_resp *)
+			(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
 	buf->blob_signature = cpu_to_le32(0x00000101);
 	buf->reserved = 0;
 	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
 	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
 	buf->reserved2 = 0;
-	buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
-	buf->names[0].length = 0;
-	buf->names[1].type = 0;
-	buf->names[1].length = 0;
 
-	/* calculate buf->ntlmv2_hash */
-	rc = calc_ntlmv2_hash(ses, nls_cp);
-	if (rc)
+	memcpy(ses->auth_key.response + baselen, tiblob, tilen);
+
+	/* calculate ntlmv2_hash */
+	rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
+	if (rc) {
 		cERROR(1, "could not get v2 hash rc %d", rc);
-	CalcNTLMv2_response(ses, resp_buf);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	/* calculate first part of the client response (CR1) */
+	rc = CalcNTLMv2_response(ses, ntlmv2_hash);
+	if (rc) {
+		cERROR(1, "Could not calculate CR1 rc: %d", rc);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	/* now calculate the session key for NTLMv2 */
+	crypto_shash_setkey(ses->server->secmech.hmacmd5,
+		ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Could not init hmacmd5\n", __func__);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+		CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response);
+
+setup_ntlmv2_rsp_ret:
+	kfree(tiblob);
+
+	return rc;
+}
+
+int
+calc_seckey(struct cifsSesInfo *ses)
+{
+	int rc;
+	struct crypto_blkcipher *tfm_arc4;
+	struct scatterlist sgin, sgout;
+	struct blkcipher_desc desc;
+	unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
+
+	get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
+
+	tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+	if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+		cERROR(1, "could not allocate crypto API arc4\n");
+		return PTR_ERR(tfm_arc4);
+	}
+
+	desc.tfm = tfm_arc4;
+
+	crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
+					CIFS_SESS_KEY_SIZE);
 
-	/* now calculate the MAC key for NTLMv2 */
-	hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
-	hmac_md5_update(resp_buf, 16, &context);
-	hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context);
+	sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
+	sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
 
-	memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf,
-	       sizeof(struct ntlmv2_resp));
-	ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp);
+	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+	if (rc) {
+		cERROR(1, "could not encrypt session key rc: %d\n", rc);
+		crypto_free_blkcipher(tfm_arc4);
+		return rc;
+	}
+
+	/* make secondary_key/nonce as session key */
+	memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
+	/* and make len as that of session key only */
+	ses->auth_key.len = CIFS_SESS_KEY_SIZE;
+
+	crypto_free_blkcipher(tfm_arc4);
+
+	return 0;
 }
 
-void CalcNTLMv2_response(const struct cifsSesInfo *ses,
-			 char *v2_session_response)
+void
+cifs_crypto_shash_release(struct TCP_Server_Info *server)
 {
-	struct HMACMD5Context context;
-	/* rest of v2 struct already generated */
-	memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
-	hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
+	if (server->secmech.md5)
+		crypto_free_shash(server->secmech.md5);
+
+	if (server->secmech.hmacmd5)
+		crypto_free_shash(server->secmech.hmacmd5);
 
-	hmac_md5_update(v2_session_response+8,
-			sizeof(struct ntlmv2_resp) - 8, &context);
+	kfree(server->secmech.sdeschmacmd5);
+
+	kfree(server->secmech.sdescmd5);
+}
 
-	hmac_md5_final(v2_session_response, &context);
-/*	cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
+int
+cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+	int rc;
+	unsigned int size;
+
+	server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+	if (!server->secmech.hmacmd5 ||
+			IS_ERR(server->secmech.hmacmd5)) {
+		cERROR(1, "could not allocate crypto hmacmd5\n");
+		return PTR_ERR(server->secmech.hmacmd5);
+	}
+
+	server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
+	if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) {
+		cERROR(1, "could not allocate crypto md5\n");
+		rc = PTR_ERR(server->secmech.md5);
+		goto crypto_allocate_md5_fail;
+	}
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.hmacmd5);
+	server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdeschmacmd5) {
+		cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
+		rc = -ENOMEM;
+		goto crypto_allocate_hmacmd5_sdesc_fail;
+	}
+	server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
+	server->secmech.sdeschmacmd5->shash.flags = 0x0;
+
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.md5);
+	server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdescmd5) {
+		cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
+		rc = -ENOMEM;
+		goto crypto_allocate_md5_sdesc_fail;
+	}
+	server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
+	server->secmech.sdescmd5->shash.flags = 0x0;
+
+	return 0;
+
+crypto_allocate_md5_sdesc_fail:
+	kfree(server->secmech.sdeschmacmd5);
+
+crypto_allocate_hmacmd5_sdesc_fail:
+	crypto_free_shash(server->secmech.md5);
+
+crypto_allocate_md5_fail:
+	crypto_free_shash(server->secmech.hmacmd5);
+
+	return rc;
 }
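
All of the new signing and hashing code above follows the same kernel crypto pattern: allocate a shash transform once (cifs_crypto_shash_allocate), then key, init, update, and finalize it per message. Reduced to a skeleton under the assumption that the caller already holds the allocated transform and its shash_desc (as server->secmech does above); the helper name is illustrative only:

	static int example_hmac_md5(struct crypto_shash *hmacmd5,
				    struct shash_desc *desc,
				    const u8 *key, unsigned int klen,
				    const u8 *data, unsigned int dlen,
				    u8 *out)	/* receives the 16-byte digest */
	{
		int rc;

		rc = crypto_shash_setkey(hmacmd5, key, klen);	/* key the tfm */
		if (rc)
			return rc;
		rc = crypto_shash_init(desc);		/* reset hash state */
		if (rc)
			return rc;
		rc = crypto_shash_update(desc, data, dlen);	/* feed data */
		if (rc)
			return rc;
		return crypto_shash_final(desc, out);	/* write digest */
	}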
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2cb1a70214d..75c4eaa7958 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,7 +35,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
-#include <linux/smp_lock.h>
+#include <net/ipv6.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #define DECLARE_GLOBALS_HERE
@@ -45,8 +45,8 @@
 #include "cifs_fs_sb.h"
 #include <linux/mm.h>
 #include <linux/key-type.h>
-#include "dns_resolve.h"
 #include "cifs_spnego.h"
+#include "fscache.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42	/* the first four bytes of SMB PDUs */
 
 int cifsFYI = 0;
@@ -82,6 +82,24 @@ extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
 
+void
+cifs_sb_active(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_inc_return(&server->active) == 1)
+		atomic_inc(&sb->s_active);
+}
+
+void
+cifs_sb_deactive(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_dec_and_test(&server->active))
+		deactivate_super(sb);
+}
+
 static int
 cifs_read_super(struct super_block *sb, void *data,
 		const char *devname, int silent)
@@ -97,6 +115,9 @@ cifs_read_super(struct super_block *sb, void *data,
 	if (cifs_sb == NULL)
 		return -ENOMEM;
 
+	spin_lock_init(&cifs_sb->tlink_tree_lock);
+	INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL);
+
 	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
 	if (rc) {
 		kfree(cifs_sb);
@@ -136,9 +157,6 @@ cifs_read_super(struct super_block *sb, void *data,
 	sb->s_magic = CIFS_MAGIC_NUMBER;
 	sb->s_op = &cifs_super_ops;
 	sb->s_bdi = &cifs_sb->bdi;
-/*	if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
-	    sb->s_blocksize =
-		cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
 	sb->s_blocksize = CIFS_MAX_MSGSIZE;
 	sb->s_blocksize_bits = 14;	/* default 2**14 = CIFS_MAX_MSGSIZE */
 	inode = cifs_root_iget(sb, ROOT_I);
@@ -200,8 +218,6 @@ cifs_put_super(struct super_block *sb)
 		return;
 	}
 
-	lock_kernel();
-
 	rc = cifs_umount(sb, cifs_sb);
 	if (rc)
 		cERROR(1, "cifs_umount failed with return code %d", rc);
@@ -215,8 +231,6 @@ cifs_put_super(struct super_block *sb)
 	unload_nls(cifs_sb->local_nls);
 	bdi_destroy(&cifs_sb->bdi);
 	kfree(cifs_sb);
-
-	unlock_kernel();
 }
 
 static int
@@ -224,7 +238,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 	int rc = -EOPNOTSUPP;
 	int xid;
 
@@ -304,7 +318,6 @@ cifs_alloc_inode(struct super_block *sb)
 		return NULL;
 	cifs_inode->cifsAttrs = 0x20;	/* default */
 	cifs_inode->time = 0;
-	cifs_inode->write_behind_rc = 0;
 	/* Until the file is open and we have gotten oplock
 	info back from the server, can not assume caching of
 	file data or metadata */
@@ -329,6 +342,14 @@ cifs_destroy_inode(struct inode *inode)
 }
 
 static void
+cifs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+	cifs_fscache_release_inode_cookie(inode);
+}
+
+static void
 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 {
 	seq_printf(s, ",addr=");
@@ -358,14 +379,36 @@ static int
 cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct sockaddr *srcaddr;
+	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
 	seq_printf(s, ",unc=%s", tcon->treeName);
-	if (tcon->ses->userName)
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
+		seq_printf(s, ",multiuser");
+	else if (tcon->ses->userName)
 		seq_printf(s, ",username=%s", tcon->ses->userName);
+
 	if (tcon->ses->domainName)
 		seq_printf(s, ",domain=%s", tcon->ses->domainName);
 
+	if (srcaddr->sa_family != AF_UNSPEC) {
+		struct sockaddr_in *saddr4;
+		struct sockaddr_in6 *saddr6;
+		saddr4 = (struct sockaddr_in *)srcaddr;
+		saddr6 = (struct sockaddr_in6 *)srcaddr;
+		if (srcaddr->sa_family == AF_INET6)
+			seq_printf(s, ",srcaddr=%pI6c",
+				   &saddr6->sin6_addr);
+		else if (srcaddr->sa_family == AF_INET)
+			seq_printf(s, ",srcaddr=%pI4",
+				   &saddr4->sin_addr.s_addr);
+		else
+			seq_printf(s, ",srcaddr=BAD-AF:%i",
+				   (int)(srcaddr->sa_family));
+	}
+
 	seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
 		seq_printf(s, ",forceuid");
@@ -414,6 +457,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 		seq_printf(s, ",dynperm");
 	if (m->mnt_sb->s_flags & MS_POSIXACL)
 		seq_printf(s, ",acl");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		seq_printf(s, ",mfsymlinks");
 
 	seq_printf(s, ",rsize=%d", cifs_sb->rsize);
 	seq_printf(s, ",wsize=%d", cifs_sb->wsize);
@@ -429,20 +474,18 @@ static void cifs_umount_begin(struct super_block *sb)
 	if (cifs_sb == NULL)
 		return;
 
-	tcon = cifs_sb->tcon;
-	if (tcon == NULL)
-		return;
+	tcon = cifs_sb_master_tcon(cifs_sb);
 
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
 		/* we have other mounts to same share or we have
 		   already tried to force umount this and woken up
 		   all waiting network requests, nothing to do */
-		read_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	} else if (tcon->tc_count == 1)
 		tcon->tidStatus = CifsExiting;
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
@@ -473,14 +516,13 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-void cifs_drop_inode(struct inode *inode)
+static int cifs_drop_inode(struct inode *inode)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-		return generic_drop_inode(inode);
-
-	return generic_delete_inode(inode);
+	/* no serverino => unconditional eviction */
+	return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
+		generic_drop_inode(inode);
 }
 
 static const struct super_operations cifs_super_ops = {
@@ -489,6 +531,7 @@ static const struct super_operations cifs_super_ops = {
 	.alloc_inode = cifs_alloc_inode,
 	.destroy_inode = cifs_destroy_inode,
 	.drop_inode	= cifs_drop_inode,
+	.evict_inode	= cifs_evict_inode,
 /*	.delete_inode	= cifs_delete_inode,  */ /* Do not need above
 	function unless later we add lazy close of inodes or unless the
 	kernel forgets to call us with the same number of releases (closes)
@@ -501,28 +544,29 @@ static const struct super_operations cifs_super_ops = {
 #endif
 };
 
-static int
-cifs_get_sb(struct file_system_type *fs_type,
-	    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *
+cifs_do_mount(struct file_system_type *fs_type,
+	    int flags, const char *dev_name, void *data)
 {
 	int rc;
-	struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
+	struct super_block *sb;
+
+	sb = sget(fs_type, NULL, set_anon_super, NULL);
 
 	cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
 
 	if (IS_ERR(sb))
-		return PTR_ERR(sb);
+		return ERR_CAST(sb);
 
 	sb->s_flags = flags;
 
 	rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
 	if (rc) {
 		deactivate_locked_super(sb);
-		return rc;
+		return ERR_PTR(rc);
 	}
 	sb->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 }
 
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -557,9 +601,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-	/* note that this is called by vfs setlease with the BKL held
-	   although I doubt that BKL is needed here in cifs */
+	/* note that this is called by vfs setlease with lock_flocks held
+	   to protect *lease from going away */
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifsFileInfo *cfile = file->private_data;
 
 	if (!(S_ISREG(inode->i_mode)))
 		return -EINVAL;
@@ -570,8 +615,8 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 	    ((arg == F_WRLCK) &&
 	     (CIFS_I(inode)->clientCanCacheAll)))
 		return generic_setlease(file, arg, lease);
-	else if (CIFS_SB(inode->i_sb)->tcon->local_lease &&
+	else if (tlink_tcon(cfile->tlink)->local_lease &&
 		 !CIFS_I(inode)->clientCanCacheRead)
 		/* If the server claims to support oplock on this
 		   file, then we still need to check oplock even
 		   if the local_lease mount option is set, but there
@@ -587,7 +632,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 struct file_system_type cifs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "cifs",
-	.get_sb = cifs_get_sb,
+	.mount = cifs_do_mount,
 	.kill_sb = kill_anon_super,
 	/*  .fs_flags */
 };
@@ -890,8 +935,8 @@ init_cifs(void)
 	GlobalTotalActiveXid = 0;
 	GlobalMaxActiveXid = 0;
 	memset(Local_System_Name, 0, 15);
-	rwlock_init(&GlobalSMBSeslock);
-	rwlock_init(&cifs_tcp_ses_lock);
+	spin_lock_init(&cifs_tcp_ses_lock);
+	spin_lock_init(&cifs_file_list_lock);
 	spin_lock_init(&GlobalMid_Lock);
 
 	if (cifs_max_pending < 2) {
@@ -902,10 +947,14 @@ init_cifs(void)
 		cFYI(1, "cifs_max_pending set to max of 256");
 	}
 
-	rc = cifs_init_inodecache();
+	rc = cifs_fscache_register();
 	if (rc)
 		goto out_clean_proc;
 
+	rc = cifs_init_inodecache();
+	if (rc)
+		goto out_unreg_fscache;
+
 	rc = cifs_init_mids();
 	if (rc)
 		goto out_destroy_inodecache;
@@ -922,34 +971,22 @@ init_cifs(void)
922 if (rc) 971 if (rc)
923 goto out_unregister_filesystem; 972 goto out_unregister_filesystem;
924#endif 973#endif
925#ifdef CONFIG_CIFS_DFS_UPCALL
926 rc = cifs_init_dns_resolver();
927 if (rc)
928 goto out_unregister_key_type;
929#endif
930 rc = slow_work_register_user(THIS_MODULE);
931 if (rc)
932 goto out_unregister_resolver_key;
933 974
934 return 0; 975 return 0;
935 976
936 out_unregister_resolver_key:
937#ifdef CONFIG_CIFS_DFS_UPCALL
938 cifs_exit_dns_resolver();
939 out_unregister_key_type:
940#endif
941#ifdef CONFIG_CIFS_UPCALL 977#ifdef CONFIG_CIFS_UPCALL
942 unregister_key_type(&cifs_spnego_key_type); 978out_unregister_filesystem:
943 out_unregister_filesystem:
944#endif
945 unregister_filesystem(&cifs_fs_type); 979 unregister_filesystem(&cifs_fs_type);
946 out_destroy_request_bufs: 980#endif
981out_destroy_request_bufs:
947 cifs_destroy_request_bufs(); 982 cifs_destroy_request_bufs();
948 out_destroy_mids: 983out_destroy_mids:
949 cifs_destroy_mids(); 984 cifs_destroy_mids();
950 out_destroy_inodecache: 985out_destroy_inodecache:
951 cifs_destroy_inodecache(); 986 cifs_destroy_inodecache();
952 out_clean_proc: 987out_unreg_fscache:
988 cifs_fscache_unregister();
989out_clean_proc:
953 cifs_proc_clean(); 990 cifs_proc_clean();
954 return rc; 991 return rc;
955} 992}
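The reordering above registers fscache before the inode cache so the error labels can unwind in exact reverse order of initialization. A self-contained sketch of that goto-unwind idiom, with hypothetical register_a/b/c pairs standing in for the real cifs subsystems:

    #include <stdio.h>

    /* Hypothetical init/teardown pairs; register_c() simulates a failure. */
    static int register_a(void) { return 0; }
    static int register_b(void) { return 0; }
    static int register_c(void) { return -1; }
    static void unregister_a(void) { puts("unregister_a"); }
    static void unregister_b(void) { puts("unregister_b"); }

    static int example_init(void)
    {
        int rc;

        rc = register_a();          /* registered first ... */
        if (rc)
            return rc;
        rc = register_b();
        if (rc)
            goto out_unreg_a;
        rc = register_c();
        if (rc)
            goto out_unreg_b;
        return 0;

    out_unreg_b:                    /* labels unwind in reverse order */
        unregister_b();
    out_unreg_a:
        unregister_a();             /* ... torn down last */
        return rc;
    }

    int main(void)
    {
        return example_init() ? 1 : 0;
    }

Each new init step needs exactly one new label, inserted between its predecessor's label and the next one down, which is what the out_unreg_fscache hunk does.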
@@ -959,9 +996,9 @@ exit_cifs(void)
959{ 996{
960 cFYI(DBG2, "exit_cifs"); 997 cFYI(DBG2, "exit_cifs");
961 cifs_proc_clean(); 998 cifs_proc_clean();
999 cifs_fscache_unregister();
962#ifdef CONFIG_CIFS_DFS_UPCALL 1000#ifdef CONFIG_CIFS_DFS_UPCALL
963 cifs_dfs_release_automount_timer(); 1001 cifs_dfs_release_automount_timer();
964 cifs_exit_dns_resolver();
965#endif 1002#endif
966#ifdef CONFIG_CIFS_UPCALL 1003#ifdef CONFIG_CIFS_UPCALL
967 unregister_key_type(&cifs_spnego_key_type); 1004 unregister_key_type(&cifs_spnego_key_type);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a7eb65c84b1..897b2b2b28b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -42,10 +42,8 @@ extern const struct address_space_operations cifs_addr_ops;
42extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
43 43
44/* Functions related to super block operations */ 44/* Functions related to super block operations */
45/* extern const struct super_operations cifs_super_ops;*/ 45extern void cifs_sb_active(struct super_block *sb);
46extern void cifs_read_inode(struct inode *); 46extern void cifs_sb_deactive(struct super_block *sb);
47/*extern void cifs_delete_inode(struct inode *);*/ /* BB not needed yet */
48/* extern void cifs_write_inode(struct inode *); */ /* BB not needed yet */
49 47
50/* Functions related to inodes */ 48/* Functions related to inodes */
51extern const struct inode_operations cifs_dir_inode_ops; 49extern const struct inode_operations cifs_dir_inode_ops;
@@ -104,7 +102,7 @@ extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
104extern int cifs_symlink(struct inode *inode, struct dentry *direntry, 102extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
105 const char *symname); 103 const char *symname);
106extern int cifs_removexattr(struct dentry *, const char *); 104extern int cifs_removexattr(struct dentry *, const char *);
107extern int cifs_setxattr(struct dentry *, const char *, const void *, 105extern int cifs_setxattr(struct dentry *, const char *, const void *,
108 size_t, int); 106 size_t, int);
109extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); 107extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
110extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 108extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
@@ -114,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
114extern const struct export_operations cifs_export_ops; 112extern const struct export_operations cifs_export_ops;
115#endif /* EXPERIMENTAL */ 113#endif /* EXPERIMENTAL */
116 114
117#define CIFS_VERSION "1.64" 115#define CIFS_VERSION "1.68"
118#endif /* _CIFSFS_H */ 116#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a88479ceaad..f259e4d7612 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -16,12 +16,18 @@
16 * the GNU Lesser General Public License for more details. 16 * the GNU Lesser General Public License for more details.
17 * 17 *
18 */ 18 */
19#ifndef _CIFS_GLOB_H
20#define _CIFS_GLOB_H
21
19#include <linux/in.h> 22#include <linux/in.h>
20#include <linux/in6.h> 23#include <linux/in6.h>
21#include <linux/slab.h> 24#include <linux/slab.h>
22#include <linux/slow-work.h> 25#include <linux/workqueue.h>
23#include "cifs_fs_sb.h" 26#include "cifs_fs_sb.h"
24#include "cifsacl.h" 27#include "cifsacl.h"
28#include <crypto/internal/hash.h>
29#include <linux/scatterlist.h>
30
25/* 31/*
26 * The sizes of various internal tables and strings 32 * The sizes of various internal tables and strings
27 */ 33 */
@@ -34,7 +40,7 @@
34#define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ 40#define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */
35#define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null 41#define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null
36 termination then *2 for unicode versions */ 42 termination then *2 for unicode versions */
37#define MAX_PASSWORD_SIZE 16 43#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
38 44
39#define CIFS_MIN_RCV_POOL 4 45#define CIFS_MIN_RCV_POOL 4
40 46
@@ -71,7 +77,7 @@
71 * CIFS vfs client Status information (based on what we know.) 77 * CIFS vfs client Status information (based on what we know.)
72 */ 78 */
73 79
74 /* associated with each tcp and smb session */ 80/* associated with each tcp and smb session */
75enum statusEnum { 81enum statusEnum {
76 CifsNew = 0, 82 CifsNew = 0,
77 CifsGood, 83 CifsGood,
@@ -80,8 +86,7 @@ enum statusEnum {
80}; 86};
81 87
82enum securityEnum { 88enum securityEnum {
83 PLAINTXT = 0, /* Legacy with Plaintext passwords */ 89 LANMAN = 0, /* Legacy LANMAN auth */
84 LANMAN, /* Legacy LANMAN auth */
85 NTLM, /* Legacy NTLM012 auth with NTLM hash */ 90 NTLM, /* Legacy NTLM012 auth with NTLM hash */
86 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 91 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
87 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ 92 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
@@ -95,16 +100,31 @@ enum protocolEnum {
95 /* Netbios frames protocol not supported at this time */ 100 /* Netbios frames protocol not supported at this time */
96}; 101};
97 102
98struct mac_key { 103struct session_key {
99 unsigned int len; 104 unsigned int len;
100 union { 105 char *response;
101 char ntlm[CIFS_SESS_KEY_SIZE + 16]; 106};
102 char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */ 107
103 struct { 108/* crypto security descriptor definition */
104 char key[16]; 109struct sdesc {
105 struct ntlmv2_resp resp; 110 struct shash_desc shash;
106 } ntlmv2; 111 char ctx[];
107 } data; 112};
113
114/* crypto hashing related structure/fields, not specific to a sec mech */
115struct cifs_secmech {
116 struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
117 struct crypto_shash *md5; /* md5 hash function */
118 struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
119 struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
120};
121
122/* per smb session structure/fields */
123struct ntlmssp_auth {
124	__u32 client_flags; /* sent by client in type 1 ntlmssp exchange */
125 __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
126 unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
127 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
108}; 128};
109 129
110struct cifs_cred { 130struct cifs_cred {
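The flexible ctx[] member of struct sdesc above follows the kernel's synchronous-hash (shash) convention: a shash_desc must be allocated with extra room for the transform's private state, sized by crypto_shash_descsize(). A sketch of the typical allocation, assuming the standard <crypto/hash.h> API; the helper name is hypothetical (kernels of this vintage also carried a flags field in shash_desc):

    static struct sdesc *alloc_sdesc(struct crypto_shash *tfm)
    {
        struct sdesc *sdesc;

        sdesc = kmalloc(sizeof(*sdesc) + crypto_shash_descsize(tfm),
                        GFP_KERNEL);
        if (!sdesc)
            return NULL;
        sdesc->shash.tfm = tfm;     /* bind descriptor to its transform */
        sdesc->shash.flags = 0x0;
        return sdesc;
    }

The resulting sdeschmacmd5/sdescmd5 descriptors can then be driven with crypto_shash_init/update/final to produce NTLMv2 hashes and SMB signatures.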
@@ -137,12 +157,12 @@ struct TCP_Server_Info {
137 struct sockaddr_in sockAddr; 157 struct sockaddr_in sockAddr;
138 struct sockaddr_in6 sockAddr6; 158 struct sockaddr_in6 sockAddr6;
139 } addr; 159 } addr;
160 struct sockaddr_storage srcaddr; /* locally bind to this IP */
140 wait_queue_head_t response_q; 161 wait_queue_head_t response_q;
141 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 162 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
142 struct list_head pending_mid_q; 163 struct list_head pending_mid_q;
143 void *Server_NlsInfo; /* BB - placeholder for future NLS info */ 164 void *Server_NlsInfo; /* BB - placeholder for future NLS info */
144 unsigned short server_codepage; /* codepage for the server */ 165 unsigned short server_codepage; /* codepage for the server */
145 unsigned long ip_address; /* IP addr for the server if known */
146 enum protocolEnum protocolType; 166 enum protocolEnum protocolType;
147 char versionMajor; 167 char versionMajor;
148 char versionMinor; 168 char versionMinor;
@@ -177,32 +197,23 @@ struct TCP_Server_Info {
177 int capabilities; /* allow selective disabling of caps by smb sess */ 197 int capabilities; /* allow selective disabling of caps by smb sess */
178 int timeAdj; /* Adjust for difference in server time zone in sec */ 198 int timeAdj; /* Adjust for difference in server time zone in sec */
179 __u16 CurrentMid; /* multiplex id - rotating counter */ 199 __u16 CurrentMid; /* multiplex id - rotating counter */
180 char cryptKey[CIFS_CRYPTO_KEY_SIZE]; 200 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
181 /* 16th byte of RFC1001 workstation name is always null */ 201 /* 16th byte of RFC1001 workstation name is always null */
182 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 202 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
183 __u32 sequence_number; /* needed for CIFS PDU signature */ 203 __u32 sequence_number; /* needed for CIFS PDU signature */
184 struct mac_key mac_signing_key; 204 struct session_key session_key;
185 char ntlmv2_hash[16];
186 unsigned long lstrp; /* when we got last response from this server */ 205 unsigned long lstrp; /* when we got last response from this server */
187 u16 dialect; /* dialect index that server chose */ 206 u16 dialect; /* dialect index that server chose */
207 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
188 /* extended security flavors that server supports */ 208 /* extended security flavors that server supports */
189 bool sec_kerberos; /* supports plain Kerberos */ 209 bool sec_kerberos; /* supports plain Kerberos */
190 bool sec_mskerberos; /* supports legacy MS Kerberos */ 210 bool sec_mskerberos; /* supports legacy MS Kerberos */
191 bool sec_kerberosu2u; /* supports U2U Kerberos */ 211 bool sec_kerberosu2u; /* supports U2U Kerberos */
192 bool sec_ntlmssp; /* supports NTLMSSP */ 212 bool sec_ntlmssp; /* supports NTLMSSP */
193}; 213 bool session_estab; /* mark when very first sess is established */
194 214#ifdef CONFIG_CIFS_FSCACHE
195/* 215 struct fscache_cookie *fscache; /* client index cache cookie */
196 * The following is our shortcut to user information. We surface the uid, 216#endif
197 * and name. We always get the password on the fly in case it
198 * has changed. We also hang a list of sessions owned by this user off here.
199 */
200struct cifsUidInfo {
201 struct list_head userList;
202 struct list_head sessionList; /* SMB sessions for this user */
203 uid_t linux_uid;
204 char user[MAX_USERNAME_SIZE + 1]; /* ascii name of user */
205 /* BB may need ptr or callback for PAM or WinBind info */
206}; 217};
207 218
208/* 219/*
@@ -212,9 +223,6 @@ struct cifsSesInfo {
212 struct list_head smb_ses_list; 223 struct list_head smb_ses_list;
213 struct list_head tcon_list; 224 struct list_head tcon_list;
214 struct mutex session_mutex; 225 struct mutex session_mutex;
215#if 0
216 struct cifsUidInfo *uidInfo; /* pointer to user info */
217#endif
218 struct TCP_Server_Info *server; /* pointer to server info */ 226 struct TCP_Server_Info *server; /* pointer to server info */
219 int ses_count; /* reference counter */ 227 int ses_count; /* reference counter */
220 enum statusEnum status; 228 enum statusEnum status;
@@ -226,13 +234,16 @@ struct cifsSesInfo {
226 char *serverNOS; /* name of network operating system of server */ 234 char *serverNOS; /* name of network operating system of server */
227 char *serverDomain; /* security realm of server */ 235 char *serverDomain; /* security realm of server */
228 int Suid; /* remote smb uid */ 236 int Suid; /* remote smb uid */
229 uid_t linux_uid; /* local Linux uid */ 237 uid_t linux_uid; /* overriding owner of files on the mount */
238 uid_t cred_uid; /* owner of credentials */
230 int capabilities; 239 int capabilities;
231 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for 240 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
232 TCP names - will ipv6 and sctp addresses fit? */ 241 TCP names - will ipv6 and sctp addresses fit? */
233 char userName[MAX_USERNAME_SIZE + 1]; 242 char userName[MAX_USERNAME_SIZE + 1];
234 char *domainName; 243 char *domainName;
235 char *password; 244 char *password;
245 struct session_key auth_key;
246 struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
236 bool need_reconnect:1; /* connection reset, uid now invalid */ 247 bool need_reconnect:1; /* connection reset, uid now invalid */
237}; 248};
238/* no more than one of the following three session flags may be set */ 249/* no more than one of the following three session flags may be set */
@@ -311,10 +322,52 @@ struct cifsTconInfo {
311 bool local_lease:1; /* check leases (only) on local system not remote */ 322 bool local_lease:1; /* check leases (only) on local system not remote */
312 bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ 323 bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
313 bool need_reconnect:1; /* connection reset, tid now invalid */ 324 bool need_reconnect:1; /* connection reset, tid now invalid */
325#ifdef CONFIG_CIFS_FSCACHE
326 u64 resource_id; /* server resource id */
327 struct fscache_cookie *fscache; /* cookie for share */
328#endif
314 /* BB add field for back pointer to sb struct(s)? */ 329 /* BB add field for back pointer to sb struct(s)? */
315}; 330};
316 331
317/* 332/*
333 * This is a refcounted and timestamped container for a tcon pointer. The
334 * container holds a tcon reference. It is considered safe to free one of
335 * these when the tl_count goes to 0. The tl_time is the time of the last
336 * "get" on the container.
337 */
338struct tcon_link {
339 unsigned long tl_index;
340 unsigned long tl_flags;
341#define TCON_LINK_MASTER 0
342#define TCON_LINK_PENDING 1
343#define TCON_LINK_IN_TREE 2
344 unsigned long tl_time;
345 atomic_t tl_count;
346 struct cifsTconInfo *tl_tcon;
347};
348
349extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
350
351static inline struct cifsTconInfo *
352tlink_tcon(struct tcon_link *tlink)
353{
354 return tlink->tl_tcon;
355}
356
357extern void cifs_put_tlink(struct tcon_link *tlink);
358
359static inline struct tcon_link *
360cifs_get_tlink(struct tcon_link *tlink)
361{
362 if (tlink && !IS_ERR(tlink))
363 atomic_inc(&tlink->tl_count);
364 return tlink;
365}
366
367/* This function is always expected to succeed */
368extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
369
370/*
318 * This info hangs off the cifsFileInfo structure, pointed to by llist. 371 * This info hangs off the cifsFileInfo structure, pointed to by llist.
319 * This is used to track byte stream locks on the file 372 * This is used to track byte stream locks on the file
320 */ 373 */
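The block above amounts to a small refcounting API around tcon_link: cifs_sb_tlink() hands back a referenced link, tlink_tcon() borrows the underlying tcon, and cifs_put_tlink() drops the reference so an eventual pruner can reap idle links. A hedged sketch of the intended caller pattern (cifs_sb stands for any mounted cifs superblock info; error handling trimmed):

    struct tcon_link *tlink;
    struct cifsTconInfo *tcon;

    tlink = cifs_sb_tlink(cifs_sb);     /* looks up and takes a reference */
    if (IS_ERR(tlink))
        return PTR_ERR(tlink);
    tcon = tlink_tcon(tlink);           /* borrowed pointer, no extra ref */
    /* ... issue SMB requests against tcon ... */
    cifs_put_tlink(tlink);              /* drop the reference when done */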
@@ -352,34 +405,29 @@ struct cifsFileInfo {
352 __u16 netfid; /* file id from remote */ 405 __u16 netfid; /* file id from remote */
353 /* BB add lock scope info here if needed */ ; 406 /* BB add lock scope info here if needed */ ;
354 /* lock scope id (0 if none) */ 407 /* lock scope id (0 if none) */
355 struct file *pfile; /* needed for writepage */ 408 struct dentry *dentry;
356 struct inode *pInode; /* needed for oplock break */ 409 unsigned int f_flags;
357 struct vfsmount *mnt; 410 struct tcon_link *tlink;
358 struct mutex lock_mutex; 411 struct mutex lock_mutex;
359 struct list_head llist; /* list of byte range locks we have. */ 412 struct list_head llist; /* list of byte range locks we have. */
360 bool closePend:1; /* file is marked to close */
361 bool invalidHandle:1; /* file closed via session abend */ 413 bool invalidHandle:1; /* file closed via session abend */
362 bool oplock_break_cancelled:1; 414 bool oplock_break_cancelled:1;
363 atomic_t count; /* reference count */ 415 int count; /* refcount protected by cifs_file_list_lock */
364 struct mutex fh_mutex; /* prevents reopen race after dead ses*/ 416 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
365 struct cifs_search_info srch_inf; 417 struct cifs_search_info srch_inf;
366 struct slow_work oplock_break; /* slow_work job for oplock breaks */ 418 struct work_struct oplock_break; /* work for oplock breaks */
367}; 419};
368 420
369/* Take a reference on the file private data */ 421/*
422 * Take a reference on the file private data. Must be called with
423 * cifs_file_list_lock held.
424 */
370static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) 425static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
371{ 426{
372 atomic_inc(&cifs_file->count); 427 ++cifs_file->count;
373} 428}
374 429
375/* Release a reference on the file private data */ 430void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
376static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
377{
378 if (atomic_dec_and_test(&cifs_file->count)) {
379 iput(cifs_file->pInode);
380 kfree(cifs_file);
381 }
382}
383 431
384/* 432/*
385 * One of these for each file inode 433 * One of these for each file inode
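With the refcount demoted from an atomic_t to a plain int, cifs_file_list_lock becomes the serialization point: cifsFileInfo_get() must be called with the lock held, and cifsFileInfo_put() moves out of line because the final release has to unlink the file from its lists and then drop the lock before doing anything that may sleep. A sketch of that contract (illustrative, not the exact cifs body):

    spin_lock(&cifs_file_list_lock);
    cifsFileInfo_get(cifs_file);        /* ++count, lock already held */
    spin_unlock(&cifs_file_list_lock);

    /* cifsFileInfo_put(), roughly: */
    spin_lock(&cifs_file_list_lock);
    if (--cifs_file->count > 0) {
        spin_unlock(&cifs_file_list_lock);
        return;
    }
    /* last reference: unlink from flist/tlist, then release the lock
     * before dput(), kfree() and anything else that may sleep */
    spin_unlock(&cifs_file_list_lock);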
@@ -389,7 +437,6 @@ struct cifsInodeInfo {
389 struct list_head lockList; 437 struct list_head lockList;
390 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 438 /* BB add in lists for dirty pages i.e. write caching info for oplock */
391 struct list_head openFileList; 439 struct list_head openFileList;
392 int write_behind_rc;
393 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 440 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
394 unsigned long time; /* jiffies of last update/check of inode */ 441 unsigned long time; /* jiffies of last update/check of inode */
395 bool clientCanCacheRead:1; /* read oplock */ 442 bool clientCanCacheRead:1; /* read oplock */
@@ -398,6 +445,9 @@ struct cifsInodeInfo {
398 bool invalid_mapping:1; /* pagecache is invalid */ 445 bool invalid_mapping:1; /* pagecache is invalid */
399 u64 server_eof; /* current file size on server */ 446 u64 server_eof; /* current file size on server */
400 u64 uniqueid; /* server inode number */ 447 u64 uniqueid; /* server inode number */
448#ifdef CONFIG_CIFS_FSCACHE
449 struct fscache_cookie *fscache;
450#endif
401 struct inode vfs_inode; 451 struct inode vfs_inode;
402}; 452};
403 453
@@ -478,16 +528,16 @@ struct oplock_q_entry {
478 528
479/* for pending dnotify requests */ 529/* for pending dnotify requests */
480struct dir_notify_req { 530struct dir_notify_req {
481 struct list_head lhead; 531 struct list_head lhead;
482 __le16 Pid; 532 __le16 Pid;
483 __le16 PidHigh; 533 __le16 PidHigh;
484 __u16 Mid; 534 __u16 Mid;
485 __u16 Tid; 535 __u16 Tid;
486 __u16 Uid; 536 __u16 Uid;
487 __u16 netfid; 537 __u16 netfid;
488 __u32 filter; /* CompletionFilter (for multishot) */ 538 __u32 filter; /* CompletionFilter (for multishot) */
489 int multishot; 539 int multishot;
490 struct file *pfile; 540 struct file *pfile;
491}; 541};
492 542
493struct dfs_info3_param { 543struct dfs_info3_param {
@@ -637,7 +687,7 @@ require use of the stronger protocol */
637 * GlobalMid_Lock protects: 687 * GlobalMid_Lock protects:
638 * list operations on pending_mid_q and oplockQ 688 * list operations on pending_mid_q and oplockQ
639 * updates to XID counters, multiplex id and SMB sequence numbers 689 * updates to XID counters, multiplex id and SMB sequence numbers
640 * GlobalSMBSesLock protects: 690 * cifs_file_list_lock protects:
641 * list operations on tcp and SMB session lists and tCon lists 691 * list operations on tcp and SMB session lists and tCon lists
642 * f_owner.lock protects certain per file struct operations 692 * f_owner.lock protects certain per file struct operations
643 * mapping->page_lock protects certain per page operations 693 * mapping->page_lock protects certain per page operations
@@ -671,7 +721,7 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
671 * the reference counters for the server, smb session, and tcon. Finally, 721 * the reference counters for the server, smb session, and tcon. Finally,
672 * changes to the tcon->tidStatus should be done while holding this lock. 722 * changes to the tcon->tidStatus should be done while holding this lock.
673 */ 723 */
674GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; 724GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
675 725
676/* 726/*
677 * This lock protects the cifs_file->llist and cifs_file->flist 727 * This lock protects the cifs_file->llist and cifs_file->flist
@@ -680,7 +730,7 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
680 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then 730 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
681 * the cifs_tcp_ses_lock must be grabbed first and released last. 731 * the cifs_tcp_ses_lock must be grabbed first and released last.
682 */ 732 */
683GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; 733GLOBAL_EXTERN spinlock_t cifs_file_list_lock;
684 734
685/* Outstanding dir notify requests */ 735/* Outstanding dir notify requests */
686GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 736GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
@@ -732,4 +782,10 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
732GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 782GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
733GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ 783GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
734 784
785void cifs_oplock_break(struct work_struct *work);
786void cifs_oplock_break_get(struct cifsFileInfo *cfile);
787void cifs_oplock_break_put(struct cifsFileInfo *cfile);
788
735extern const struct slow_work_ops cifs_oplock_break_ops; 789extern const struct slow_work_ops cifs_oplock_break_ops;
790
791#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db1..de36b09763a 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -131,9 +131,20 @@
131#define CIFS_CRYPTO_KEY_SIZE (8) 131#define CIFS_CRYPTO_KEY_SIZE (8)
132 132
133/* 133/*
134 * Size of the ntlm client response
135 */
136#define CIFS_AUTH_RESP_SIZE (24)
137
138/*
134 * Size of the session key (crypto key encrypted with the password) 139 * Size of the session key (crypto key encrypted with the password)
135 */ 140 */
136#define CIFS_SESS_KEY_SIZE (24) 141#define CIFS_SESS_KEY_SIZE (16)
142
143#define CIFS_CLIENT_CHALLENGE_SIZE (8)
144#define CIFS_SERVER_CHALLENGE_SIZE (8)
145#define CIFS_HMAC_MD5_HASH_SIZE (16)
146#define CIFS_CPHTXT_SIZE (16)
147#define CIFS_NTHASH_SIZE (16)
137 148
138/* 149/*
139 * Maximum user name length 150 * Maximum user name length
@@ -663,7 +674,6 @@ struct ntlmv2_resp {
663 __le64 time; 674 __le64 time;
664 __u64 client_chal; /* random */ 675 __u64 client_chal; /* random */
665 __u32 reserved2; 676 __u32 reserved2;
666 struct ntlmssp2_name names[2];
667 /* array of name entries could follow ending in minimum 4 byte struct */ 677 /* array of name entries could follow ending in minimum 4 byte struct */
668} __attribute__((packed)); 678} __attribute__((packed));
669 679
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fb6318b8150..edb6d90efdf 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,15 +78,18 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
78extern bool is_valid_oplock_break(struct smb_hdr *smb, 78extern bool is_valid_oplock_break(struct smb_hdr *smb,
79 struct TCP_Server_Info *); 79 struct TCP_Server_Info *);
80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); 81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
82#ifdef CONFIG_CIFS_EXPERIMENTAL 82#ifdef CONFIG_CIFS_EXPERIMENTAL
83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *); 83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
84#endif 84#endif
85extern unsigned int smbCalcSize(struct smb_hdr *ptr); 85extern unsigned int smbCalcSize(struct smb_hdr *ptr);
86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
87extern int decode_negTokenInit(unsigned char *security_blob, int length, 87extern int decode_negTokenInit(unsigned char *security_blob, int length,
88 struct TCP_Server_Info *server); 88 struct TCP_Server_Info *server);
89extern int cifs_convert_address(char *src, void *dst); 89extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
90extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
91extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
92 const unsigned short int port);
90extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); 93extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
91extern void header_assemble(struct smb_hdr *, char /* command */ , 94extern void header_assemble(struct smb_hdr *, char /* command */ ,
92 const struct cifsTconInfo *, int /* length of 95 const struct cifsTconInfo *, int /* length of
@@ -102,12 +105,12 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
102extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
103 int offset); 106 int offset);
104 107
105extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode, 108extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
106 __u16 fileHandle, struct file *file, 109 struct file *file, struct tcon_link *tlink,
107 struct vfsmount *mnt, unsigned int oflags); 110 __u32 oplock);
108extern int cifs_posix_open(char *full_path, struct inode **pinode, 111extern int cifs_posix_open(char *full_path, struct inode **pinode,
109 struct super_block *sb, 112 struct super_block *sb,
110 int mode, int oflags, 113 int mode, unsigned int f_flags,
111 __u32 *poplock, __u16 *pnetfid, int xid); 114 __u32 *poplock, __u16 *pnetfid, int xid);
112void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr); 115void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
113extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 116extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
@@ -359,15 +362,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
359extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, 362extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
360 __u32 *); 363 __u32 *);
361extern int cifs_verify_signature(struct smb_hdr *, 364extern int cifs_verify_signature(struct smb_hdr *,
362 const struct mac_key *mac_key, 365 struct TCP_Server_Info *server,
363 __u32 expected_sequence_number); 366 __u32 expected_sequence_number);
364extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 367extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
365 const char *pass); 368extern int setup_ntlm_response(struct cifsSesInfo *);
366extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, 369extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
367 const struct nls_table *); 370extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
368extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); 371extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
369extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, 372extern int calc_seckey(struct cifsSesInfo *);
370 const struct nls_table *); 373
371#ifdef CONFIG_CIFS_WEAK_PW_HASH 374#ifdef CONFIG_CIFS_WEAK_PW_HASH
372extern void calc_lanman_hash(const char *password, const char *cryptkey, 375extern void calc_lanman_hash(const char *password, const char *cryptkey,
373 bool encrypt, char *lnm_session_key); 376 bool encrypt, char *lnm_session_key);
@@ -407,4 +410,8 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
407extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, 410extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
408 const int netfid, __u64 *pExtAttrBits, __u64 *pMask); 411 const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
409extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); 412extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
413extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
414extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
415 const unsigned char *path,
416 struct cifs_sb_info *cifs_sb, int xid);
410#endif /* _CIFSPROTO_H */ 417#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd3..2f2632b6df5 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -91,13 +91,13 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
91 struct list_head *tmp1; 91 struct list_head *tmp1;
92 92
93/* list all files open on tree connection and mark them invalid */ 93/* list all files open on tree connection and mark them invalid */
94 write_lock(&GlobalSMBSeslock); 94 spin_lock(&cifs_file_list_lock);
95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { 95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
96 open_file = list_entry(tmp, struct cifsFileInfo, tlist); 96 open_file = list_entry(tmp, struct cifsFileInfo, tlist);
97 open_file->invalidHandle = true; 97 open_file->invalidHandle = true;
98 open_file->oplock_break_cancelled = true; 98 open_file->oplock_break_cancelled = true;
99 } 99 }
100 write_unlock(&GlobalSMBSeslock); 100 spin_unlock(&cifs_file_list_lock);
101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted 101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted
102 to this tcon */ 102 to this tcon */
103} 103}
@@ -232,7 +232,7 @@ static int
232small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 232small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
233 void **request_buf) 233 void **request_buf)
234{ 234{
235 int rc = 0; 235 int rc;
236 236
237 rc = cifs_reconnect_tcon(tcon, smb_command); 237 rc = cifs_reconnect_tcon(tcon, smb_command);
238 if (rc) 238 if (rc)
@@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
250 if (tcon != NULL) 250 if (tcon != NULL)
251 cifs_stats_inc(&tcon->num_smbs_sent); 251 cifs_stats_inc(&tcon->num_smbs_sent);
252 252
253 return rc; 253 return 0;
254} 254}
255 255
256int 256int
@@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
281 281
282/* If the return code is zero, this function must fill in request_buf pointer */ 282/* If the return code is zero, this function must fill in request_buf pointer */
283static int 283static int
284smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 284__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
285 void **request_buf /* returned */ , 285 void **request_buf, void **response_buf)
286 void **response_buf /* returned */ )
287{ 286{
288 int rc = 0;
289
290 rc = cifs_reconnect_tcon(tcon, smb_command);
291 if (rc)
292 return rc;
293
294 *request_buf = cifs_buf_get(); 287 *request_buf = cifs_buf_get();
295 if (*request_buf == NULL) { 288 if (*request_buf == NULL) {
296 /* BB should we add a retry in here if not a writepage? */ 289 /* BB should we add a retry in here if not a writepage? */
@@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
309 if (tcon != NULL) 302 if (tcon != NULL)
310 cifs_stats_inc(&tcon->num_smbs_sent); 303 cifs_stats_inc(&tcon->num_smbs_sent);
311 304
312 return rc; 305 return 0;
306}
307
308/* If the return code is zero, this function must fill in request_buf pointer */
309static int
310smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
311 void **request_buf, void **response_buf)
312{
313 int rc;
314
315 rc = cifs_reconnect_tcon(tcon, smb_command);
316 if (rc)
317 return rc;
318
319 return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
320}
321
322static int
323smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
324 void **request_buf, void **response_buf)
325{
326 if (tcon->ses->need_reconnect || tcon->need_reconnect)
327 return -EHOSTDOWN;
328
329 return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
313} 330}
314 331
315static int validate_t2(struct smb_t2_rsp *pSMB) 332static int validate_t2(struct smb_t2_rsp *pSMB)
@@ -486,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
486 503
487 if (rsp->EncryptionKeyLength == 504 if (rsp->EncryptionKeyLength ==
488 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { 505 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
489 memcpy(server->cryptKey, rsp->EncryptionKey, 506 memcpy(ses->server->cryptkey, rsp->EncryptionKey,
490 CIFS_CRYPTO_KEY_SIZE); 507 CIFS_CRYPTO_KEY_SIZE);
491 } else if (server->secMode & SECMODE_PW_ENCRYPT) { 508 } else if (server->secMode & SECMODE_PW_ENCRYPT) {
492 rc = -EIO; /* need cryptkey unless plain text */ 509 rc = -EIO; /* need cryptkey unless plain text */
@@ -557,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
557 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 574 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
558 server->timeAdj *= 60; 575 server->timeAdj *= 60;
559 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { 576 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
560 memcpy(server->cryptKey, pSMBr->u.EncryptionKey, 577 memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
561 CIFS_CRYPTO_KEY_SIZE); 578 CIFS_CRYPTO_KEY_SIZE);
562 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) 579 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
563 && (pSMBr->EncryptionKeyLength == 0)) { 580 && (pSMBr->EncryptionKeyLength == 0)) {
@@ -576,9 +593,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
576 rc = -EIO; 593 rc = -EIO;
577 goto neg_err_exit; 594 goto neg_err_exit;
578 } 595 }
579 read_lock(&cifs_tcp_ses_lock); 596 spin_lock(&cifs_tcp_ses_lock);
580 if (server->srv_count > 1) { 597 if (server->srv_count > 1) {
581 read_unlock(&cifs_tcp_ses_lock); 598 spin_unlock(&cifs_tcp_ses_lock);
582 if (memcmp(server->server_GUID, 599 if (memcmp(server->server_GUID,
583 pSMBr->u.extended_response. 600 pSMBr->u.extended_response.
584 GUID, 16) != 0) { 601 GUID, 16) != 0) {
@@ -588,7 +605,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
588 16); 605 16);
589 } 606 }
590 } else { 607 } else {
591 read_unlock(&cifs_tcp_ses_lock); 608 spin_unlock(&cifs_tcp_ses_lock);
592 memcpy(server->server_GUID, 609 memcpy(server->server_GUID,
593 pSMBr->u.extended_response.GUID, 16); 610 pSMBr->u.extended_response.GUID, 16);
594 } 611 }
@@ -603,13 +620,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
603 rc = 0; 620 rc = 0;
604 else 621 else
605 rc = -EINVAL; 622 rc = -EINVAL;
606 623 if (server->secType == Kerberos) {
607 if (server->sec_kerberos || server->sec_mskerberos) 624 if (!server->sec_kerberos &&
608 server->secType = Kerberos; 625 !server->sec_mskerberos)
609 else if (server->sec_ntlmssp) 626 rc = -EOPNOTSUPP;
610 server->secType = RawNTLMSSP; 627 } else if (server->secType == RawNTLMSSP) {
611 else 628 if (!server->sec_ntlmssp)
612 rc = -EOPNOTSUPP; 629 rc = -EOPNOTSUPP;
630 } else
631 rc = -EOPNOTSUPP;
613 } 632 }
614 } else 633 } else
615 server->capabilities &= ~CAP_EXTENDED_SECURITY; 634 server->capabilities &= ~CAP_EXTENDED_SECURITY;
@@ -4534,8 +4553,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
4534 4553
4535 cFYI(1, "In QFSUnixInfo"); 4554 cFYI(1, "In QFSUnixInfo");
4536QFSUnixRetry: 4555QFSUnixRetry:
4537 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4556 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4538 (void **) &pSMBr); 4557 (void **) &pSMB, (void **) &pSMBr);
4539 if (rc) 4558 if (rc)
4540 return rc; 4559 return rc;
4541 4560
@@ -4604,8 +4623,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
4604 cFYI(1, "In SETFSUnixInfo"); 4623 cFYI(1, "In SETFSUnixInfo");
4605SETFSUnixRetry: 4624SETFSUnixRetry:
4606 /* BB switch to small buf init to save memory */ 4625 /* BB switch to small buf init to save memory */
4607 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4626 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4608 (void **) &pSMBr); 4627 (void **) &pSMB, (void **) &pSMBr);
4609 if (rc) 4628 if (rc)
4610 return rc; 4629 return rc;
4611 4630
diff --git a/fs/cifs/cn_cifs.h b/fs/cifs/cn_cifs.h
deleted file mode 100644
index ea59ccac2eb..00000000000
--- a/fs/cifs/cn_cifs.h
+++ /dev/null
@@ -1,37 +0,0 @@
1/*
2 * fs/cifs/cn_cifs.h
3 *
4 * Copyright (c) International Business Machines Corp., 2002
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifndef _CN_CIFS_H
23#define _CN_CIFS_H
24#ifdef CONFIG_CIFS_UPCALL
25#include <linux/types.h>
26#include <linux/connector.h>
27
28struct cifs_upcall {
29 char signature[4]; /* CIFS */
30 enum command {
31 CIFS_GET_IP = 0x00000001, /* get ip address for hostname */
32 CIFS_GET_SECBLOB = 0x00000002, /* get SPNEGO wrapped blob */
33 } command;
34 /* union cifs upcall data follows */
35};
36#endif /* CIFS_UPCALL */
37#endif /* _CN_CIFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2208f06e4c4..9eb327defa1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -47,7 +47,7 @@
47#include "ntlmssp.h" 47#include "ntlmssp.h"
48#include "nterr.h" 48#include "nterr.h"
49#include "rfc1002pdu.h" 49#include "rfc1002pdu.h"
50#include "cn_cifs.h" 50#include "fscache.h"
51 51
52#define CIFS_PORT 445 52#define CIFS_PORT 445
53#define RFC1001_PORT 139 53#define RFC1001_PORT 139
@@ -66,6 +66,7 @@ struct smb_vol {
66 char *iocharset; /* local code page for mapping to and from Unicode */ 66 char *iocharset; /* local code page for mapping to and from Unicode */
67 char source_rfc1001_name[16]; /* netbios name of client */ 67 char source_rfc1001_name[16]; /* netbios name of client */
68 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ 68 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
69 uid_t cred_uid;
69 uid_t linux_uid; 70 uid_t linux_uid;
70 gid_t linux_gid; 71 gid_t linux_gid;
71 mode_t file_mode; 72 mode_t file_mode;
@@ -97,16 +98,25 @@ struct smb_vol {
97 bool noblocksnd:1; 98 bool noblocksnd:1;
98 bool noautotune:1; 99 bool noautotune:1;
99 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ 100 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
101 bool fsc:1; /* enable fscache */
102 bool mfsymlinks:1; /* use Minshall+French Symlinks */
103 bool multiuser:1;
100 unsigned int rsize; 104 unsigned int rsize;
101 unsigned int wsize; 105 unsigned int wsize;
102 bool sockopt_tcp_nodelay:1; 106 bool sockopt_tcp_nodelay:1;
103 unsigned short int port; 107 unsigned short int port;
104 char *prepath; 108 char *prepath;
109 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
105 struct nls_table *local_nls; 110 struct nls_table *local_nls;
106}; 111};
107 112
113/* FIXME: should these be tunable? */
114#define TLINK_ERROR_EXPIRE (1 * HZ)
115#define TLINK_IDLE_EXPIRE (600 * HZ)
116
108static int ipv4_connect(struct TCP_Server_Info *server); 117static int ipv4_connect(struct TCP_Server_Info *server);
109static int ipv6_connect(struct TCP_Server_Info *server); 118static int ipv6_connect(struct TCP_Server_Info *server);
119static void cifs_prune_tlinks(struct work_struct *work);
110 120
111/* 121/*
112 * cifs tcp session reconnection 122 * cifs tcp session reconnection
@@ -140,7 +150,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
140 150
141 /* before reconnecting the tcp session, mark the smb session (uid) 151 /* before reconnecting the tcp session, mark the smb session (uid)
142 and the tid bad so they are not used until reconnected */ 152 and the tid bad so they are not used until reconnected */
143 read_lock(&cifs_tcp_ses_lock); 153 spin_lock(&cifs_tcp_ses_lock);
144 list_for_each(tmp, &server->smb_ses_list) { 154 list_for_each(tmp, &server->smb_ses_list) {
145 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 155 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
146 ses->need_reconnect = true; 156 ses->need_reconnect = true;
@@ -150,7 +160,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
150 tcon->need_reconnect = true; 160 tcon->need_reconnect = true;
151 } 161 }
152 } 162 }
153 read_unlock(&cifs_tcp_ses_lock); 163 spin_unlock(&cifs_tcp_ses_lock);
154 /* do not want to be sending data on a socket we are freeing */ 164 /* do not want to be sending data on a socket we are freeing */
155 mutex_lock(&server->srv_mutex); 165 mutex_lock(&server->srv_mutex);
156 if (server->ssocket) { 166 if (server->ssocket) {
@@ -163,6 +173,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
163 sock_release(server->ssocket); 173 sock_release(server->ssocket);
164 server->ssocket = NULL; 174 server->ssocket = NULL;
165 } 175 }
176 server->sequence_number = 0;
177 server->session_estab = false;
178 kfree(server->session_key.response);
179 server->session_key.response = NULL;
180 server->session_key.len = 0;
166 181
167 spin_lock(&GlobalMid_Lock); 182 spin_lock(&GlobalMid_Lock);
168 list_for_each(tmp, &server->pending_mid_q) { 183 list_for_each(tmp, &server->pending_mid_q) {
@@ -195,7 +210,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
195 spin_lock(&GlobalMid_Lock); 210 spin_lock(&GlobalMid_Lock);
196 if (server->tcpStatus != CifsExiting) 211 if (server->tcpStatus != CifsExiting)
197 server->tcpStatus = CifsGood; 212 server->tcpStatus = CifsGood;
198 server->sequence_number = 0;
199 spin_unlock(&GlobalMid_Lock); 213 spin_unlock(&GlobalMid_Lock);
200 /* atomic_set(&server->inFlight,0);*/ 214 /* atomic_set(&server->inFlight,0);*/
201 wake_up(&server->response_q); 215 wake_up(&server->response_q);
@@ -397,7 +411,9 @@ incomplete_rcv:
397 cFYI(1, "call to reconnect done"); 411 cFYI(1, "call to reconnect done");
398 csocket = server->ssocket; 412 csocket = server->ssocket;
399 continue; 413 continue;
400 } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { 414 } else if (length == -ERESTARTSYS ||
415 length == -EAGAIN ||
416 length == -EINTR) {
401 msleep(1); /* minimum sleep to prevent looping 417 msleep(1); /* minimum sleep to prevent looping
402 allowing socket to clear and app threads to set 418 allowing socket to clear and app threads to set
403 tcpStatus CifsNeedReconnect if server hung */ 419 tcpStatus CifsNeedReconnect if server hung */
@@ -411,18 +427,6 @@ incomplete_rcv:
411 } else 427 } else
412 continue; 428 continue;
413 } else if (length <= 0) { 429 } else if (length <= 0) {
414 if (server->tcpStatus == CifsNew) {
415 cFYI(1, "tcp session abend after SMBnegprot");
416 /* some servers kill the TCP session rather than
417 returning an SMB negprot error, in which
418 case reconnecting here is not going to help,
419 and so simply return error to mount */
420 break;
421 }
422 if (!try_to_freeze() && (length == -EINTR)) {
423 cFYI(1, "cifsd thread killed");
424 break;
425 }
426 cFYI(1, "Reconnect after unexpected peek error %d", 430 cFYI(1, "Reconnect after unexpected peek error %d",
427 length); 431 length);
428 cifs_reconnect(server); 432 cifs_reconnect(server);
@@ -463,27 +467,19 @@ incomplete_rcv:
463 an error on SMB negprot response */ 467 an error on SMB negprot response */
464 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", 468 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
465 pdu_length); 469 pdu_length);
466 if (server->tcpStatus == CifsNew) { 470 /* give server a second to clean up */
467 /* if nack on negprot (rather than 471 msleep(1000);
468 ret of smb negprot error) reconnecting 472 /* always try 445 first on reconnect since we get NACK
469 not going to help, ret error to mount */ 473 * on some if we ever connected to port 139 (the NACK
470 break; 474 * is because we do not begin with RFC1001 session
471 } else { 475 * initialize frame)
472 /* give server a second to 476 */
473 clean up before reconnect attempt */ 477 cifs_set_port((struct sockaddr *)
474 msleep(1000); 478 &server->addr.sockAddr, CIFS_PORT);
475 /* always try 445 first on reconnect 479 cifs_reconnect(server);
476 since we get NACK on some if we ever 480 csocket = server->ssocket;
477 connected to port 139 (the NACK is 481 wake_up(&server->response_q);
478 since we do not begin with RFC1001 482 continue;
479 session initialize frame) */
480 server->addr.sockAddr.sin_port =
481 htons(CIFS_PORT);
482 cifs_reconnect(server);
483 csocket = server->ssocket;
484 wake_up(&server->response_q);
485 continue;
486 }
487 } else if (temp != (char) 0) { 483 } else if (temp != (char) 0) {
488 cERROR(1, "Unknown RFC 1002 frame"); 484 cERROR(1, "Unknown RFC 1002 frame");
489 cifs_dump_mem(" Received Data: ", (char *)smb_buffer, 485 cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
@@ -519,8 +515,7 @@ incomplete_rcv:
519 total_read += length) { 515 total_read += length) {
520 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, 516 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
521 pdu_length - total_read, 0); 517 pdu_length - total_read, 0);
522 if ((server->tcpStatus == CifsExiting) || 518 if (server->tcpStatus == CifsExiting) {
523 (length == -EINTR)) {
524 /* then will exit */ 519 /* then will exit */
525 reconnect = 2; 520 reconnect = 2;
526 break; 521 break;
@@ -531,8 +526,9 @@ incomplete_rcv:
531 /* Now we will reread sock */ 526 /* Now we will reread sock */
532 reconnect = 1; 527 reconnect = 1;
533 break; 528 break;
534 } else if ((length == -ERESTARTSYS) || 529 } else if (length == -ERESTARTSYS ||
535 (length == -EAGAIN)) { 530 length == -EAGAIN ||
531 length == -EINTR) {
536 msleep(1); /* minimum sleep to prevent looping, 532 msleep(1); /* minimum sleep to prevent looping,
537 allowing socket to clear and app 533 allowing socket to clear and app
538 threads to set tcpStatus 534 threads to set tcpStatus
@@ -644,9 +640,9 @@ multi_t2_fnd:
644 } /* end while !EXITING */ 640 } /* end while !EXITING */
645 641
646 /* take it off the list, if it's not already */ 642 /* take it off the list, if it's not already */
647 write_lock(&cifs_tcp_ses_lock); 643 spin_lock(&cifs_tcp_ses_lock);
648 list_del_init(&server->tcp_ses_list); 644 list_del_init(&server->tcp_ses_list);
649 write_unlock(&cifs_tcp_ses_lock); 645 spin_unlock(&cifs_tcp_ses_lock);
650 646
651 spin_lock(&GlobalMid_Lock); 647 spin_lock(&GlobalMid_Lock);
652 server->tcpStatus = CifsExiting; 648 server->tcpStatus = CifsExiting;
@@ -684,7 +680,7 @@ multi_t2_fnd:
684 * BB: we shouldn't have to do any of this. It shouldn't be 680 * BB: we shouldn't have to do any of this. It shouldn't be
685 * possible to exit from the thread with active SMB sessions 681 * possible to exit from the thread with active SMB sessions
686 */ 682 */
687 read_lock(&cifs_tcp_ses_lock); 683 spin_lock(&cifs_tcp_ses_lock);
688 if (list_empty(&server->pending_mid_q)) { 684 if (list_empty(&server->pending_mid_q)) {
689 /* loop through server session structures attached to this and 685 /* loop through server session structures attached to this and
690 mark them dead */ 686 mark them dead */
@@ -694,7 +690,7 @@ multi_t2_fnd:
694 ses->status = CifsExiting; 690 ses->status = CifsExiting;
695 ses->server = NULL; 691 ses->server = NULL;
696 } 692 }
697 read_unlock(&cifs_tcp_ses_lock); 693 spin_unlock(&cifs_tcp_ses_lock);
698 } else { 694 } else {
699 /* although we can not zero the server struct pointer yet, 695 /* although we can not zero the server struct pointer yet,
700 since there are active requests which may depend on them, 696 since there are active requests which may depend on them,
@@ -717,7 +713,7 @@ multi_t2_fnd:
717 } 713 }
718 } 714 }
719 spin_unlock(&GlobalMid_Lock); 715 spin_unlock(&GlobalMid_Lock);
720 read_unlock(&cifs_tcp_ses_lock); 716 spin_unlock(&cifs_tcp_ses_lock);
721 /* 1/8th of sec is more than enough time for them to exit */ 717 /* 1/8th of sec is more than enough time for them to exit */
722 msleep(125); 718 msleep(125);
723 } 719 }
@@ -740,12 +736,12 @@ multi_t2_fnd:
740 if a crazy root user tried to kill cifsd 736 if a crazy root user tried to kill cifsd
741 kernel thread explicitly this might happen) */ 737 kernel thread explicitly this might happen) */
742 /* BB: This shouldn't be necessary, see above */ 738 /* BB: This shouldn't be necessary, see above */
743 read_lock(&cifs_tcp_ses_lock); 739 spin_lock(&cifs_tcp_ses_lock);
744 list_for_each(tmp, &server->smb_ses_list) { 740 list_for_each(tmp, &server->smb_ses_list) {
745 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 741 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
746 ses->server = NULL; 742 ses->server = NULL;
747 } 743 }
748 read_unlock(&cifs_tcp_ses_lock); 744 spin_unlock(&cifs_tcp_ses_lock);
749 745
750 kfree(server->hostname); 746 kfree(server->hostname);
751 task_to_wake = xchg(&server->tsk, NULL); 747 task_to_wake = xchg(&server->tsk, NULL);
@@ -830,7 +826,8 @@ cifs_parse_mount_options(char *options, const char *devname,
830 /* null target name indicates to use *SMBSERVR default called name 826 /* null target name indicates to use *SMBSERVR default called name
831 if we end up sending RFC1001 session initialize */ 827 if we end up sending RFC1001 session initialize */
832 vol->target_rfc1001_name[0] = 0; 828 vol->target_rfc1001_name[0] = 0;
833 vol->linux_uid = current_uid(); /* use current_euid() instead? */ 829 vol->cred_uid = current_uid();
830 vol->linux_uid = current_uid();
834 vol->linux_gid = current_gid(); 831 vol->linux_gid = current_gid();
835 832
836 /* default to only allowing write access to owner of the mount */ 833 /* default to only allowing write access to owner of the mount */
@@ -1060,6 +1057,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1060 "long\n"); 1057 "long\n");
1061 return 1; 1058 return 1;
1062 } 1059 }
1060 } else if (strnicmp(data, "srcaddr", 7) == 0) {
1061 vol->srcaddr.ss_family = AF_UNSPEC;
1062
1063 if (!value || !*value) {
1064 printk(KERN_WARNING "CIFS: srcaddr value"
1065 " not specified.\n");
1066 return 1; /* needs_arg; */
1067 }
1068 i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
1069 value, strlen(value));
1070 if (i == 0) {
1071 printk(KERN_WARNING "CIFS: Could not parse"
1072 " srcaddr: %s\n",
1073 value);
1074 return 1;
1075 }
1063 } else if (strnicmp(data, "prefixpath", 10) == 0) { 1076 } else if (strnicmp(data, "prefixpath", 10) == 0) {
1064 if (!value || !*value) { 1077 if (!value || !*value) {
1065 printk(KERN_WARNING 1078 printk(KERN_WARNING
@@ -1257,6 +1270,12 @@ cifs_parse_mount_options(char *options, const char *devname,
1257 } else if ((strnicmp(data, "nocase", 6) == 0) || 1270 } else if ((strnicmp(data, "nocase", 6) == 0) ||
1258 (strnicmp(data, "ignorecase", 10) == 0)) { 1271 (strnicmp(data, "ignorecase", 10) == 0)) {
1259 vol->nocase = 1; 1272 vol->nocase = 1;
1273 } else if (strnicmp(data, "mand", 4) == 0) {
1274 /* ignore */
1275 } else if (strnicmp(data, "nomand", 6) == 0) {
1276 /* ignore */
1277 } else if (strnicmp(data, "_netdev", 7) == 0) {
1278 /* ignore */
1260 } else if (strnicmp(data, "brl", 3) == 0) { 1279 } else if (strnicmp(data, "brl", 3) == 0) {
1261 vol->nobrl = 0; 1280 vol->nobrl = 0;
1262 } else if ((strnicmp(data, "nobrl", 5) == 0) || 1281 } else if ((strnicmp(data, "nobrl", 5) == 0) ||
@@ -1331,6 +1350,12 @@ cifs_parse_mount_options(char *options, const char *devname,
1331 printk(KERN_WARNING "CIFS: Mount option noac not " 1350 printk(KERN_WARNING "CIFS: Mount option noac not "
1332 "supported. Instead set " 1351 "supported. Instead set "
1333 "/proc/fs/cifs/LookupCacheEnabled to 0\n"); 1352 "/proc/fs/cifs/LookupCacheEnabled to 0\n");
1353 } else if (strnicmp(data, "fsc", 3) == 0) {
1354 vol->fsc = true;
1355 } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
1356 vol->mfsymlinks = true;
1357 } else if (strnicmp(data, "multiuser", 8) == 0) {
1358 vol->multiuser = true;
1334 } else 1359 } else
1335 printk(KERN_WARNING "CIFS: Unknown mount option %s\n", 1360 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
1336 data); 1361 data);
@@ -1362,6 +1387,13 @@ cifs_parse_mount_options(char *options, const char *devname,
1362 return 1; 1387 return 1;
1363 } 1388 }
1364 } 1389 }
1390
1391 if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
1392 cERROR(1, "Multiuser mounts currently require krb5 "
1393 "authentication!");
1394 return 1;
1395 }
1396
1365 if (vol->UNCip == NULL) 1397 if (vol->UNCip == NULL)
1366 vol->UNCip = &vol->UNC[2]; 1398 vol->UNCip = &vol->UNC[2];
1367 1399
@@ -1380,65 +1412,136 @@ cifs_parse_mount_options(char *options, const char *devname,
1380 return 0; 1412 return 0;
1381} 1413}
1382 1414
1415/* Returns true if both srcaddr and rhs are unspecified, or if
1416 * srcaddr is specified and matches the IP address of the rhs
1417 * argument.
1418 */
1419static bool
1420srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1421{
1422 switch (srcaddr->sa_family) {
1423 case AF_UNSPEC:
1424 return (rhs->sa_family == AF_UNSPEC);
1425 case AF_INET: {
1426 struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
1427 struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
1428 return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
1429 }
1430 case AF_INET6: {
1431 struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
1432 struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
1433 return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
1434 }
1435 default:
1436 WARN_ON(1);
1437 return false; /* don't expect to be here */
1438 }
1439}
1440
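Note that the AF_INET6 arm must cast rhs itself; the patch as posted cast &rhs, taking the address of the local pointer variable, which is corrected above. A small userspace re-statement of the AF_UNSPEC/AF_INET semantics (IPv6 arm omitted, and an explicit family check on rhs added for safety):

    #include <arpa/inet.h>
    #include <assert.h>
    #include <stdbool.h>
    #include <sys/socket.h>

    /* Simplified re-statement of srcip_matches() for two address families. */
    static bool srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
    {
        if (srcaddr->sa_family == AF_UNSPEC)
            return rhs->sa_family == AF_UNSPEC;
        if (srcaddr->sa_family == AF_INET && rhs->sa_family == AF_INET) {
            struct sockaddr_in *s = (struct sockaddr_in *)srcaddr;
            struct sockaddr_in *v = (struct sockaddr_in *)rhs;
            return s->sin_addr.s_addr == v->sin_addr.s_addr;
        }
        return false;
    }

    int main(void)
    {
        struct sockaddr_in a = { .sin_family = AF_INET };
        struct sockaddr_in b = { .sin_family = AF_INET };

        inet_pton(AF_INET, "192.168.1.10", &a.sin_addr);
        inet_pton(AF_INET, "192.168.1.10", &b.sin_addr);
        assert(srcip_matches((struct sockaddr *)&a, (struct sockaddr *)&b));

        inet_pton(AF_INET, "192.168.1.11", &b.sin_addr);
        assert(!srcip_matches((struct sockaddr *)&a, (struct sockaddr *)&b));
        return 0;
    }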
1441
1442static bool
1443match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
1444 struct sockaddr *srcaddr)
1445{
1446 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1447 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1448
1449 switch (addr->sa_family) {
1450 case AF_INET:
1451 if (addr4->sin_addr.s_addr !=
1452 server->addr.sockAddr.sin_addr.s_addr)
1453 return false;
1454 if (addr4->sin_port &&
1455 addr4->sin_port != server->addr.sockAddr.sin_port)
1456 return false;
1457 break;
1458 case AF_INET6:
1459 if (!ipv6_addr_equal(&addr6->sin6_addr,
1460 &server->addr.sockAddr6.sin6_addr))
1461 return false;
1462 if (addr6->sin6_scope_id !=
1463 server->addr.sockAddr6.sin6_scope_id)
1464 return false;
1465 if (addr6->sin6_port &&
1466 addr6->sin6_port != server->addr.sockAddr6.sin6_port)
1467 return false;
1468 break;
1469 }
1470
1471 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
1472 return false;
1473
1474 return true;
1475}
1476
1477static bool
1478match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
1479{
1480 unsigned int secFlags;
1481
1482 if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
1483 secFlags = vol->secFlg;
1484 else
1485 secFlags = global_secflags | vol->secFlg;
1486
1487 switch (server->secType) {
1488 case LANMAN:
1489 if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
1490 return false;
1491 break;
1492 case NTLMv2:
1493 if (!(secFlags & CIFSSEC_MAY_NTLMV2))
1494 return false;
1495 break;
1496 case NTLM:
1497 if (!(secFlags & CIFSSEC_MAY_NTLM))
1498 return false;
1499 break;
1500 case Kerberos:
1501 if (!(secFlags & CIFSSEC_MAY_KRB5))
1502 return false;
1503 break;
1504 case RawNTLMSSP:
1505 if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
1506 return false;
1507 break;
1508 default:
1509 /* shouldn't happen */
1510 return false;
1511 }
1512
 1513 /* now check if signing mode is acceptable */
1514 if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
1515 (server->secMode & SECMODE_SIGN_REQUIRED))
1516 return false;
1517 else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
1518 (server->secMode &
1519 (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
1520 return false;
1521
1522 return true;
1523}
1524
1383static struct TCP_Server_Info * 1525static struct TCP_Server_Info *
1384cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) 1526cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1385{ 1527{
1386 struct list_head *tmp;
1387 struct TCP_Server_Info *server; 1528 struct TCP_Server_Info *server;
1388 struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
1389 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;
1390
1391 write_lock(&cifs_tcp_ses_lock);
1392 list_for_each(tmp, &cifs_tcp_ses_list) {
1393 server = list_entry(tmp, struct TCP_Server_Info,
1394 tcp_ses_list);
1395 /*
1396 * the demux thread can exit on its own while still in CifsNew
1397 * so don't accept any sockets in that state. Since the
1398 * tcpStatus never changes back to CifsNew it's safe to check
1399 * for this without a lock.
1400 */
1401 if (server->tcpStatus == CifsNew)
1402 continue;
1403 1529
1404 switch (addr->ss_family) { 1530 spin_lock(&cifs_tcp_ses_lock);
1405 case AF_INET: 1531 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1406 if (addr4->sin_addr.s_addr == 1532 if (!match_address(server, addr,
1407 server->addr.sockAddr.sin_addr.s_addr) { 1533 (struct sockaddr *)&vol->srcaddr))
1408 addr4->sin_port = htons(port); 1534 continue;
1409 /* user overrode default port? */
1410 if (addr4->sin_port) {
1411 if (addr4->sin_port !=
1412 server->addr.sockAddr.sin_port)
1413 continue;
1414 }
1415 break;
1416 } else
1417 continue;
1418 1535
1419 case AF_INET6: 1536 if (!match_security(server, vol))
1420 if (ipv6_addr_equal(&addr6->sin6_addr, 1537 continue;
1421 &server->addr.sockAddr6.sin6_addr) &&
1422 (addr6->sin6_scope_id ==
1423 server->addr.sockAddr6.sin6_scope_id)) {
1424 addr6->sin6_port = htons(port);
1425 /* user overrode default port? */
1426 if (addr6->sin6_port) {
1427 if (addr6->sin6_port !=
1428 server->addr.sockAddr6.sin6_port)
1429 continue;
1430 }
1431 break;
1432 } else
1433 continue;
1434 }
1435 1538
1436 ++server->srv_count; 1539 ++server->srv_count;
1437 write_unlock(&cifs_tcp_ses_lock); 1540 spin_unlock(&cifs_tcp_ses_lock);
1438 cFYI(1, "Existing tcp session with server found"); 1541 cFYI(1, "Existing tcp session with server found");
1439 return server; 1542 return server;
1440 } 1543 }
1441 write_unlock(&cifs_tcp_ses_lock); 1544 spin_unlock(&cifs_tcp_ses_lock);
1442 return NULL; 1545 return NULL;
1443} 1546}
1444 1547
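
The matching logic above is compact but subtle: an unspecified source address only matches a server whose source is also unspecified, and a port of 0 in the request acts as a wildcard. The following user-space sketch mirrors the srcip_matches() semantics for the AF_UNSPEC and AF_INET cases; the helper name and the test harness are illustrative only, not part of the patch:

#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* mirrors the kernel helper: unspecified matches unspecified;
 * otherwise both sides must carry the same IPv4 address */
static int srcip_matches_sketch(const struct sockaddr *src,
                                const struct sockaddr *rhs)
{
    switch (src->sa_family) {
    case AF_UNSPEC:
        return rhs->sa_family == AF_UNSPEC;
    case AF_INET: {
        const struct sockaddr_in *s = (const struct sockaddr_in *)src;
        const struct sockaddr_in *v = (const struct sockaddr_in *)rhs;
        return s->sin_addr.s_addr == v->sin_addr.s_addr;
    }
    default:
        return 0; /* AF_INET6 case elided for brevity */
    }
}

int main(void)
{
    struct sockaddr_in a = { .sin_family = AF_INET };
    struct sockaddr_in b = { .sin_family = AF_INET };

    inet_pton(AF_INET, "192.168.1.10", &a.sin_addr);
    inet_pton(AF_INET, "192.168.1.10", &b.sin_addr);
    printf("same addr: %d\n", srcip_matches_sketch(
        (struct sockaddr *)&a, (struct sockaddr *)&b)); /* 1 */

    inet_pton(AF_INET, "192.168.1.11", &b.sin_addr);
    printf("diff addr: %d\n", srcip_matches_sketch(
        (struct sockaddr *)&a, (struct sockaddr *)&b)); /* 0 */
    return 0;
}
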
@@ -1447,19 +1550,26 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1447{ 1550{
1448 struct task_struct *task; 1551 struct task_struct *task;
1449 1552
1450 write_lock(&cifs_tcp_ses_lock); 1553 spin_lock(&cifs_tcp_ses_lock);
1451 if (--server->srv_count > 0) { 1554 if (--server->srv_count > 0) {
1452 write_unlock(&cifs_tcp_ses_lock); 1555 spin_unlock(&cifs_tcp_ses_lock);
1453 return; 1556 return;
1454 } 1557 }
1455 1558
1456 list_del_init(&server->tcp_ses_list); 1559 list_del_init(&server->tcp_ses_list);
1457 write_unlock(&cifs_tcp_ses_lock); 1560 spin_unlock(&cifs_tcp_ses_lock);
1458 1561
1459 spin_lock(&GlobalMid_Lock); 1562 spin_lock(&GlobalMid_Lock);
1460 server->tcpStatus = CifsExiting; 1563 server->tcpStatus = CifsExiting;
1461 spin_unlock(&GlobalMid_Lock); 1564 spin_unlock(&GlobalMid_Lock);
1462 1565
1566 cifs_crypto_shash_release(server);
1567 cifs_fscache_release_client_cookie(server);
1568
1569 kfree(server->session_key.response);
1570 server->session_key.response = NULL;
1571 server->session_key.len = 0;
1572
1463 task = xchg(&server->tsk, NULL); 1573 task = xchg(&server->tsk, NULL);
1464 if (task) 1574 if (task)
1465 force_sig(SIGKILL, task); 1575 force_sig(SIGKILL, task);
@@ -1479,7 +1589,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1479 cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip); 1589 cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
1480 1590
1481 if (volume_info->UNCip && volume_info->UNC) { 1591 if (volume_info->UNCip && volume_info->UNC) {
1482 rc = cifs_convert_address(volume_info->UNCip, &addr); 1592 rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
1593 volume_info->UNCip,
1594 strlen(volume_info->UNCip),
1595 volume_info->port);
1483 if (!rc) { 1596 if (!rc) {
1484 /* we failed translating address */ 1597 /* we failed translating address */
1485 rc = -EINVAL; 1598 rc = -EINVAL;
@@ -1499,7 +1612,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1499 } 1612 }
1500 1613
1501 /* see if we already have a matching tcp_ses */ 1614 /* see if we already have a matching tcp_ses */
1502 tcp_ses = cifs_find_tcp_session(&addr, volume_info->port); 1615 tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info);
1503 if (tcp_ses) 1616 if (tcp_ses)
1504 return tcp_ses; 1617 return tcp_ses;
1505 1618
@@ -1509,10 +1622,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1509 goto out_err; 1622 goto out_err;
1510 } 1623 }
1511 1624
1625 rc = cifs_crypto_shash_allocate(tcp_ses);
1626 if (rc) {
1627 cERROR(1, "could not setup hash structures rc %d", rc);
1628 goto out_err;
1629 }
1630
1512 tcp_ses->hostname = extract_hostname(volume_info->UNC); 1631 tcp_ses->hostname = extract_hostname(volume_info->UNC);
1513 if (IS_ERR(tcp_ses->hostname)) { 1632 if (IS_ERR(tcp_ses->hostname)) {
1514 rc = PTR_ERR(tcp_ses->hostname); 1633 rc = PTR_ERR(tcp_ses->hostname);
1515 goto out_err; 1634 goto out_err_crypto_release;
1516 } 1635 }
1517 1636
1518 tcp_ses->noblocksnd = volume_info->noblocksnd; 1637 tcp_ses->noblocksnd = volume_info->noblocksnd;
@@ -1527,6 +1646,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1527 volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1646 volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1528 memcpy(tcp_ses->server_RFC1001_name, 1647 memcpy(tcp_ses->server_RFC1001_name,
1529 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1648 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1649 tcp_ses->session_estab = false;
1530 tcp_ses->sequence_number = 0; 1650 tcp_ses->sequence_number = 0;
1531 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 1651 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1532 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 1652 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
@@ -1537,25 +1657,25 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1537 * no need to spinlock this init of tcpStatus or srv_count 1657 * no need to spinlock this init of tcpStatus or srv_count
1538 */ 1658 */
1539 tcp_ses->tcpStatus = CifsNew; 1659 tcp_ses->tcpStatus = CifsNew;
1660 memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
1661 sizeof(tcp_ses->srcaddr));
1540 ++tcp_ses->srv_count; 1662 ++tcp_ses->srv_count;
1541 1663
1542 if (addr.ss_family == AF_INET6) { 1664 if (addr.ss_family == AF_INET6) {
1543 cFYI(1, "attempting ipv6 connect"); 1665 cFYI(1, "attempting ipv6 connect");
1544 /* BB should we allow ipv6 on port 139? */ 1666 /* BB should we allow ipv6 on port 139? */
1545 /* other OS never observed in Wild doing 139 with v6 */ 1667 /* other OS never observed in Wild doing 139 with v6 */
1546 sin_server6->sin6_port = htons(volume_info->port);
1547 memcpy(&tcp_ses->addr.sockAddr6, sin_server6, 1668 memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
1548 sizeof(struct sockaddr_in6)); 1669 sizeof(struct sockaddr_in6));
1549 rc = ipv6_connect(tcp_ses); 1670 rc = ipv6_connect(tcp_ses);
1550 } else { 1671 } else {
1551 sin_server->sin_port = htons(volume_info->port);
1552 memcpy(&tcp_ses->addr.sockAddr, sin_server, 1672 memcpy(&tcp_ses->addr.sockAddr, sin_server,
1553 sizeof(struct sockaddr_in)); 1673 sizeof(struct sockaddr_in));
1554 rc = ipv4_connect(tcp_ses); 1674 rc = ipv4_connect(tcp_ses);
1555 } 1675 }
1556 if (rc < 0) { 1676 if (rc < 0) {
1557 cERROR(1, "Error connecting to socket. Aborting operation"); 1677 cERROR(1, "Error connecting to socket. Aborting operation");
1558 goto out_err; 1678 goto out_err_crypto_release;
1559 } 1679 }
1560 1680
1561 /* 1681 /*
@@ -1569,16 +1689,21 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1569 rc = PTR_ERR(tcp_ses->tsk); 1689 rc = PTR_ERR(tcp_ses->tsk);
1570 cERROR(1, "error %d create cifsd thread", rc); 1690 cERROR(1, "error %d create cifsd thread", rc);
1571 module_put(THIS_MODULE); 1691 module_put(THIS_MODULE);
1572 goto out_err; 1692 goto out_err_crypto_release;
1573 } 1693 }
1574 1694
1575 /* thread spawned, put it on the list */ 1695 /* thread spawned, put it on the list */
1576 write_lock(&cifs_tcp_ses_lock); 1696 spin_lock(&cifs_tcp_ses_lock);
1577 list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); 1697 list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
1578 write_unlock(&cifs_tcp_ses_lock); 1698 spin_unlock(&cifs_tcp_ses_lock);
1699
1700 cifs_fscache_get_client_cookie(tcp_ses);
1579 1701
1580 return tcp_ses; 1702 return tcp_ses;
1581 1703
1704out_err_crypto_release:
1705 cifs_crypto_shash_release(tcp_ses);
1706
1582out_err: 1707out_err:
1583 if (tcp_ses) { 1708 if (tcp_ses) {
1584 if (!IS_ERR(tcp_ses->hostname)) 1709 if (!IS_ERR(tcp_ses->hostname))
@@ -1591,22 +1716,34 @@ out_err:
1591} 1716}
1592 1717
1593static struct cifsSesInfo * 1718static struct cifsSesInfo *
1594cifs_find_smb_ses(struct TCP_Server_Info *server, char *username) 1719cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1595{ 1720{
1596 struct list_head *tmp;
1597 struct cifsSesInfo *ses; 1721 struct cifsSesInfo *ses;
1598 1722
1599 write_lock(&cifs_tcp_ses_lock); 1723 spin_lock(&cifs_tcp_ses_lock);
1600 list_for_each(tmp, &server->smb_ses_list) { 1724 list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
1601 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 1725 switch (server->secType) {
1602 if (strncmp(ses->userName, username, MAX_USERNAME_SIZE)) 1726 case Kerberos:
1603 continue; 1727 if (vol->cred_uid != ses->cred_uid)
1604 1728 continue;
1729 break;
1730 default:
1731 /* anything else takes username/password */
1732 if (strncmp(ses->userName, vol->username,
1733 MAX_USERNAME_SIZE))
1734 continue;
1735 if (strlen(vol->username) != 0 &&
1736 ses->password != NULL &&
1737 strncmp(ses->password,
1738 vol->password ? vol->password : "",
1739 MAX_PASSWORD_SIZE))
1740 continue;
1741 }
1605 ++ses->ses_count; 1742 ++ses->ses_count;
1606 write_unlock(&cifs_tcp_ses_lock); 1743 spin_unlock(&cifs_tcp_ses_lock);
1607 return ses; 1744 return ses;
1608 } 1745 }
1609 write_unlock(&cifs_tcp_ses_lock); 1746 spin_unlock(&cifs_tcp_ses_lock);
1610 return NULL; 1747 return NULL;
1611} 1748}
1612 1749
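
Note how cifs_find_smb_ses() now keys the match on the auth type: Kerberos sessions belong to a credential uid, while everything else compares the username (and the cached password, when both sides have one). A compact user-space sketch of that dispatch, with invented types and field names:

#include <stdbool.h>
#include <string.h>

enum sectype { SEC_KRB5, SEC_NTLM }; /* invented stand-in for secType */

struct ses  { enum sectype type; unsigned cred_uid; char user[64]; };
struct want { unsigned cred_uid; const char *user; };

static bool ses_matches(const struct ses *s, const struct want *w)
{
    switch (s->type) {
    case SEC_KRB5:
        /* krb5 credentials belong to a uid, not a name */
        return s->cred_uid == w->cred_uid;
    default:
        return strncmp(s->user, w->user, sizeof(s->user)) == 0;
    }
}

int main(void)
{
    struct ses  s = { SEC_KRB5, 1000, "ignored" };
    struct want w = { 1000, "someone" };

    return ses_matches(&s, &w) ? 0 : 1; /* matches on uid alone */
}
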
@@ -1617,14 +1754,14 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1617 struct TCP_Server_Info *server = ses->server; 1754 struct TCP_Server_Info *server = ses->server;
1618 1755
1619 cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count); 1756 cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
1620 write_lock(&cifs_tcp_ses_lock); 1757 spin_lock(&cifs_tcp_ses_lock);
1621 if (--ses->ses_count > 0) { 1758 if (--ses->ses_count > 0) {
1622 write_unlock(&cifs_tcp_ses_lock); 1759 spin_unlock(&cifs_tcp_ses_lock);
1623 return; 1760 return;
1624 } 1761 }
1625 1762
1626 list_del_init(&ses->smb_ses_list); 1763 list_del_init(&ses->smb_ses_list);
1627 write_unlock(&cifs_tcp_ses_lock); 1764 spin_unlock(&cifs_tcp_ses_lock);
1628 1765
1629 if (ses->status == CifsGood) { 1766 if (ses->status == CifsGood) {
1630 xid = GetXid(); 1767 xid = GetXid();
@@ -1643,13 +1780,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1643 1780
1644 xid = GetXid(); 1781 xid = GetXid();
1645 1782
1646 ses = cifs_find_smb_ses(server, volume_info->username); 1783 ses = cifs_find_smb_ses(server, volume_info);
1647 if (ses) { 1784 if (ses) {
1648 cFYI(1, "Existing smb sess found (status=%d)", ses->status); 1785 cFYI(1, "Existing smb sess found (status=%d)", ses->status);
1649 1786
1650 /* existing SMB ses has a server reference already */
1651 cifs_put_tcp_session(server);
1652
1653 mutex_lock(&ses->session_mutex); 1787 mutex_lock(&ses->session_mutex);
1654 rc = cifs_negotiate_protocol(xid, ses); 1788 rc = cifs_negotiate_protocol(xid, ses);
1655 if (rc) { 1789 if (rc) {
@@ -1672,6 +1806,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1672 } 1806 }
1673 } 1807 }
1674 mutex_unlock(&ses->session_mutex); 1808 mutex_unlock(&ses->session_mutex);
1809
1810 /* existing SMB ses has a server reference already */
1811 cifs_put_tcp_session(server);
1675 FreeXid(xid); 1812 FreeXid(xid);
1676 return ses; 1813 return ses;
1677 } 1814 }
@@ -1701,11 +1838,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1701 goto get_ses_fail; 1838 goto get_ses_fail;
1702 } 1839 }
1703 if (volume_info->domainname) { 1840 if (volume_info->domainname) {
1704 int len = strlen(volume_info->domainname); 1841 ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL);
1705 ses->domainName = kmalloc(len + 1, GFP_KERNEL); 1842 if (!ses->domainName)
1706 if (ses->domainName) 1843 goto get_ses_fail;
1707 strcpy(ses->domainName, volume_info->domainname);
1708 } 1844 }
1845 ses->cred_uid = volume_info->cred_uid;
1709 ses->linux_uid = volume_info->linux_uid; 1846 ses->linux_uid = volume_info->linux_uid;
1710 ses->overrideSecFlg = volume_info->secFlg; 1847 ses->overrideSecFlg = volume_info->secFlg;
1711 1848
@@ -1718,9 +1855,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1718 goto get_ses_fail; 1855 goto get_ses_fail;
1719 1856
1720 /* success, put it on the list */ 1857 /* success, put it on the list */
1721 write_lock(&cifs_tcp_ses_lock); 1858 spin_lock(&cifs_tcp_ses_lock);
1722 list_add(&ses->smb_ses_list, &server->smb_ses_list); 1859 list_add(&ses->smb_ses_list, &server->smb_ses_list);
1723 write_unlock(&cifs_tcp_ses_lock); 1860 spin_unlock(&cifs_tcp_ses_lock);
1724 1861
1725 FreeXid(xid); 1862 FreeXid(xid);
1726 return ses; 1863 return ses;
@@ -1737,7 +1874,7 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1737 struct list_head *tmp; 1874 struct list_head *tmp;
1738 struct cifsTconInfo *tcon; 1875 struct cifsTconInfo *tcon;
1739 1876
1740 write_lock(&cifs_tcp_ses_lock); 1877 spin_lock(&cifs_tcp_ses_lock);
1741 list_for_each(tmp, &ses->tcon_list) { 1878 list_for_each(tmp, &ses->tcon_list) {
1742 tcon = list_entry(tmp, struct cifsTconInfo, tcon_list); 1879 tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
1743 if (tcon->tidStatus == CifsExiting) 1880 if (tcon->tidStatus == CifsExiting)
@@ -1746,10 +1883,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1746 continue; 1883 continue;
1747 1884
1748 ++tcon->tc_count; 1885 ++tcon->tc_count;
1749 write_unlock(&cifs_tcp_ses_lock); 1886 spin_unlock(&cifs_tcp_ses_lock);
1750 return tcon; 1887 return tcon;
1751 } 1888 }
1752 write_unlock(&cifs_tcp_ses_lock); 1889 spin_unlock(&cifs_tcp_ses_lock);
1753 return NULL; 1890 return NULL;
1754} 1891}
1755 1892
@@ -1760,19 +1897,20 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1760 struct cifsSesInfo *ses = tcon->ses; 1897 struct cifsSesInfo *ses = tcon->ses;
1761 1898
1762 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count); 1899 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
1763 write_lock(&cifs_tcp_ses_lock); 1900 spin_lock(&cifs_tcp_ses_lock);
1764 if (--tcon->tc_count > 0) { 1901 if (--tcon->tc_count > 0) {
1765 write_unlock(&cifs_tcp_ses_lock); 1902 spin_unlock(&cifs_tcp_ses_lock);
1766 return; 1903 return;
1767 } 1904 }
1768 1905
1769 list_del_init(&tcon->tcon_list); 1906 list_del_init(&tcon->tcon_list);
1770 write_unlock(&cifs_tcp_ses_lock); 1907 spin_unlock(&cifs_tcp_ses_lock);
1771 1908
1772 xid = GetXid(); 1909 xid = GetXid();
1773 CIFSSMBTDis(xid, tcon); 1910 CIFSSMBTDis(xid, tcon);
1774 _FreeXid(xid); 1911 _FreeXid(xid);
1775 1912
1913 cifs_fscache_release_super_cookie(tcon);
1776 tconInfoFree(tcon); 1914 tconInfoFree(tcon);
1777 cifs_put_smb_ses(ses); 1915 cifs_put_smb_ses(ses);
1778} 1916}
@@ -1839,9 +1977,11 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
1839 tcon->nocase = volume_info->nocase; 1977 tcon->nocase = volume_info->nocase;
1840 tcon->local_lease = volume_info->local_lease; 1978 tcon->local_lease = volume_info->local_lease;
1841 1979
1842 write_lock(&cifs_tcp_ses_lock); 1980 spin_lock(&cifs_tcp_ses_lock);
1843 list_add(&tcon->tcon_list, &ses->tcon_list); 1981 list_add(&tcon->tcon_list, &ses->tcon_list);
1844 write_unlock(&cifs_tcp_ses_lock); 1982 spin_unlock(&cifs_tcp_ses_lock);
1983
1984 cifs_fscache_get_super_cookie(tcon);
1845 1985
1846 return tcon; 1986 return tcon;
1847 1987
@@ -1850,6 +1990,23 @@ out_fail:
1850 return ERR_PTR(rc); 1990 return ERR_PTR(rc);
1851} 1991}
1852 1992
1993void
1994cifs_put_tlink(struct tcon_link *tlink)
1995{
1996 if (!tlink || IS_ERR(tlink))
1997 return;
1998
1999 if (!atomic_dec_and_test(&tlink->tl_count) ||
2000 test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) {
2001 tlink->tl_time = jiffies;
2002 return;
2003 }
2004
2005 if (!IS_ERR(tlink_tcon(tlink)))
2006 cifs_put_tcon(tlink_tcon(tlink));
2007 kfree(tlink);
2008 return;
2009}
1853 2010
1854int 2011int
1855get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, 2012get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
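
cifs_put_tlink() drops a reference but defers destruction while the link is still hashed in the tree, stamping tl_time so the pruning job added later in this patch can reap idle entries. A rough single-threaded analogue of that pattern, assuming C11 atomics and an invented tlink stand-in (the kernel version additionally relies on the tree lock for safety against concurrent teardown):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>
#include <time.h>

struct tlink_sketch {          /* invented stand-in for struct tcon_link */
    atomic_int count;
    bool       in_tree;        /* stands in for TCON_LINK_IN_TREE */
    time_t     idle_since;     /* stands in for tl_time */
};

static void put_tlink_sketch(struct tlink_sketch *t)
{
    if (!t)
        return;
    /* last ref gone *and* already unlinked: free now; otherwise just
     * stamp the idle time so a pruner can reap the link later */
    if (atomic_fetch_sub(&t->count, 1) == 1 && !t->in_tree) {
        free(t);
        return;
    }
    t->idle_since = time(NULL);
}

int main(void)
{
    struct tlink_sketch *t = calloc(1, sizeof(*t));

    if (!t)
        return 1;
    atomic_init(&t->count, 1);
    t->in_tree = false;        /* already unlinked, so the put frees it */
    put_tlink_sketch(t);
    return 0;
}
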
@@ -1934,6 +2091,33 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
1934 2091
1935} 2092}
1936 2093
2094static int
2095bind_socket(struct TCP_Server_Info *server)
2096{
2097 int rc = 0;
2098 if (server->srcaddr.ss_family != AF_UNSPEC) {
2099 /* Bind to the specified local IP address */
2100 struct socket *socket = server->ssocket;
2101 rc = socket->ops->bind(socket,
2102 (struct sockaddr *) &server->srcaddr,
2103 sizeof(server->srcaddr));
2104 if (rc < 0) {
2105 struct sockaddr_in *saddr4;
2106 struct sockaddr_in6 *saddr6;
2107 saddr4 = (struct sockaddr_in *)&server->srcaddr;
2108 saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
2109 if (saddr6->sin6_family == AF_INET6)
2110 cERROR(1, "cifs: "
2111 "Failed to bind to: %pI6c, error: %d\n",
2112 &saddr6->sin6_addr, rc);
2113 else
2114 cERROR(1, "cifs: "
2115 "Failed to bind to: %pI4, error: %d\n",
2116 &saddr4->sin_addr.s_addr, rc);
2117 }
2118 }
2119 return rc;
2120}
1937 2121
1938static int 2122static int
1939ipv4_connect(struct TCP_Server_Info *server) 2123ipv4_connect(struct TCP_Server_Info *server)
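
bind_socket() is the kernel-side form of a familiar user-space idiom: bind() the socket to the requested local address before connect() so outgoing traffic leaves from a specific source IP. A minimal sketch, using placeholder 192.0.2.x documentation addresses:

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in src = { .sin_family = AF_INET };
    struct sockaddr_in dst = { .sin_family = AF_INET,
                               .sin_port   = htons(445) };
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    if (fd < 0)
        return 1;
    inet_pton(AF_INET, "192.0.2.10", &src.sin_addr); /* local address */
    inet_pton(AF_INET, "192.0.2.99", &dst.sin_addr); /* CIFS server */

    /* pin the source address first, as bind_socket() does */
    if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0)
        perror("bind");
    else if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
        perror("connect");

    close(fd);
    return 0;
}
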
@@ -1959,6 +2143,10 @@ ipv4_connect(struct TCP_Server_Info *server)
1959 cifs_reclassify_socket4(socket); 2143 cifs_reclassify_socket4(socket);
1960 } 2144 }
1961 2145
2146 rc = bind_socket(server);
2147 if (rc < 0)
2148 return rc;
2149
1962 /* user overrode default port */ 2150 /* user overrode default port */
1963 if (server->addr.sockAddr.sin_port) { 2151 if (server->addr.sockAddr.sin_port) {
1964 rc = socket->ops->connect(socket, (struct sockaddr *) 2152 rc = socket->ops->connect(socket, (struct sockaddr *)
@@ -2121,6 +2309,10 @@ ipv6_connect(struct TCP_Server_Info *server)
2121 cifs_reclassify_socket6(socket); 2309 cifs_reclassify_socket6(socket);
2122 } 2310 }
2123 2311
2312 rc = bind_socket(server);
2313 if (rc < 0)
2314 return rc;
2315
2124 /* user overrode default port */ 2316 /* user overrode default port */
2125 if (server->addr.sockAddr6.sin6_port) { 2317 if (server->addr.sockAddr6.sin6_port) {
2126 rc = socket->ops->connect(socket, 2318 rc = socket->ops->connect(socket,
@@ -2320,6 +2512,8 @@ convert_delimiter(char *path, char delim)
2320static void setup_cifs_sb(struct smb_vol *pvolume_info, 2512static void setup_cifs_sb(struct smb_vol *pvolume_info,
2321 struct cifs_sb_info *cifs_sb) 2513 struct cifs_sb_info *cifs_sb)
2322{ 2514{
2515 INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
2516
2323 if (pvolume_info->rsize > CIFSMaxBufSize) { 2517 if (pvolume_info->rsize > CIFSMaxBufSize) {
2324 cERROR(1, "rsize %d too large, using MaxBufSize", 2518 cERROR(1, "rsize %d too large, using MaxBufSize",
2325 pvolume_info->rsize); 2519 pvolume_info->rsize);
@@ -2397,10 +2591,23 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2397 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; 2591 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
2398 if (pvolume_info->dynperm) 2592 if (pvolume_info->dynperm)
2399 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; 2593 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
2594 if (pvolume_info->fsc)
2595 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
2596 if (pvolume_info->multiuser)
2597 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
2598 CIFS_MOUNT_NO_PERM);
2400 if (pvolume_info->direct_io) { 2599 if (pvolume_info->direct_io) {
2401 cFYI(1, "mounting share using direct i/o"); 2600 cFYI(1, "mounting share using direct i/o");
2402 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2601 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
2403 } 2602 }
2603 if (pvolume_info->mfsymlinks) {
2604 if (pvolume_info->sfu_emul) {
2605 cERROR(1, "mount option mfsymlinks ignored if sfu "
2606 "mount option is used");
2607 } else {
2608 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
2609 }
2610 }
2404 2611
2405 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 2612 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
2406 cERROR(1, "mount option dynperm ignored if cifsacl " 2613 cERROR(1, "mount option dynperm ignored if cifsacl "
@@ -2487,6 +2694,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2487 struct TCP_Server_Info *srvTcp; 2694 struct TCP_Server_Info *srvTcp;
2488 char *full_path; 2695 char *full_path;
2489 char *mount_data = mount_data_global; 2696 char *mount_data = mount_data_global;
2697 struct tcon_link *tlink;
2490#ifdef CONFIG_CIFS_DFS_UPCALL 2698#ifdef CONFIG_CIFS_DFS_UPCALL
2491 struct dfs_info3_param *referrals = NULL; 2699 struct dfs_info3_param *referrals = NULL;
2492 unsigned int num_referrals = 0; 2700 unsigned int num_referrals = 0;
@@ -2498,6 +2706,7 @@ try_mount_again:
2498 pSesInfo = NULL; 2706 pSesInfo = NULL;
2499 srvTcp = NULL; 2707 srvTcp = NULL;
2500 full_path = NULL; 2708 full_path = NULL;
2709 tlink = NULL;
2501 2710
2502 xid = GetXid(); 2711 xid = GetXid();
2503 2712
@@ -2573,8 +2782,6 @@ try_mount_again:
2573 goto remote_path_check; 2782 goto remote_path_check;
2574 } 2783 }
2575 2784
2576 cifs_sb->tcon = tcon;
2577
2578 /* do not care if following two calls succeed - informational */ 2785 /* do not care if following two calls succeed - informational */
2579 if (!tcon->ipc) { 2786 if (!tcon->ipc) {
2580 CIFSSMBQFSDeviceInfo(xid, tcon); 2787 CIFSSMBQFSDeviceInfo(xid, tcon);
@@ -2683,6 +2890,38 @@ remote_path_check:
2683#endif 2890#endif
2684 } 2891 }
2685 2892
2893 if (rc)
2894 goto mount_fail_check;
2895
2896 /* now, hang the tcon off of the superblock */
2897 tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
2898 if (tlink == NULL) {
2899 rc = -ENOMEM;
2900 goto mount_fail_check;
2901 }
2902
2903 tlink->tl_index = pSesInfo->linux_uid;
2904 tlink->tl_tcon = tcon;
2905 tlink->tl_time = jiffies;
2906 set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
2907 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
2908
2909 rc = radix_tree_preload(GFP_KERNEL);
2910 if (rc == -ENOMEM) {
2911 kfree(tlink);
2912 goto mount_fail_check;
2913 }
2914
2915 spin_lock(&cifs_sb->tlink_tree_lock);
2916 radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink);
2917 radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
2918 CIFS_TLINK_MASTER_TAG);
2919 spin_unlock(&cifs_sb->tlink_tree_lock);
2920 radix_tree_preload_end();
2921
2922 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
2923 TLINK_IDLE_EXPIRE);
2924
2686mount_fail_check: 2925mount_fail_check:
2687 /* on error free sesinfo and tcon struct if needed */ 2926 /* on error free sesinfo and tcon struct if needed */
2688 if (rc) { 2927 if (rc) {
@@ -2760,14 +2999,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2760#ifdef CONFIG_CIFS_WEAK_PW_HASH 2999#ifdef CONFIG_CIFS_WEAK_PW_HASH
2761 if ((global_secflags & CIFSSEC_MAY_LANMAN) && 3000 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2762 (ses->server->secType == LANMAN)) 3001 (ses->server->secType == LANMAN))
2763 calc_lanman_hash(tcon->password, ses->server->cryptKey, 3002 calc_lanman_hash(tcon->password, ses->server->cryptkey,
2764 ses->server->secMode & 3003 ses->server->secMode &
2765 SECMODE_PW_ENCRYPT ? true : false, 3004 SECMODE_PW_ENCRYPT ? true : false,
2766 bcc_ptr); 3005 bcc_ptr);
2767 else 3006 else
2768#endif /* CIFS_WEAK_PW_HASH */ 3007#endif /* CIFS_WEAK_PW_HASH */
2769 SMBNTencrypt(tcon->password, ses->server->cryptKey, 3008 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
2770 bcc_ptr);
2771 3009
2772 bcc_ptr += CIFS_SESS_KEY_SIZE; 3010 bcc_ptr += CIFS_SESS_KEY_SIZE;
2773 if (ses->capabilities & CAP_UNICODE) { 3011 if (ses->capabilities & CAP_UNICODE) {
@@ -2869,19 +3107,39 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2869int 3107int
2870cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) 3108cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2871{ 3109{
2872 int rc = 0; 3110 int i, ret;
2873 char *tmp; 3111 char *tmp;
3112 struct tcon_link *tlink[8];
3113 unsigned long index = 0;
3114
3115 cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
3116
3117 do {
3118 spin_lock(&cifs_sb->tlink_tree_lock);
3119 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
3120 (void **)tlink, index,
3121 ARRAY_SIZE(tlink));
3122 /* increment index for next pass */
3123 if (ret > 0)
3124 index = tlink[ret - 1]->tl_index + 1;
3125 for (i = 0; i < ret; i++) {
3126 cifs_get_tlink(tlink[i]);
3127 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
3128 radix_tree_delete(&cifs_sb->tlink_tree,
3129 tlink[i]->tl_index);
3130 }
3131 spin_unlock(&cifs_sb->tlink_tree_lock);
2874 3132
2875 if (cifs_sb->tcon) 3133 for (i = 0; i < ret; i++)
2876 cifs_put_tcon(cifs_sb->tcon); 3134 cifs_put_tlink(tlink[i]);
3135 } while (ret != 0);
2877 3136
2878 cifs_sb->tcon = NULL;
2879 tmp = cifs_sb->prepath; 3137 tmp = cifs_sb->prepath;
2880 cifs_sb->prepathlen = 0; 3138 cifs_sb->prepathlen = 0;
2881 cifs_sb->prepath = NULL; 3139 cifs_sb->prepath = NULL;
2882 kfree(tmp); 3140 kfree(tmp);
2883 3141
2884 return rc; 3142 return 0;
2885} 3143}
2886 3144
2887int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses) 3145int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
@@ -2932,6 +3190,16 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2932 if (rc) { 3190 if (rc) {
2933 cERROR(1, "Send error in SessSetup = %d", rc); 3191 cERROR(1, "Send error in SessSetup = %d", rc);
2934 } else { 3192 } else {
3193 mutex_lock(&ses->server->srv_mutex);
3194 if (!server->session_estab) {
3195 server->session_key.response = ses->auth_key.response;
3196 server->session_key.len = ses->auth_key.len;
3197 server->sequence_number = 0x2;
3198 server->session_estab = true;
3199 ses->auth_key.response = NULL;
3200 }
3201 mutex_unlock(&server->srv_mutex);
3202
2935 cFYI(1, "CIFS Session Established successfully"); 3203 cFYI(1, "CIFS Session Established successfully");
2936 spin_lock(&GlobalMid_Lock); 3204 spin_lock(&GlobalMid_Lock);
2937 ses->status = CifsGood; 3205 ses->status = CifsGood;
@@ -2939,6 +3207,246 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2939 spin_unlock(&GlobalMid_Lock); 3207 spin_unlock(&GlobalMid_Lock);
2940 } 3208 }
2941 3209
3210 kfree(ses->auth_key.response);
3211 ses->auth_key.response = NULL;
3212 ses->auth_key.len = 0;
3213 kfree(ses->ntlmssp);
3214 ses->ntlmssp = NULL;
3215
2942 return rc; 3216 return rc;
2943} 3217}
2944 3218
3219static struct cifsTconInfo *
3220cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
3221{
3222 struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
3223 struct cifsSesInfo *ses;
3224 struct cifsTconInfo *tcon = NULL;
3225 struct smb_vol *vol_info;
3226 char username[MAX_USERNAME_SIZE + 1];
3227
3228 vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
3229 if (vol_info == NULL) {
3230 tcon = ERR_PTR(-ENOMEM);
3231 goto out;
3232 }
3233
3234 snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid);
3235 vol_info->username = username;
3236 vol_info->local_nls = cifs_sb->local_nls;
3237 vol_info->linux_uid = fsuid;
3238 vol_info->cred_uid = fsuid;
3239 vol_info->UNC = master_tcon->treeName;
3240 vol_info->retry = master_tcon->retry;
3241 vol_info->nocase = master_tcon->nocase;
3242 vol_info->local_lease = master_tcon->local_lease;
3243 vol_info->no_linux_ext = !master_tcon->unix_ext;
3244
3245 /* FIXME: allow for other secFlg settings */
3246 vol_info->secFlg = CIFSSEC_MUST_KRB5;
3247
3248 /* get a reference for the same TCP session */
3249 spin_lock(&cifs_tcp_ses_lock);
3250 ++master_tcon->ses->server->srv_count;
3251 spin_unlock(&cifs_tcp_ses_lock);
3252
3253 ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
3254 if (IS_ERR(ses)) {
3255 tcon = (struct cifsTconInfo *)ses;
3256 cifs_put_tcp_session(master_tcon->ses->server);
3257 goto out;
3258 }
3259
3260 tcon = cifs_get_tcon(ses, vol_info);
3261 if (IS_ERR(tcon)) {
3262 cifs_put_smb_ses(ses);
3263 goto out;
3264 }
3265
3266 if (ses->capabilities & CAP_UNIX)
3267 reset_cifs_unix_caps(0, tcon, NULL, vol_info);
3268out:
3269 kfree(vol_info);
3270
3271 return tcon;
3272}
3273
3274static struct tcon_link *
3275cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
3276{
3277 struct tcon_link *tlink;
3278 unsigned int ret;
3279
3280 spin_lock(&cifs_sb->tlink_tree_lock);
3281 ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
3282 0, 1, CIFS_TLINK_MASTER_TAG);
3283 spin_unlock(&cifs_sb->tlink_tree_lock);
3284
3285 /* the master tcon should always be present */
3286 if (ret == 0)
3287 BUG();
3288
3289 return tlink;
3290}
3291
3292struct cifsTconInfo *
3293cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
3294{
3295 return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
3296}
3297
3298static int
3299cifs_sb_tcon_pending_wait(void *unused)
3300{
3301 schedule();
3302 return signal_pending(current) ? -ERESTARTSYS : 0;
3303}
3304
3305/*
3306 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
3307 * current task.
3308 *
3309 * If the superblock doesn't refer to a multiuser mount, then just return
3310 * the master tcon for the mount.
3311 *
3312 * First, search the radix tree for an existing tcon for this fsuid. If one
3313 * exists, then check to see if it's pending construction. If it is then wait
3314 * for construction to complete. Once it's no longer pending, check to see if
3315 * it failed and either return an error or retry construction, depending on
3316 * the timeout.
3317 *
3318 * If one doesn't exist then insert a new tcon_link struct into the tree and
3319 * try to construct a new one.
3320 */
3321struct tcon_link *
3322cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3323{
3324 int ret;
3325 unsigned long fsuid = (unsigned long) current_fsuid();
3326 struct tcon_link *tlink, *newtlink;
3327
3328 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
3329 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
3330
3331 spin_lock(&cifs_sb->tlink_tree_lock);
3332 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
3333 if (tlink)
3334 cifs_get_tlink(tlink);
3335 spin_unlock(&cifs_sb->tlink_tree_lock);
3336
3337 if (tlink == NULL) {
3338 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
3339 if (newtlink == NULL)
3340 return ERR_PTR(-ENOMEM);
3341 newtlink->tl_index = fsuid;
3342 newtlink->tl_tcon = ERR_PTR(-EACCES);
3343 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
3344 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
3345 cifs_get_tlink(newtlink);
3346
3347 ret = radix_tree_preload(GFP_KERNEL);
3348 if (ret != 0) {
3349 kfree(newtlink);
3350 return ERR_PTR(ret);
3351 }
3352
3353 spin_lock(&cifs_sb->tlink_tree_lock);
3354 /* was one inserted after previous search? */
3355 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
3356 if (tlink) {
3357 cifs_get_tlink(tlink);
3358 spin_unlock(&cifs_sb->tlink_tree_lock);
3359 radix_tree_preload_end();
3360 kfree(newtlink);
3361 goto wait_for_construction;
3362 }
3363 ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
3364 spin_unlock(&cifs_sb->tlink_tree_lock);
3365 radix_tree_preload_end();
3366 if (ret) {
3367 kfree(newtlink);
3368 return ERR_PTR(ret);
3369 }
3370 tlink = newtlink;
3371 } else {
3372wait_for_construction:
3373 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
3374 cifs_sb_tcon_pending_wait,
3375 TASK_INTERRUPTIBLE);
3376 if (ret) {
3377 cifs_put_tlink(tlink);
3378 return ERR_PTR(ret);
3379 }
3380
3381 /* if it's good, return it */
3382 if (!IS_ERR(tlink->tl_tcon))
3383 return tlink;
3384
3385 /* return error if we tried this already recently */
3386 if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
3387 cifs_put_tlink(tlink);
3388 return ERR_PTR(-EACCES);
3389 }
3390
3391 if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
3392 goto wait_for_construction;
3393 }
3394
3395 tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid);
3396 clear_bit(TCON_LINK_PENDING, &tlink->tl_flags);
3397 wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
3398
3399 if (IS_ERR(tlink->tl_tcon)) {
3400 cifs_put_tlink(tlink);
3401 return ERR_PTR(-EACCES);
3402 }
3403
3404 return tlink;
3405}
3406
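
cifs_sb_tlink() above is an optimistic find-or-create: look up first without allocating, allocate outside the lock, re-check under the lock in case another task raced in, and park latecomers on the PENDING bit until construction completes. Stripped to its shape in user space, with hypothetical names, a mutex standing in for the spinlock, and the wait-bit machinery elided:

#include <pthread.h>
#include <stdlib.h>

struct link_sketch { unsigned long key; int pending; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct link_sketch *tree[64]; /* toy keyed store; collisions ignored */

static struct link_sketch *find_or_create(unsigned long key)
{
    struct link_sketch *l, *nl;

    pthread_mutex_lock(&tree_lock);
    l = tree[key % 64];
    pthread_mutex_unlock(&tree_lock);
    if (l)
        return l;                    /* fast path: already present */

    nl = calloc(1, sizeof(*nl));     /* allocate outside the lock */
    if (!nl)
        return NULL;
    nl->key = key;
    nl->pending = 1;                 /* construction not done yet */

    pthread_mutex_lock(&tree_lock);
    l = tree[key % 64];              /* did someone beat us to it? */
    if (l) {
        pthread_mutex_unlock(&tree_lock);
        free(nl);                    /* lost the race, use theirs */
        return l;
    }
    tree[key % 64] = nl;
    pthread_mutex_unlock(&tree_lock);

    /* ... expensive construction happens here, then ... */
    nl->pending = 0;                 /* kernel: clear bit + wake_up_bit */
    return nl;
}

int main(void)
{
    return find_or_create(1000) ? 0 : 1;
}
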
3407/*
 3408 * periodic workqueue job that scans a superblock's tlink_tree and
 3409 * closes out unused tcons.
3410 */
3411static void
3412cifs_prune_tlinks(struct work_struct *work)
3413{
3414 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
3415 prune_tlinks.work);
3416 struct tcon_link *tlink[8];
3417 unsigned long now = jiffies;
3418 unsigned long index = 0;
3419 int i, ret;
3420
3421 do {
3422 spin_lock(&cifs_sb->tlink_tree_lock);
3423 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
3424 (void **)tlink, index,
3425 ARRAY_SIZE(tlink));
3426 /* increment index for next pass */
3427 if (ret > 0)
3428 index = tlink[ret - 1]->tl_index + 1;
3429 for (i = 0; i < ret; i++) {
3430 if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) ||
3431 atomic_read(&tlink[i]->tl_count) != 0 ||
3432 time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE,
3433 now)) {
3434 tlink[i] = NULL;
3435 continue;
3436 }
3437 cifs_get_tlink(tlink[i]);
3438 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
3439 radix_tree_delete(&cifs_sb->tlink_tree,
3440 tlink[i]->tl_index);
3441 }
3442 spin_unlock(&cifs_sb->tlink_tree_lock);
3443
3444 for (i = 0; i < ret; i++) {
3445 if (tlink[i] != NULL)
3446 cifs_put_tlink(tlink[i]);
3447 }
3448 } while (ret != 0);
3449
3450 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
3451 TLINK_IDLE_EXPIRE);
3452}
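
Both cifs_umount() and cifs_prune_tlinks() walk the radix tree in batches of eight, resuming from one past the last index seen, because entries can be deleted and the lock dropped between passes. The same resumable-cursor pattern, sketched over a plain array standing in for the tree:

#include <stdio.h>

#define BATCH 8

/* toy "tree": slot i holds key i when nonzero (0 = empty slot) */
static int slots[32];

/* gather up to 'max' live entries with key >= 'start' */
static int gang_lookup(int start, int *out, int max)
{
    int n = 0, i;

    for (i = start; i < 32 && n < max; i++)
        if (slots[i])
            out[n++] = i;
    return n;
}

int main(void)
{
    int batch[BATCH], index = 0, ret, i;

    for (i = 0; i < 32; i += 3)
        slots[i] = 1; /* populate a few entries */

    do {
        ret = gang_lookup(index, batch, BATCH);
        if (ret > 0)
            index = batch[ret - 1] + 1; /* resume past last key seen */
        for (i = 0; i < ret; i++) {
            printf("pruning key %d\n", batch[i]);
            slots[batch[i]] = 0;
        }
    } while (ret != 0);
    return 0;
}
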
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e7ae78b66fa..3840eddbfb7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -54,18 +54,18 @@ build_path_from_dentry(struct dentry *direntry)
54 int dfsplen; 54 int dfsplen;
55 char *full_path; 55 char *full_path;
56 char dirsep; 56 char dirsep;
57 struct cifs_sb_info *cifs_sb; 57 struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
58 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
58 59
59 if (direntry == NULL) 60 if (direntry == NULL)
60 return NULL; /* not much we can do if dentry is freed and 61 return NULL; /* not much we can do if dentry is freed and
61 we need to reopen the file after it was closed implicitly 62 we need to reopen the file after it was closed implicitly
62 when the server crashed */ 63 when the server crashed */
63 64
64 cifs_sb = CIFS_SB(direntry->d_sb);
65 dirsep = CIFS_DIR_SEP(cifs_sb); 65 dirsep = CIFS_DIR_SEP(cifs_sb);
66 pplen = cifs_sb->prepathlen; 66 pplen = cifs_sb->prepathlen;
67 if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS)) 67 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
68 dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1); 68 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
69 else 69 else
70 dfsplen = 0; 70 dfsplen = 0;
71cifs_bp_rename_retry: 71cifs_bp_rename_retry:
@@ -117,7 +117,7 @@ cifs_bp_rename_retry:
117 /* BB test paths to Windows with '/' in the midst of prepath */ 117 /* BB test paths to Windows with '/' in the midst of prepath */
118 118
119 if (dfsplen) { 119 if (dfsplen) {
120 strncpy(full_path, cifs_sb->tcon->treeName, dfsplen); 120 strncpy(full_path, tcon->treeName, dfsplen);
121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { 121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
122 int i; 122 int i;
123 for (i = 0; i < dfsplen; i++) { 123 for (i = 0; i < dfsplen; i++) {
@@ -130,141 +130,6 @@ cifs_bp_rename_retry:
130 return full_path; 130 return full_path;
131} 131}
132 132
133/*
134 * When called with struct file pointer set to NULL, there is no way we could
135 * update file->private_data, but getting it stuck on openFileList provides a
136 * way to access it from cifs_fill_filedata and thereby set file->private_data
137 * from cifs_open.
138 */
139struct cifsFileInfo *
140cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
141 struct file *file, struct vfsmount *mnt, unsigned int oflags)
142{
143 int oplock = 0;
144 struct cifsFileInfo *pCifsFile;
145 struct cifsInodeInfo *pCifsInode;
146 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
147
148 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
149 if (pCifsFile == NULL)
150 return pCifsFile;
151
152 if (oplockEnabled)
153 oplock = REQ_OPLOCK;
154
155 pCifsFile->netfid = fileHandle;
156 pCifsFile->pid = current->tgid;
157 pCifsFile->pInode = igrab(newinode);
158 pCifsFile->mnt = mnt;
159 pCifsFile->pfile = file;
160 pCifsFile->invalidHandle = false;
161 pCifsFile->closePend = false;
162 mutex_init(&pCifsFile->fh_mutex);
163 mutex_init(&pCifsFile->lock_mutex);
164 INIT_LIST_HEAD(&pCifsFile->llist);
165 atomic_set(&pCifsFile->count, 1);
166 slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops);
167
168 write_lock(&GlobalSMBSeslock);
169 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
170 pCifsInode = CIFS_I(newinode);
171 if (pCifsInode) {
172 /* if readable file instance put first in list*/
173 if (oflags & FMODE_READ)
174 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
175 else
176 list_add_tail(&pCifsFile->flist,
177 &pCifsInode->openFileList);
178
179 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
180 pCifsInode->clientCanCacheAll = true;
181 pCifsInode->clientCanCacheRead = true;
182 cFYI(1, "Exclusive Oplock inode %p", newinode);
183 } else if ((oplock & 0xF) == OPLOCK_READ)
184 pCifsInode->clientCanCacheRead = true;
185 }
186 write_unlock(&GlobalSMBSeslock);
187
188 file->private_data = pCifsFile;
189
190 return pCifsFile;
191}
192
193int cifs_posix_open(char *full_path, struct inode **pinode,
194 struct super_block *sb, int mode, int oflags,
195 __u32 *poplock, __u16 *pnetfid, int xid)
196{
197 int rc;
198 FILE_UNIX_BASIC_INFO *presp_data;
199 __u32 posix_flags = 0;
200 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
201 struct cifs_fattr fattr;
202
203 cFYI(1, "posix open %s", full_path);
204
205 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
206 if (presp_data == NULL)
207 return -ENOMEM;
208
209/* So far cifs posix extensions can only map the following flags.
210 There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
211 so far we do not seem to need them, and we can treat them as local only */
212 if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
213 (FMODE_READ | FMODE_WRITE))
214 posix_flags = SMB_O_RDWR;
215 else if (oflags & FMODE_READ)
216 posix_flags = SMB_O_RDONLY;
217 else if (oflags & FMODE_WRITE)
218 posix_flags = SMB_O_WRONLY;
219 if (oflags & O_CREAT)
220 posix_flags |= SMB_O_CREAT;
221 if (oflags & O_EXCL)
222 posix_flags |= SMB_O_EXCL;
223 if (oflags & O_TRUNC)
224 posix_flags |= SMB_O_TRUNC;
225 /* be safe and imply O_SYNC for O_DSYNC */
226 if (oflags & O_DSYNC)
227 posix_flags |= SMB_O_SYNC;
228 if (oflags & O_DIRECTORY)
229 posix_flags |= SMB_O_DIRECTORY;
230 if (oflags & O_NOFOLLOW)
231 posix_flags |= SMB_O_NOFOLLOW;
232 if (oflags & O_DIRECT)
233 posix_flags |= SMB_O_DIRECT;
234
235 mode &= ~current_umask();
236 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
237 pnetfid, presp_data, poplock, full_path,
238 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
239 CIFS_MOUNT_MAP_SPECIAL_CHR);
240 if (rc)
241 goto posix_open_ret;
242
243 if (presp_data->Type == cpu_to_le32(-1))
244 goto posix_open_ret; /* open ok, caller does qpathinfo */
245
246 if (!pinode)
247 goto posix_open_ret; /* caller does not need info */
248
249 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
250
251 /* get new inode and set it up */
252 if (*pinode == NULL) {
253 cifs_fill_uniqueid(sb, &fattr);
254 *pinode = cifs_iget(sb, &fattr);
255 if (!*pinode) {
256 rc = -ENOMEM;
257 goto posix_open_ret;
258 }
259 } else {
260 cifs_fattr_to_inode(*pinode, &fattr);
261 }
262
263posix_open_ret:
264 kfree(presp_data);
265 return rc;
266}
267
268static void setup_cifs_dentry(struct cifsTconInfo *tcon, 133static void setup_cifs_dentry(struct cifsTconInfo *tcon,
269 struct dentry *direntry, 134 struct dentry *direntry,
270 struct inode *newinode) 135 struct inode *newinode)
@@ -297,6 +162,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
297 int desiredAccess = GENERIC_READ | GENERIC_WRITE; 162 int desiredAccess = GENERIC_READ | GENERIC_WRITE;
298 __u16 fileHandle; 163 __u16 fileHandle;
299 struct cifs_sb_info *cifs_sb; 164 struct cifs_sb_info *cifs_sb;
165 struct tcon_link *tlink;
300 struct cifsTconInfo *tcon; 166 struct cifsTconInfo *tcon;
301 char *full_path = NULL; 167 char *full_path = NULL;
302 FILE_ALL_INFO *buf = NULL; 168 FILE_ALL_INFO *buf = NULL;
@@ -306,22 +172,26 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
306 xid = GetXid(); 172 xid = GetXid();
307 173
308 cifs_sb = CIFS_SB(inode->i_sb); 174 cifs_sb = CIFS_SB(inode->i_sb);
309 tcon = cifs_sb->tcon; 175 tlink = cifs_sb_tlink(cifs_sb);
310 176 if (IS_ERR(tlink)) {
311 full_path = build_path_from_dentry(direntry);
312 if (full_path == NULL) {
313 rc = -ENOMEM;
314 FreeXid(xid); 177 FreeXid(xid);
315 return rc; 178 return PTR_ERR(tlink);
316 } 179 }
180 tcon = tlink_tcon(tlink);
317 181
318 if (oplockEnabled) 182 if (oplockEnabled)
319 oplock = REQ_OPLOCK; 183 oplock = REQ_OPLOCK;
320 184
321 if (nd && (nd->flags & LOOKUP_OPEN)) 185 if (nd && (nd->flags & LOOKUP_OPEN))
322 oflags = nd->intent.open.flags; 186 oflags = nd->intent.open.file->f_flags;
323 else 187 else
324 oflags = FMODE_READ | SMB_O_CREAT; 188 oflags = O_RDONLY | O_CREAT;
189
190 full_path = build_path_from_dentry(direntry);
191 if (full_path == NULL) {
192 rc = -ENOMEM;
193 goto cifs_create_out;
194 }
325 195
326 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 196 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
327 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 197 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -351,9 +221,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
351 /* if the file is going to stay open, then we 221 /* if the file is going to stay open, then we
352 need to set the desired access properly */ 222 need to set the desired access properly */
353 desiredAccess = 0; 223 desiredAccess = 0;
354 if (oflags & FMODE_READ) 224 if (OPEN_FMODE(oflags) & FMODE_READ)
355 desiredAccess |= GENERIC_READ; /* is this too little? */ 225 desiredAccess |= GENERIC_READ; /* is this too little? */
356 if (oflags & FMODE_WRITE) 226 if (OPEN_FMODE(oflags) & FMODE_WRITE)
357 desiredAccess |= GENERIC_WRITE; 227 desiredAccess |= GENERIC_WRITE;
358 228
359 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 229 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -371,9 +241,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
371 241
372 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 242 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
373 if (buf == NULL) { 243 if (buf == NULL) {
374 kfree(full_path); 244 rc = -ENOMEM;
375 FreeXid(xid); 245 goto cifs_create_out;
376 return -ENOMEM;
377 } 246 }
378 247
379 /* 248 /*
@@ -383,7 +252,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
383 if (!tcon->unix_ext && (mode & S_IWUGO) == 0) 252 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
384 create_options |= CREATE_OPTION_READONLY; 253 create_options |= CREATE_OPTION_READONLY;
385 254
386 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 255 if (tcon->ses->capabilities & CAP_NT_SMBS)
387 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 256 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
388 desiredAccess, create_options, 257 desiredAccess, create_options,
389 &fileHandle, &oplock, buf, cifs_sb->local_nls, 258 &fileHandle, &oplock, buf, cifs_sb->local_nls,
@@ -475,8 +344,7 @@ cifs_create_set_dentry:
475 goto cifs_create_out; 344 goto cifs_create_out;
476 } 345 }
477 346
478 pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp, 347 pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock);
479 nd->path.mnt, oflags);
480 if (pfile_info == NULL) { 348 if (pfile_info == NULL) {
481 fput(filp); 349 fput(filp);
482 CIFSSMBClose(xid, tcon, fileHandle); 350 CIFSSMBClose(xid, tcon, fileHandle);
@@ -489,6 +357,7 @@ cifs_create_set_dentry:
489cifs_create_out: 357cifs_create_out:
490 kfree(buf); 358 kfree(buf);
491 kfree(full_path); 359 kfree(full_path);
360 cifs_put_tlink(tlink);
492 FreeXid(xid); 361 FreeXid(xid);
493 return rc; 362 return rc;
494} 363}
@@ -499,22 +368,35 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
499 int rc = -EPERM; 368 int rc = -EPERM;
500 int xid; 369 int xid;
501 struct cifs_sb_info *cifs_sb; 370 struct cifs_sb_info *cifs_sb;
371 struct tcon_link *tlink;
502 struct cifsTconInfo *pTcon; 372 struct cifsTconInfo *pTcon;
503 char *full_path = NULL; 373 char *full_path = NULL;
504 struct inode *newinode = NULL; 374 struct inode *newinode = NULL;
375 int oplock = 0;
376 u16 fileHandle;
377 FILE_ALL_INFO *buf = NULL;
378 unsigned int bytes_written;
379 struct win_dev *pdev;
505 380
506 if (!old_valid_dev(device_number)) 381 if (!old_valid_dev(device_number))
507 return -EINVAL; 382 return -EINVAL;
508 383
509 xid = GetXid();
510
511 cifs_sb = CIFS_SB(inode->i_sb); 384 cifs_sb = CIFS_SB(inode->i_sb);
512 pTcon = cifs_sb->tcon; 385 tlink = cifs_sb_tlink(cifs_sb);
386 if (IS_ERR(tlink))
387 return PTR_ERR(tlink);
388
389 pTcon = tlink_tcon(tlink);
390
391 xid = GetXid();
513 392
514 full_path = build_path_from_dentry(direntry); 393 full_path = build_path_from_dentry(direntry);
515 if (full_path == NULL) 394 if (full_path == NULL) {
516 rc = -ENOMEM; 395 rc = -ENOMEM;
517 else if (pTcon->unix_ext) { 396 goto mknod_out;
397 }
398
399 if (pTcon->unix_ext) {
518 struct cifs_unix_set_info_args args = { 400 struct cifs_unix_set_info_args args = {
519 .mode = mode & ~current_umask(), 401 .mode = mode & ~current_umask(),
520 .ctime = NO_CHANGE_64, 402 .ctime = NO_CHANGE_64,
@@ -533,88 +415,80 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
533 cifs_sb->local_nls, 415 cifs_sb->local_nls,
534 cifs_sb->mnt_cifs_flags & 416 cifs_sb->mnt_cifs_flags &
535 CIFS_MOUNT_MAP_SPECIAL_CHR); 417 CIFS_MOUNT_MAP_SPECIAL_CHR);
418 if (rc)
419 goto mknod_out;
536 420
537 if (!rc) { 421 rc = cifs_get_inode_info_unix(&newinode, full_path,
538 rc = cifs_get_inode_info_unix(&newinode, full_path,
539 inode->i_sb, xid); 422 inode->i_sb, xid);
540 if (pTcon->nocase) 423 if (pTcon->nocase)
541 direntry->d_op = &cifs_ci_dentry_ops; 424 direntry->d_op = &cifs_ci_dentry_ops;
542 else 425 else
543 direntry->d_op = &cifs_dentry_ops; 426 direntry->d_op = &cifs_dentry_ops;
544 if (rc == 0)
545 d_instantiate(direntry, newinode);
546 }
547 } else {
548 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
549 int oplock = 0;
550 u16 fileHandle;
551 FILE_ALL_INFO *buf;
552 427
553 cFYI(1, "sfu compat create special file"); 428 if (rc == 0)
429 d_instantiate(direntry, newinode);
430 goto mknod_out;
431 }
554 432
555 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 433 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
556 if (buf == NULL) { 434 goto mknod_out;
557 kfree(full_path);
558 rc = -ENOMEM;
559 FreeXid(xid);
560 return rc;
561 }
562 435
563 rc = CIFSSMBOpen(xid, pTcon, full_path, 436
564 FILE_CREATE, /* fail if exists */ 437 cFYI(1, "sfu compat create special file");
565 GENERIC_WRITE /* BB would 438
566 WRITE_OWNER | WRITE_DAC be better? */, 439 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
567 /* Create a file and set the 440 if (buf == NULL) {
568 file attribute to SYSTEM */ 441 kfree(full_path);
569 CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, 442 rc = -ENOMEM;
570 &fileHandle, &oplock, buf, 443 FreeXid(xid);
571 cifs_sb->local_nls, 444 return rc;
572 cifs_sb->mnt_cifs_flags &
573 CIFS_MOUNT_MAP_SPECIAL_CHR);
574
575 /* BB FIXME - add handling for backlevel servers
576 which need legacy open and check for all
577 calls to SMBOpen for fallback to SMBLeagcyOpen */
578 if (!rc) {
579 /* BB Do not bother to decode buf since no
580 local inode yet to put timestamps in,
581 but we can reuse it safely */
582 unsigned int bytes_written;
583 struct win_dev *pdev;
584 pdev = (struct win_dev *)buf;
585 if (S_ISCHR(mode)) {
586 memcpy(pdev->type, "IntxCHR", 8);
587 pdev->major =
588 cpu_to_le64(MAJOR(device_number));
589 pdev->minor =
590 cpu_to_le64(MINOR(device_number));
591 rc = CIFSSMBWrite(xid, pTcon,
592 fileHandle,
593 sizeof(struct win_dev),
594 0, &bytes_written, (char *)pdev,
595 NULL, 0);
596 } else if (S_ISBLK(mode)) {
597 memcpy(pdev->type, "IntxBLK", 8);
598 pdev->major =
599 cpu_to_le64(MAJOR(device_number));
600 pdev->minor =
601 cpu_to_le64(MINOR(device_number));
602 rc = CIFSSMBWrite(xid, pTcon,
603 fileHandle,
604 sizeof(struct win_dev),
605 0, &bytes_written, (char *)pdev,
606 NULL, 0);
607 } /* else if(S_ISFIFO */
608 CIFSSMBClose(xid, pTcon, fileHandle);
609 d_drop(direntry);
610 }
611 kfree(buf);
612 /* add code here to set EAs */
613 }
614 } 445 }
615 446
447 /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
448 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
449 GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
450 &fileHandle, &oplock, buf, cifs_sb->local_nls,
451 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
452 if (rc)
453 goto mknod_out;
454
455 /* BB Do not bother to decode buf since no local inode yet to put
456 * timestamps in, but we can reuse it safely */
457
458 pdev = (struct win_dev *)buf;
459 if (S_ISCHR(mode)) {
460 memcpy(pdev->type, "IntxCHR", 8);
461 pdev->major =
462 cpu_to_le64(MAJOR(device_number));
463 pdev->minor =
464 cpu_to_le64(MINOR(device_number));
465 rc = CIFSSMBWrite(xid, pTcon,
466 fileHandle,
467 sizeof(struct win_dev),
468 0, &bytes_written, (char *)pdev,
469 NULL, 0);
470 } else if (S_ISBLK(mode)) {
471 memcpy(pdev->type, "IntxBLK", 8);
472 pdev->major =
473 cpu_to_le64(MAJOR(device_number));
474 pdev->minor =
475 cpu_to_le64(MINOR(device_number));
476 rc = CIFSSMBWrite(xid, pTcon,
477 fileHandle,
478 sizeof(struct win_dev),
479 0, &bytes_written, (char *)pdev,
480 NULL, 0);
481 } /* else if (S_ISFIFO) */
482 CIFSSMBClose(xid, pTcon, fileHandle);
483 d_drop(direntry);
484
485 /* FIXME: add code here to set EAs */
486
487mknod_out:
616 kfree(full_path); 488 kfree(full_path);
489 kfree(buf);
617 FreeXid(xid); 490 FreeXid(xid);
491 cifs_put_tlink(tlink);
618 return rc; 492 return rc;
619} 493}
620 494
@@ -628,6 +502,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
628 __u16 fileHandle = 0; 502 __u16 fileHandle = 0;
629 bool posix_open = false; 503 bool posix_open = false;
630 struct cifs_sb_info *cifs_sb; 504 struct cifs_sb_info *cifs_sb;
505 struct tcon_link *tlink;
631 struct cifsTconInfo *pTcon; 506 struct cifsTconInfo *pTcon;
632 struct cifsFileInfo *cfile; 507 struct cifsFileInfo *cfile;
633 struct inode *newInode = NULL; 508 struct inode *newInode = NULL;
@@ -642,7 +517,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
642 /* check whether path exists */ 517 /* check whether path exists */
643 518
644 cifs_sb = CIFS_SB(parent_dir_inode->i_sb); 519 cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
645 pTcon = cifs_sb->tcon; 520 tlink = cifs_sb_tlink(cifs_sb);
521 if (IS_ERR(tlink)) {
522 FreeXid(xid);
523 return (struct dentry *)tlink;
524 }
525 pTcon = tlink_tcon(tlink);
646 526
647 /* 527 /*
648 * Don't allow the separator character in a path component. 528 * Don't allow the separator character in a path component.
@@ -653,8 +533,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
653 for (i = 0; i < direntry->d_name.len; i++) 533 for (i = 0; i < direntry->d_name.len; i++)
654 if (direntry->d_name.name[i] == '\\') { 534 if (direntry->d_name.name[i] == '\\') {
655 cFYI(1, "Invalid file name"); 535 cFYI(1, "Invalid file name");
656 FreeXid(xid); 536 rc = -EINVAL;
657 return ERR_PTR(-EINVAL); 537 goto lookup_out;
658 } 538 }
659 } 539 }
660 540
@@ -664,7 +544,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
664 */ 544 */
665 if (nd && (nd->flags & LOOKUP_EXCL)) { 545 if (nd && (nd->flags & LOOKUP_EXCL)) {
666 d_instantiate(direntry, NULL); 546 d_instantiate(direntry, NULL);
667 return NULL; 547 rc = 0;
548 goto lookup_out;
668 } 549 }
669 550
670 /* can not grab the rename sem here since it would 551 /* can not grab the rename sem here since it would
@@ -672,8 +553,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
672 in which we already have the sb rename sem */ 553 in which we already have the sb rename sem */
673 full_path = build_path_from_dentry(direntry); 554 full_path = build_path_from_dentry(direntry);
674 if (full_path == NULL) { 555 if (full_path == NULL) {
675 FreeXid(xid); 556 rc = -ENOMEM;
676 return ERR_PTR(-ENOMEM); 557 goto lookup_out;
677 } 558 }
678 559
679 if (direntry->d_inode != NULL) { 560 if (direntry->d_inode != NULL) {
@@ -696,11 +577,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
696 if (pTcon->unix_ext) { 577 if (pTcon->unix_ext) {
697 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && 578 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
698 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 579 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
699 (nd->intent.open.flags & O_CREAT)) { 580 (nd->intent.open.file->f_flags & O_CREAT)) {
700 rc = cifs_posix_open(full_path, &newInode, 581 rc = cifs_posix_open(full_path, &newInode,
701 parent_dir_inode->i_sb, 582 parent_dir_inode->i_sb,
702 nd->intent.open.create_mode, 583 nd->intent.open.create_mode,
703 nd->intent.open.flags, &oplock, 584 nd->intent.open.file->f_flags, &oplock,
704 &fileHandle, xid); 585 &fileHandle, xid);
705 /* 586 /*
706 * The check below works around a bug in POSIX 587 * The check below works around a bug in POSIX
@@ -736,9 +617,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
736 goto lookup_out; 617 goto lookup_out;
737 } 618 }
738 619
739 cfile = cifs_new_fileinfo(newInode, fileHandle, filp, 620 cfile = cifs_new_fileinfo(fileHandle, filp, tlink,
740 nd->path.mnt, 621 oplock);
741 nd->intent.open.flags);
742 if (cfile == NULL) { 622 if (cfile == NULL) {
743 fput(filp); 623 fput(filp);
744 CIFSSMBClose(xid, pTcon, fileHandle); 624 CIFSSMBClose(xid, pTcon, fileHandle);
@@ -768,6 +648,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
768 648
769lookup_out: 649lookup_out:
770 kfree(full_path); 650 kfree(full_path);
651 cifs_put_tlink(tlink);
771 FreeXid(xid); 652 FreeXid(xid);
772 return ERR_PTR(rc); 653 return ERR_PTR(rc);
773} 654}
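
Every converted operation in dir.c now follows the same bracket: obtain the tlink for the calling user up front, funnel all exits through one label, and release the tlink there. The shape of that error handling, distilled into a self-contained sketch where get_handle()/put_handle() are invented stand-ins for cifs_sb_tlink()/cifs_put_tlink():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct handle { int id; };

static struct handle *get_handle(void)
{
    return calloc(1, sizeof(struct handle));
}

static void put_handle(struct handle *h)
{
    free(h);
}

static int do_op(int fail_mid)
{
    struct handle *h;
    char *path = NULL;
    int rc = 0;

    h = get_handle();
    if (!h)
        return -ENOMEM;

    path = malloc(64);
    if (!path) {
        rc = -ENOMEM;
        goto out;      /* every failure funnels through one label */
    }
    if (fail_mid) {
        rc = -EACCES;
        goto out;
    }
    /* ... the actual operation would go here ... */
out:
    free(path);
    put_handle(h);     /* mirrors cifs_put_tlink() at lookup_out: */
    return rc;
}

int main(void)
{
    printf("ok path: %d\n", do_op(0));
    printf("err path: %d\n", do_op(1));
    return 0;
}
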
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 853a968e82d..0eb87026cad 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -4,6 +4,8 @@
4 * Copyright (c) 2007 Igor Mammedov 4 * Copyright (c) 2007 Igor Mammedov
5 * Author(s): Igor Mammedov (niallain@gmail.com) 5 * Author(s): Igor Mammedov (niallain@gmail.com)
6 * Steve French (sfrench@us.ibm.com) 6 * Steve French (sfrench@us.ibm.com)
7 * Wang Lei (wang840925@gmail.com)
8 * David Howells (dhowells@redhat.com)
7 * 9 *
8 * Contains the CIFS DFS upcall routines used for hostname to 10 * Contains the CIFS DFS upcall routines used for hostname to
9 * IP address translation. 11 * IP address translation.
@@ -24,214 +26,73 @@
24 */ 26 */
25 27
26#include <linux/slab.h> 28#include <linux/slab.h>
27#include <linux/keyctl.h> 29#include <linux/dns_resolver.h>
28#include <linux/key-type.h>
29#include <keys/user-type.h>
30#include "dns_resolve.h" 30#include "dns_resolve.h"
31#include "cifsglob.h" 31#include "cifsglob.h"
32#include "cifsproto.h" 32#include "cifsproto.h"
33#include "cifs_debug.h" 33#include "cifs_debug.h"
34 34
35static const struct cred *dns_resolver_cache; 35/**
36 36 * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
37/* Checks if supplied name is IP address 37 * @unc: UNC path specifying the server
38 * returns: 38 * @ip_addr: Where to return the IP address.
39 * 1 - name is IP 39 *
40 * 0 - name is not IP 40 * The IP address will be returned in string form, and the caller is
41 */ 41 * responsible for freeing it.
42static int 42 *
43is_ip(char *name) 43 * Returns length of result on success, -ve on error.
44{
45 struct sockaddr_storage ss;
46
47 return cifs_convert_address(name, &ss);
48}
49
50 static int
51 dns_resolver_instantiate(struct key *key, const void *data,
52 size_t datalen)
53{
54 int rc = 0;
55 char *ip;
56
57 ip = kmalloc(datalen + 1, GFP_KERNEL);
58 if (!ip)
59 return -ENOMEM;
60
61 memcpy(ip, data, datalen);
62 ip[datalen] = '\0';
63
64 /* make sure this looks like an address */
65 if (!is_ip(ip)) {
66 kfree(ip);
67 return -EINVAL;
68 }
69
70 key->type_data.x[0] = datalen;
71 key->payload.data = ip;
72
73 return rc;
74}
75
76 static void
77 dns_resolver_destroy(struct key *key)
78{
79 kfree(key->payload.data);
80}
81
82 struct key_type key_type_dns_resolver = {
83 .name = "dns_resolver",
84 .def_datalen = sizeof(struct in_addr),
85 .describe = user_describe,
86 .instantiate = dns_resolver_instantiate,
87 .destroy = dns_resolver_destroy,
88 .match = user_match,
89};
90
91/* Resolves server name to ip address.
92 * input:
93 * unc - server UNC
94 * output:
95 * *ip_addr - pointer to server ip, caller responcible for freeing it.
96 * return 0 on success
97 */ 44 */
98 int 45 int
99 dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) 46 dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
100 { 47 {
101 const struct cred *saved_cred; 48 struct sockaddr_storage ss;
102 int rc = -EAGAIN; 49 const char *hostname, *sep;
103 struct key *rkey = ERR_PTR(-EAGAIN);
104 char *name; 50 char *name;
105 char *data = NULL; 51 int len, rc;
106 int len;
107 52
108 if (!ip_addr || !unc) 53 if (!ip_addr || !unc)
109 return -EINVAL; 54 return -EINVAL;
110 55
111 /* search for server name delimiter */
112 len = strlen(unc); 56 len = strlen(unc);
113 if (len < 3) { 57 if (len < 3) {
114 cFYI(1, "%s: unc is too short: %s", __func__, unc); 58 cFYI(1, "%s: unc is too short: %s", __func__, unc);
115 return -EINVAL; 59 return -EINVAL;
116 } 60 }
117 len -= 2;
118 name = memchr(unc+2, '\\', len);
119 if (!name) {
120 cFYI(1, "%s: probably server name is whole unc: %s",
121 __func__, unc);
122 } else {
123 len = (name - unc) - 2/* leading // */;
124 }
125
126 name = kmalloc(len+1, GFP_KERNEL);
127 if (!name) {
128 rc = -ENOMEM;
129 return rc;
130 }
131 memcpy(name, unc+2, len);
132 name[len] = 0;
133
134 if (is_ip(name)) {
135 cFYI(1, "%s: it is IP, skipping dns upcall: %s",
136 __func__, name);
137 data = name;
138 goto skip_upcall;
139 }
140 61
141 saved_cred = override_creds(dns_resolver_cache); 62 /* Discount leading slashes for cifs */
142 rkey = request_key(&key_type_dns_resolver, name, ""); 63 len -= 2;
143 revert_creds(saved_cred); 64 hostname = unc + 2;
144 if (!IS_ERR(rkey)) {
145 if (!(rkey->perm & KEY_USR_VIEW)) {
146 down_read(&rkey->sem);
147 rkey->perm |= KEY_USR_VIEW;
148 up_read(&rkey->sem);
149 }
150 len = rkey->type_data.x[0];
151 data = rkey->payload.data;
152 } else {
153 cERROR(1, "%s: unable to resolve: %s", __func__, name);
154 goto out;
155 }
156
157 skip_upcall:
158 if (data) {
159 *ip_addr = kmalloc(len + 1, GFP_KERNEL);
160 if (*ip_addr) {
161 memcpy(*ip_addr, data, len + 1);
162 if (!IS_ERR(rkey))
163 cFYI(1, "%s: resolved: %s to %s", __func__,
164 name,
165 *ip_addr
166 );
167 rc = 0;
168 } else {
169 rc = -ENOMEM;
170 }
171 if (!IS_ERR(rkey))
172 key_put(rkey);
173 }
174 65
175 out: 66 /* Search for server name delimiter */
176 kfree(name); 67 sep = memchr(hostname, '\\', len);
68 if (sep)
69 len = sep - unc;
70 else
71 cFYI(1, "%s: probably server name is whole unc: %s",
72 __func__, unc);
73
74 /* Try to interpret hostname as an IPv4 or IPv6 address */
75 rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
76 if (rc > 0)
77 goto name_is_IP_address;
78
79 /* Perform the upcall */
80 rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
81 if (rc < 0)
82 cERROR(1, "%s: unable to resolve: %*.*s",
83 __func__, len, len, hostname);
84 else
85 cFYI(1, "%s: resolved: %*.*s to %s",
86 __func__, len, len, hostname, *ip_addr);
177 return rc; 87 return rc;
178}
179 88
180 int __init cifs_init_dns_resolver(void) 89 name_is_IP_address:
181 { 90 name = kmalloc(len + 1, GFP_KERNEL);
182 struct cred *cred; 91 if (!name)
183 struct key *keyring;
184 int ret;
185
186 printk(KERN_NOTICE "Registering the %s key type\n",
187 key_type_dns_resolver.name);
188
189 /* create an override credential set with a special thread keyring in
190 * which DNS requests are cached
191 *
192 * this is used to prevent malicious redirections from being installed
193 * with add_key().
194 */
195 cred = prepare_kernel_cred(NULL);
196 if (!cred)
197 return -ENOMEM; 92 return -ENOMEM;
198 93 memcpy(name, hostname, len);
199 keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred, 94 name[len] = 0;
200 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 95 cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
201 KEY_USR_VIEW | KEY_USR_READ, 96 *ip_addr = name;
202 KEY_ALLOC_NOT_IN_QUOTA);
203 if (IS_ERR(keyring)) {
204 ret = PTR_ERR(keyring);
205 goto failed_put_cred;
206 }
207
208 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
209 if (ret < 0)
210 goto failed_put_key;
211
212 ret = register_key_type(&key_type_dns_resolver);
213 if (ret < 0)
214 goto failed_put_key;
215
216 /* instruct request_key() to use this special keyring as a cache for
217 * the results it looks up */
218 cred->thread_keyring = keyring;
219 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
220 dns_resolver_cache = cred;
221 return 0; 97 return 0;
222
223failed_put_key:
224 key_put(keyring);
225failed_put_cred:
226 put_cred(cred);
227 return ret;
228}
229
230 void cifs_exit_dns_resolver(void)
231 {
232 key_revoke(dns_resolver_cache->thread_keyring);
233 unregister_key_type(&key_type_dns_resolver);
234 put_cred(dns_resolver_cache);
235 printk(KERN_NOTICE "Unregistered %s key type\n",
236 key_type_dns_resolver.name);
237} 98}
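
The replacement resolver works in two steps: cifs_convert_address() first tries to parse the host portion of the UNC as a literal IPv4/IPv6 address, and only when that fails is the generic dns_query() upcall performed. In both cases the caller owns the returned string. A hedged usage sketch (the UNC is invented; the literal-IP path returns 0 while dns_query() returns the result length, so any non-negative rc means success):

    char *ip = NULL;
    int rc;

    rc = dns_resolve_server_name_to_ip("\\\\srv1\\share", &ip);
    if (rc < 0) {
            cERROR(1, "could not resolve: %d", rc);  /* e.g. no upcall configured */
    } else {
            cFYI(1, "srv1 resolved to %s", ip);      /* e.g. "192.168.1.5" */
            kfree(ip);                               /* caller frees the result */
    }
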
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 5d7f291df16..d3f5d27f4d0 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,6 @@
24#define _DNS_RESOLVE_H 24#define _DNS_RESOLVE_H
25 25
26#ifdef __KERNEL__ 26#ifdef __KERNEL__
27extern int __init cifs_init_dns_resolver(void);
28extern void cifs_exit_dns_resolver(void);
29extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); 27extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
30#endif /* KERNEL */ 28#endif /* KERNEL */
31 29
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 409e4f523e6..ae82159cf7f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -40,6 +40,7 @@
40#include "cifs_unicode.h" 40#include "cifs_unicode.h"
41#include "cifs_debug.h" 41#include "cifs_debug.h"
42#include "cifs_fs_sb.h" 42#include "cifs_fs_sb.h"
43#include "fscache.h"
43 44
44static inline int cifs_convert_flags(unsigned int flags) 45static inline int cifs_convert_flags(unsigned int flags)
45{ 46{
@@ -59,34 +60,32 @@ static inline int cifs_convert_flags(unsigned int flags)
59 FILE_READ_DATA); 60 FILE_READ_DATA);
60} 61}
61 62
62 static inline fmode_t cifs_posix_convert_flags(unsigned int flags) 63 static u32 cifs_posix_convert_flags(unsigned int flags)
63 { 64 {
64 fmode_t posix_flags = 0; 65 u32 posix_flags = 0;
65 66
66 if ((flags & O_ACCMODE) == O_RDONLY) 67 if ((flags & O_ACCMODE) == O_RDONLY)
67 posix_flags = FMODE_READ; 68 posix_flags = SMB_O_RDONLY;
68 else if ((flags & O_ACCMODE) == O_WRONLY) 69 else if ((flags & O_ACCMODE) == O_WRONLY)
69 posix_flags = FMODE_WRITE; 70 posix_flags = SMB_O_WRONLY;
70 else if ((flags & O_ACCMODE) == O_RDWR) { 71 else if ((flags & O_ACCMODE) == O_RDWR)
71 /* GENERIC_ALL is too much permission to request 72 posix_flags = SMB_O_RDWR;
72 can cause unnecessary access denied on create */ 73
73 /* return GENERIC_ALL; */ 74 if (flags & O_CREAT)
74 posix_flags = FMODE_READ | FMODE_WRITE; 75 posix_flags |= SMB_O_CREAT;
75 } 76 if (flags & O_EXCL)
76 /* can not map O_CREAT or O_EXCL or O_TRUNC flags when 77 posix_flags |= SMB_O_EXCL;
77 reopening a file. They had their effect on the original open */ 78 if (flags & O_TRUNC)
78 if (flags & O_APPEND) 79 posix_flags |= SMB_O_TRUNC;
79 posix_flags |= (fmode_t)O_APPEND; 80 /* be safe and imply O_SYNC for O_DSYNC */
80 if (flags & O_DSYNC) 81 if (flags & O_DSYNC)
81 posix_flags |= (fmode_t)O_DSYNC; 82 posix_flags |= SMB_O_SYNC;
82 if (flags & __O_SYNC)
83 posix_flags |= (fmode_t)__O_SYNC;
84 if (flags & O_DIRECTORY) 83 if (flags & O_DIRECTORY)
85 posix_flags |= (fmode_t)O_DIRECTORY; 84 posix_flags |= SMB_O_DIRECTORY;
86 if (flags & O_NOFOLLOW) 85 if (flags & O_NOFOLLOW)
87 posix_flags |= (fmode_t)O_NOFOLLOW; 86 posix_flags |= SMB_O_NOFOLLOW;
88 if (flags & O_DIRECT) 87 if (flags & O_DIRECT)
89 posix_flags |= (fmode_t)O_DIRECT; 88 posix_flags |= SMB_O_DIRECT;
90 89
91 return posix_flags; 90 return posix_flags;
92} 91}
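
Since the converted value is now a set of SMB_O_* wire bits rather than kernel fmode_t bits, creation-time flags can be passed straight through to the POSIX create call instead of being bolted on by the caller. For example (a sketch; the SMB_O_* constants are defined in the CIFS protocol headers):

    u32 pf;

    pf = cifs_posix_convert_flags(O_RDWR | O_CREAT | O_EXCL);
    /* pf == SMB_O_RDWR | SMB_O_CREAT | SMB_O_EXCL */

    pf = cifs_posix_convert_flags(O_WRONLY | O_DSYNC);
    /* pf == SMB_O_WRONLY | SMB_O_SYNC: O_DSYNC is widened to the
       stronger SMB_O_SYNC, the conservative choice noted above */
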
@@ -105,66 +104,8 @@ static inline int cifs_get_disposition(unsigned int flags)
105 return FILE_OPEN; 104 return FILE_OPEN;
106} 105}
107 106
108 /* all arguments to this function must be checked for validity in caller */
109 static inline int
110 cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
111 struct cifsInodeInfo *pCifsInode, __u32 oplock,
112 u16 netfid)
113{
114
115 write_lock(&GlobalSMBSeslock);
116
117 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
118 if (pCifsInode == NULL) {
119 write_unlock(&GlobalSMBSeslock);
120 return -EINVAL;
121 }
122
123 if (pCifsInode->clientCanCacheRead) {
124 /* we have the inode open somewhere else
125 no need to discard cache data */
126 goto psx_client_can_cache;
127 }
128
129 /* BB FIXME need to fix this check to move it earlier into posix_open
130 BB fIX following section BB FIXME */
131
132 /* if not oplocked, invalidate inode pages if mtime or file
133 size changed */
134/* temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
135 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
136 (file->f_path.dentry->d_inode->i_size ==
137 (loff_t)le64_to_cpu(buf->EndOfFile))) {
138 cFYI(1, "inode unchanged on server");
139 } else {
140 if (file->f_path.dentry->d_inode->i_mapping) {
141 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
142 if (rc != 0)
143 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
144 }
145 cFYI(1, "invalidating remote inode since open detected it "
146 "changed");
147 invalidate_remote_inode(file->f_path.dentry->d_inode);
148 } */
149
150psx_client_can_cache:
151 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
152 pCifsInode->clientCanCacheAll = true;
153 pCifsInode->clientCanCacheRead = true;
154 cFYI(1, "Exclusive Oplock granted on inode %p",
155 file->f_path.dentry->d_inode);
156 } else if ((oplock & 0xF) == OPLOCK_READ)
157 pCifsInode->clientCanCacheRead = true;
158
159 /* will have to change the unlock if we reenable the
160 filemap_fdatawrite (which does not seem necessary */
161 write_unlock(&GlobalSMBSeslock);
162 return 0;
163}
164
165 /* all arguments to this function must be checked for validity in caller */
166 static inline int cifs_open_inode_helper(struct inode *inode, 107 static inline int cifs_open_inode_helper(struct inode *inode,
167 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, 108 struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
168 char *full_path, int xid) 109 char *full_path, int xid)
169{ 110{
170 struct cifsInodeInfo *pCifsInode = CIFS_I(inode); 111 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
@@ -190,8 +131,7 @@ static inline int cifs_open_inode_helper(struct inode *inode,
190 /* BB no need to lock inode until after invalidate 131 /* BB no need to lock inode until after invalidate
191 since namei code should already have it locked? */ 132 since namei code should already have it locked? */
192 rc = filemap_write_and_wait(inode->i_mapping); 133 rc = filemap_write_and_wait(inode->i_mapping);
193 if (rc != 0) 134 mapping_set_error(inode->i_mapping, rc);
194 pCifsInode->write_behind_rc = rc;
195 } 135 }
196 cFYI(1, "invalidating remote inode since open detected it " 136 cFYI(1, "invalidating remote inode since open detected it "
197 "changed"); 137 "changed");
@@ -206,16 +146,176 @@ client_can_cache:
206 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, 146 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
207 xid, NULL); 147 xid, NULL);
208 148
209 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { 149 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
210 pCifsInode->clientCanCacheAll = true; 150 pCifsInode->clientCanCacheAll = true;
211 pCifsInode->clientCanCacheRead = true; 151 pCifsInode->clientCanCacheRead = true;
212 cFYI(1, "Exclusive Oplock granted on inode %p", inode); 152 cFYI(1, "Exclusive Oplock granted on inode %p", inode);
213 } else if ((*oplock & 0xF) == OPLOCK_READ) 153 } else if ((oplock & 0xF) == OPLOCK_READ)
214 pCifsInode->clientCanCacheRead = true; 154 pCifsInode->clientCanCacheRead = true;
215 155
216 return rc; 156 return rc;
217} 157}
218 158
159 int cifs_posix_open(char *full_path, struct inode **pinode,
160 struct super_block *sb, int mode, unsigned int f_flags,
161 __u32 *poplock, __u16 *pnetfid, int xid)
162{
163 int rc;
164 FILE_UNIX_BASIC_INFO *presp_data;
165 __u32 posix_flags = 0;
166 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
167 struct cifs_fattr fattr;
168 struct tcon_link *tlink;
169 struct cifsTconInfo *tcon;
170
171 cFYI(1, "posix open %s", full_path);
172
173 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
174 if (presp_data == NULL)
175 return -ENOMEM;
176
177 tlink = cifs_sb_tlink(cifs_sb);
178 if (IS_ERR(tlink)) {
179 rc = PTR_ERR(tlink);
180 goto posix_open_ret;
181 }
182
183 tcon = tlink_tcon(tlink);
184 mode &= ~current_umask();
185
186 posix_flags = cifs_posix_convert_flags(f_flags);
187 rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
188 poplock, full_path, cifs_sb->local_nls,
189 cifs_sb->mnt_cifs_flags &
190 CIFS_MOUNT_MAP_SPECIAL_CHR);
191 cifs_put_tlink(tlink);
192
193 if (rc)
194 goto posix_open_ret;
195
196 if (presp_data->Type == cpu_to_le32(-1))
197 goto posix_open_ret; /* open ok, caller does qpathinfo */
198
199 if (!pinode)
200 goto posix_open_ret; /* caller does not need info */
201
202 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
203
204 /* get new inode and set it up */
205 if (*pinode == NULL) {
206 cifs_fill_uniqueid(sb, &fattr);
207 *pinode = cifs_iget(sb, &fattr);
208 if (!*pinode) {
209 rc = -ENOMEM;
210 goto posix_open_ret;
211 }
212 } else {
213 cifs_fattr_to_inode(*pinode, &fattr);
214 }
215
216posix_open_ret:
217 kfree(presp_data);
218 return rc;
219}
220
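
Note the contract on pinode in cifs_posix_open() above: pass NULL and the open is handle-only; pass a pointer to a NULL inode and a fresh inode is instantiated from the returned FILE_UNIX_BASIC_INFO via cifs_iget(); pass an existing inode and its attributes are simply refreshed. A caller sketch in the style of cifs_open() below:

    __u32 oplock = 0;
    __u16 netfid;
    int rc;

    rc = cifs_posix_open(full_path, &inode, inode->i_sb,
                         cifs_sb->mnt_file_mode /* ignored unless creating */,
                         file->f_flags, &oplock, &netfid, xid);
    if (rc == 0) {
            /* netfid is the server file handle; the oplock bits say
               what may be cached locally */
    }
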
221 struct cifsFileInfo *
222 cifs_new_fileinfo(__u16 fileHandle, struct file *file,
223 struct tcon_link *tlink, __u32 oplock)
224{
225 struct dentry *dentry = file->f_path.dentry;
226 struct inode *inode = dentry->d_inode;
227 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
228 struct cifsFileInfo *pCifsFile;
229
230 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
231 if (pCifsFile == NULL)
232 return pCifsFile;
233
234 pCifsFile->count = 1;
235 pCifsFile->netfid = fileHandle;
236 pCifsFile->pid = current->tgid;
237 pCifsFile->uid = current_fsuid();
238 pCifsFile->dentry = dget(dentry);
239 pCifsFile->f_flags = file->f_flags;
240 pCifsFile->invalidHandle = false;
241 pCifsFile->tlink = cifs_get_tlink(tlink);
242 mutex_init(&pCifsFile->fh_mutex);
243 mutex_init(&pCifsFile->lock_mutex);
244 INIT_LIST_HEAD(&pCifsFile->llist);
245 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
246
247 spin_lock(&cifs_file_list_lock);
248 list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
249 /* if readable file instance put first in list*/
250 if (file->f_mode & FMODE_READ)
251 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
252 else
253 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
254 spin_unlock(&cifs_file_list_lock);
255
256 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
257 pCifsInode->clientCanCacheAll = true;
258 pCifsInode->clientCanCacheRead = true;
259 cFYI(1, "Exclusive Oplock inode %p", inode);
260 } else if ((oplock & 0xF) == OPLOCK_READ)
261 pCifsInode->clientCanCacheRead = true;
262
263 file->private_data = pCifsFile;
264 return pCifsFile;
265}
266
267 /*
268 * Release a reference on the file private data. This may involve closing
269 * the filehandle out on the server. Must be called without holding
270 * cifs_file_list_lock.
271 */
272 void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
273{
274 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
275 struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode);
276 struct cifsLockInfo *li, *tmp;
277
278 spin_lock(&cifs_file_list_lock);
279 if (--cifs_file->count > 0) {
280 spin_unlock(&cifs_file_list_lock);
281 return;
282 }
283
284 /* remove it from the lists */
285 list_del(&cifs_file->flist);
286 list_del(&cifs_file->tlist);
287
288 if (list_empty(&cifsi->openFileList)) {
289 cFYI(1, "closing last open instance for inode %p",
290 cifs_file->dentry->d_inode);
291 cifsi->clientCanCacheRead = false;
292 cifsi->clientCanCacheAll = false;
293 }
294 spin_unlock(&cifs_file_list_lock);
295
296 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
297 int xid, rc;
298
299 xid = GetXid();
300 rc = CIFSSMBClose(xid, tcon, cifs_file->netfid);
301 FreeXid(xid);
302 }
303
304 /* Delete any outstanding lock records. We'll lose them when the file
305 * is closed anyway.
306 */
307 mutex_lock(&cifs_file->lock_mutex);
308 list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
309 list_del(&li->llist);
310 kfree(li);
311 }
312 mutex_unlock(&cifs_file->lock_mutex);
313
314 cifs_put_tlink(cifs_file->tlink);
315 dput(cifs_file->dentry);
316 kfree(cifs_file);
317}
318
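
cifsFileInfo now carries a plain integer reference count protected by cifs_file_list_lock: cifs_new_fileinfo() returns with count == 1 (dropped at close), and any code that finds a handle on one of the lists must take its own reference before letting go of the lock, as find_readable_file() and find_writable_file() below do with cifsFileInfo_get(). The discipline, sketched with usable() and do_io() as stand-in names:

    struct cifsFileInfo *f;

    spin_lock(&cifs_file_list_lock);
    list_for_each_entry(f, &cinode->openFileList, flist) {
            if (!usable(f))                 /* hypothetical predicate */
                    continue;
            cifsFileInfo_get(f);            /* pin before dropping the lock */
            spin_unlock(&cifs_file_list_lock);
            do_io(f);                       /* may sleep, may send SMBs */
            cifsFileInfo_put(f);            /* may close the server handle */
            return;
    }
    spin_unlock(&cifs_file_list_lock);
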
219 int cifs_open(struct inode *inode, struct file *file) 319 int cifs_open(struct inode *inode, struct file *file)
220{ 320{
221 int rc = -EACCES; 321 int rc = -EACCES;
@@ -223,6 +323,7 @@ int cifs_open(struct inode *inode, struct file *file)
223 __u32 oplock; 323 __u32 oplock;
224 struct cifs_sb_info *cifs_sb; 324 struct cifs_sb_info *cifs_sb;
225 struct cifsTconInfo *tcon; 325 struct cifsTconInfo *tcon;
326 struct tcon_link *tlink;
226 struct cifsFileInfo *pCifsFile = NULL; 327 struct cifsFileInfo *pCifsFile = NULL;
227 struct cifsInodeInfo *pCifsInode; 328 struct cifsInodeInfo *pCifsInode;
228 char *full_path = NULL; 329 char *full_path = NULL;
@@ -234,15 +335,19 @@ int cifs_open(struct inode *inode, struct file *file)
234 xid = GetXid(); 335 xid = GetXid();
235 336
236 cifs_sb = CIFS_SB(inode->i_sb); 337 cifs_sb = CIFS_SB(inode->i_sb);
237 tcon = cifs_sb->tcon; 338 tlink = cifs_sb_tlink(cifs_sb);
339 if (IS_ERR(tlink)) {
340 FreeXid(xid);
341 return PTR_ERR(tlink);
342 }
343 tcon = tlink_tcon(tlink);
238 344
239 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 345 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
240 346
241 full_path = build_path_from_dentry(file->f_path.dentry); 347 full_path = build_path_from_dentry(file->f_path.dentry);
242 if (full_path == NULL) { 348 if (full_path == NULL) {
243 rc = -ENOMEM; 349 rc = -ENOMEM;
244 FreeXid(xid); 350 goto out;
245 return rc;
246 } 351 }
247 352
248 cFYI(1, "inode = 0x%p file flags are 0x%x for %s", 353 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
@@ -257,31 +362,22 @@ int cifs_open(struct inode *inode, struct file *file)
257 (tcon->ses->capabilities & CAP_UNIX) && 362 (tcon->ses->capabilities & CAP_UNIX) &&
258 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 363 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
259 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 364 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
260 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
261 oflags |= SMB_O_CREAT;
262 /* can not refresh inode info since size could be stale */ 365 /* can not refresh inode info since size could be stale */
263 rc = cifs_posix_open(full_path, &inode, inode->i_sb, 366 rc = cifs_posix_open(full_path, &inode, inode->i_sb,
264 cifs_sb->mnt_file_mode /* ignored */, 367 cifs_sb->mnt_file_mode /* ignored */,
265 oflags, &oplock, &netfid, xid); 368 file->f_flags, &oplock, &netfid, xid);
266 if (rc == 0) { 369 if (rc == 0) {
267 cFYI(1, "posix open succeeded"); 370 cFYI(1, "posix open succeeded");
268 /* no need for special case handling of setting mode
269 on read only files needed here */
270 371
271 rc = cifs_posix_open_inode_helper(inode, file, 372 pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
272 pCifsInode, oplock, netfid); 373 oplock);
273 if (rc != 0) {
274 CIFSSMBClose(xid, tcon, netfid);
275 goto out;
276 }
277
278 pCifsFile = cifs_new_fileinfo(inode, netfid, file,
279 file->f_path.mnt,
280 oflags);
281 if (pCifsFile == NULL) { 374 if (pCifsFile == NULL) {
282 CIFSSMBClose(xid, tcon, netfid); 375 CIFSSMBClose(xid, tcon, netfid);
283 rc = -ENOMEM; 376 rc = -ENOMEM;
284 } 377 }
378
379 cifs_fscache_set_inode_cookie(inode, file);
380
285 goto out; 381 goto out;
286 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 382 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
287 if (tcon->ses->serverNOS) 383 if (tcon->ses->serverNOS)
@@ -342,7 +438,7 @@ int cifs_open(struct inode *inode, struct file *file)
342 goto out; 438 goto out;
343 } 439 }
344 440
345 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 441 if (tcon->ses->capabilities & CAP_NT_SMBS)
346 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 442 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
347 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, 443 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
348 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags 444 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
@@ -362,17 +458,18 @@ int cifs_open(struct inode *inode, struct file *file)
362 goto out; 458 goto out;
363 } 459 }
364 460
365 rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid); 461 rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
366 if (rc != 0) 462 if (rc != 0)
367 goto out; 463 goto out;
368 464
369 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, 465 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
370 file->f_flags);
371 if (pCifsFile == NULL) { 466 if (pCifsFile == NULL) {
372 rc = -ENOMEM; 467 rc = -ENOMEM;
373 goto out; 468 goto out;
374 } 469 }
375 470
471 cifs_fscache_set_inode_cookie(inode, file);
472
376 if (oplock & CIFS_CREATE_ACTION) { 473 if (oplock & CIFS_CREATE_ACTION) {
377 /* time to set mode which we can not set earlier due to 474 /* time to set mode which we can not set earlier due to
378 problems creating new read-only files */ 475 problems creating new read-only files */
@@ -397,6 +494,7 @@ out:
397 kfree(buf); 494 kfree(buf);
398 kfree(full_path); 495 kfree(full_path);
399 FreeXid(xid); 496 FreeXid(xid);
497 cifs_put_tlink(tlink);
400 return rc; 498 return rc;
401} 499}
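
The tlink dance seen here recurs in every converted entry point: cifs_sb_tlink() looks up and references the tcon link for this superblock, tlink_tcon() borrows the tcon from it, and cifs_put_tlink() must balance the lookup on every exit path. Distilled:

    struct tcon_link *tlink;
    struct cifsTconInfo *tcon;

    tlink = cifs_sb_tlink(cifs_sb);  /* takes a reference; may be an ERR_PTR */
    if (IS_ERR(tlink))
            return PTR_ERR(tlink);
    tcon = tlink_tcon(tlink);        /* borrowed pointer, no extra reference */

    /* ... issue SMBs against tcon ... */

    cifs_put_tlink(tlink);           /* balances cifs_sb_tlink() */
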
402 500
@@ -411,14 +509,13 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
411 return rc; 509 return rc;
412} 510}
413 511
414 static int cifs_reopen_file(struct file *file, bool can_flush) 512 static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
415{ 513{
416 int rc = -EACCES; 514 int rc = -EACCES;
417 int xid; 515 int xid;
418 __u32 oplock; 516 __u32 oplock;
419 struct cifs_sb_info *cifs_sb; 517 struct cifs_sb_info *cifs_sb;
420 struct cifsTconInfo *tcon; 518 struct cifsTconInfo *tcon;
421 struct cifsFileInfo *pCifsFile;
422 struct cifsInodeInfo *pCifsInode; 519 struct cifsInodeInfo *pCifsInode;
423 struct inode *inode; 520 struct inode *inode;
424 char *full_path = NULL; 521 char *full_path = NULL;
@@ -426,11 +523,6 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
426 int disposition = FILE_OPEN; 523 int disposition = FILE_OPEN;
427 __u16 netfid; 524 __u16 netfid;
428 525
429 if (file->private_data)
430 pCifsFile = (struct cifsFileInfo *)file->private_data;
431 else
432 return -EBADF;
433
434 xid = GetXid(); 526 xid = GetXid();
435 mutex_lock(&pCifsFile->fh_mutex); 527 mutex_lock(&pCifsFile->fh_mutex);
436 if (!pCifsFile->invalidHandle) { 528 if (!pCifsFile->invalidHandle) {
@@ -440,39 +532,24 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
440 return rc; 532 return rc;
441 } 533 }
442 534
443 if (file->f_path.dentry == NULL) { 535 inode = pCifsFile->dentry->d_inode;
444 cERROR(1, "no valid name if dentry freed");
445 dump_stack();
446 rc = -EBADF;
447 goto reopen_error_exit;
448 }
449
450 inode = file->f_path.dentry->d_inode;
451 if (inode == NULL) {
452 cERROR(1, "inode not valid");
453 dump_stack();
454 rc = -EBADF;
455 goto reopen_error_exit;
456 }
457
458 cifs_sb = CIFS_SB(inode->i_sb); 536 cifs_sb = CIFS_SB(inode->i_sb);
459 tcon = cifs_sb->tcon; 537 tcon = tlink_tcon(pCifsFile->tlink);
460 538
461/* can not grab rename sem here because various ops, including 539/* can not grab rename sem here because various ops, including
462 those that already have the rename sem can end up causing writepage 540 those that already have the rename sem can end up causing writepage
463 to get called and if the server was down that means we end up here, 541 to get called and if the server was down that means we end up here,
464 and we can never tell if the caller already has the rename_sem */ 542 and we can never tell if the caller already has the rename_sem */
465 full_path = build_path_from_dentry(file->f_path.dentry); 543 full_path = build_path_from_dentry(pCifsFile->dentry);
466 if (full_path == NULL) { 544 if (full_path == NULL) {
467 rc = -ENOMEM; 545 rc = -ENOMEM;
468reopen_error_exit:
469 mutex_unlock(&pCifsFile->fh_mutex); 546 mutex_unlock(&pCifsFile->fh_mutex);
470 FreeXid(xid); 547 FreeXid(xid);
471 return rc; 548 return rc;
472 } 549 }
473 550
474 cFYI(1, "inode = 0x%p file flags 0x%x for %s", 551 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
475 inode, file->f_flags, full_path); 552 inode, pCifsFile->f_flags, full_path);
476 553
477 if (oplockEnabled) 554 if (oplockEnabled)
478 oplock = REQ_OPLOCK; 555 oplock = REQ_OPLOCK;
@@ -482,8 +559,14 @@ reopen_error_exit:
482 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 559 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
483 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 560 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
484 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 561 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
485 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 562
486 /* can not refresh inode info since size could be stale */ 563 /*
564 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
565 * original open. Must mask them off for a reopen.
566 */
567 unsigned int oflags = pCifsFile->f_flags &
568 ~(O_CREAT | O_EXCL | O_TRUNC);
569
487 rc = cifs_posix_open(full_path, NULL, inode->i_sb, 570 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
488 cifs_sb->mnt_file_mode /* ignored */, 571 cifs_sb->mnt_file_mode /* ignored */,
489 oflags, &oplock, &netfid, xid); 572 oflags, &oplock, &netfid, xid);
@@ -495,7 +578,7 @@ reopen_error_exit:
495 in the reconnect path it is important to retry hard */ 578 in the reconnect path it is important to retry hard */
496 } 579 }
497 580
498 desiredAccess = cifs_convert_flags(file->f_flags); 581 desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
499 582
500 /* Can not refresh inode by passing in file_info buf to be returned 583 /* Can not refresh inode by passing in file_info buf to be returned
501 by SMBOpen and then calling get_inode_info with returned buf 584 by SMBOpen and then calling get_inode_info with returned buf
@@ -511,49 +594,49 @@ reopen_error_exit:
511 mutex_unlock(&pCifsFile->fh_mutex); 594 mutex_unlock(&pCifsFile->fh_mutex);
512 cFYI(1, "cifs_open returned 0x%x", rc); 595 cFYI(1, "cifs_open returned 0x%x", rc);
513 cFYI(1, "oplock: %d", oplock); 596 cFYI(1, "oplock: %d", oplock);
514 } else { 597 goto reopen_error_exit;
598 }
599
515 reopen_success: 600 reopen_success:
516 pCifsFile->netfid = netfid; 601 pCifsFile->netfid = netfid;
517 pCifsFile->invalidHandle = false; 602 pCifsFile->invalidHandle = false;
518 mutex_unlock(&pCifsFile->fh_mutex); 603 mutex_unlock(&pCifsFile->fh_mutex);
519 pCifsInode = CIFS_I(inode); 604 pCifsInode = CIFS_I(inode);
520 if (pCifsInode) { 605
521 if (can_flush) { 606 if (can_flush) {
522 rc = filemap_write_and_wait(inode->i_mapping); 607 rc = filemap_write_and_wait(inode->i_mapping);
523 if (rc != 0) 608 mapping_set_error(inode->i_mapping, rc);
524 CIFS_I(inode)->write_behind_rc = rc; 609
525 /* temporarily disable caching while we 610 pCifsInode->clientCanCacheAll = false;
526 go to server to get inode info */ 611 pCifsInode->clientCanCacheRead = false;
527 pCifsInode->clientCanCacheAll = false; 612 if (tcon->unix_ext)
528 pCifsInode->clientCanCacheRead = false; 613 rc = cifs_get_inode_info_unix(&inode,
529 if (tcon->unix_ext) 614 full_path, inode->i_sb, xid);
530 rc = cifs_get_inode_info_unix(&inode, 615 else
531 full_path, inode->i_sb, xid); 616 rc = cifs_get_inode_info(&inode,
532 else 617 full_path, NULL, inode->i_sb,
533 rc = cifs_get_inode_info(&inode, 618 xid, NULL);
534 full_path, NULL, inode->i_sb, 619 } /* else we are writing out data to server already
535 xid, NULL); 620 and could deadlock if we tried to flush data, and
536 } /* else we are writing out data to server already 621 since we do not know if we have data that would
537 and could deadlock if we tried to flush data, and 622 invalidate the current end of file on the server
538 since we do not know if we have data that would 623 we can not go to the server to get the new inode
539 invalidate the current end of file on the server 624 info */
540 we can not go to the server to get the new inode 625 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
541 info */ 626 pCifsInode->clientCanCacheAll = true;
542 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 627 pCifsInode->clientCanCacheRead = true;
543 pCifsInode->clientCanCacheAll = true; 628 cFYI(1, "Exclusive Oplock granted on inode %p",
544 pCifsInode->clientCanCacheRead = true; 629 pCifsFile->dentry->d_inode);
545 cFYI(1, "Exclusive Oplock granted on inode %p", 630 } else if ((oplock & 0xF) == OPLOCK_READ) {
546 file->f_path.dentry->d_inode); 631 pCifsInode->clientCanCacheRead = true;
547 } else if ((oplock & 0xF) == OPLOCK_READ) { 632 pCifsInode->clientCanCacheAll = false;
548 pCifsInode->clientCanCacheRead = true; 633 } else {
549 pCifsInode->clientCanCacheAll = false; 634 pCifsInode->clientCanCacheRead = false;
550 } else { 635 pCifsInode->clientCanCacheAll = false;
551 pCifsInode->clientCanCacheRead = false;
552 pCifsInode->clientCanCacheAll = false;
553 }
554 cifs_relock_file(pCifsFile);
555 }
556 } 636 }
637 cifs_relock_file(pCifsFile);
638
639 reopen_error_exit:
557 kfree(full_path); 640 kfree(full_path);
558 FreeXid(xid); 641 FreeXid(xid);
559 return rc; 642 return rc;
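
The three-way ladder at the end of the reopen path is the same oplock-to-cache mapping used by cifs_new_fileinfo() above; only the low nibble of the oplock byte matters. As a sketch:

    switch (oplock & 0xF) {
    case OPLOCK_EXCLUSIVE:          /* may cache both reads and writeback */
            cinode->clientCanCacheAll = true;
            cinode->clientCanCacheRead = true;
            break;
    case OPLOCK_READ:               /* may cache reads only */
            cinode->clientCanCacheRead = true;
            cinode->clientCanCacheAll = false;
            break;
    default:                        /* no oplock: go to the server every time */
            cinode->clientCanCacheRead = false;
            cinode->clientCanCacheAll = false;
    }
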
@@ -561,88 +644,18 @@ reopen_success:
561 644
562 int cifs_close(struct inode *inode, struct file *file) 645 int cifs_close(struct inode *inode, struct file *file)
563{ 646{
564 int rc = 0; 647 cifsFileInfo_put(file->private_data);
565 int xid, timeout; 648 file->private_data = NULL;
566 struct cifs_sb_info *cifs_sb;
567 struct cifsTconInfo *pTcon;
568 struct cifsFileInfo *pSMBFile =
569 (struct cifsFileInfo *)file->private_data;
570
571 xid = GetXid();
572
573 cifs_sb = CIFS_SB(inode->i_sb);
574 pTcon = cifs_sb->tcon;
575 if (pSMBFile) {
576 struct cifsLockInfo *li, *tmp;
577 write_lock(&GlobalSMBSeslock);
578 pSMBFile->closePend = true;
579 if (pTcon) {
580 /* no sense reconnecting to close a file that is
581 already closed */
582 if (!pTcon->need_reconnect) {
583 write_unlock(&GlobalSMBSeslock);
584 timeout = 2;
585 while ((atomic_read(&pSMBFile->count) != 1)
586 && (timeout <= 2048)) {
587 /* Give write a better chance to get to
588 server ahead of the close. We do not
589 want to add a wait_q here as it would
590 increase the memory utilization as
591 the struct would be in each open file,
592 but this should give enough time to
593 clear the socket */
594 cFYI(DBG2, "close delay, write pending");
595 msleep(timeout);
596 timeout *= 4;
597 }
598 if (!pTcon->need_reconnect &&
599 !pSMBFile->invalidHandle)
600 rc = CIFSSMBClose(xid, pTcon,
601 pSMBFile->netfid);
602 } else
603 write_unlock(&GlobalSMBSeslock);
604 } else
605 write_unlock(&GlobalSMBSeslock);
606
607 /* Delete any outstanding lock records.
608 We'll lose them when the file is closed anyway. */
609 mutex_lock(&pSMBFile->lock_mutex);
610 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
611 list_del(&li->llist);
612 kfree(li);
613 }
614 mutex_unlock(&pSMBFile->lock_mutex);
615 649
616 write_lock(&GlobalSMBSeslock); 650 /* return code from the ->release op is always ignored */
617 list_del(&pSMBFile->flist); 651 return 0;
618 list_del(&pSMBFile->tlist);
619 write_unlock(&GlobalSMBSeslock);
620 cifsFileInfo_put(file->private_data);
621 file->private_data = NULL;
622 } else
623 rc = -EBADF;
624
625 read_lock(&GlobalSMBSeslock);
626 if (list_empty(&(CIFS_I(inode)->openFileList))) {
627 cFYI(1, "closing last open instance for inode %p", inode);
628 /* if the file is not open we do not know if we can cache info
629 on this inode, much less write behind and read ahead */
630 CIFS_I(inode)->clientCanCacheRead = false;
631 CIFS_I(inode)->clientCanCacheAll = false;
632 }
633 read_unlock(&GlobalSMBSeslock);
634 if ((rc == 0) && CIFS_I(inode)->write_behind_rc)
635 rc = CIFS_I(inode)->write_behind_rc;
636 FreeXid(xid);
637 return rc;
638} 652}
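
cifs_close() can shrink to a bare put because the VFS discards the return value of ->release; write errors are expected to reach userspace from ->flush (invoked by close(2)) or ->fsync instead. A sketch of the wiring, assuming the usual file_operations layout (the real cifs tables set many more methods than shown):

    const struct file_operations cifs_fops_sketch = {
            .release = cifs_close,  /* just drops the cifsFileInfo reference */
            .flush   = cifs_flush,  /* filemap_write_and_wait() result
                                       propagates to close(2) */
    };
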
639 653
640 int cifs_closedir(struct inode *inode, struct file *file) 654 int cifs_closedir(struct inode *inode, struct file *file)
641{ 655{
642 int rc = 0; 656 int rc = 0;
643 int xid; 657 int xid;
644 struct cifsFileInfo *pCFileStruct = 658 struct cifsFileInfo *pCFileStruct = file->private_data;
645 (struct cifsFileInfo *)file->private_data;
646 char *ptmp; 659 char *ptmp;
647 660
648 cFYI(1, "Closedir inode = 0x%p", inode); 661 cFYI(1, "Closedir inode = 0x%p", inode);
@@ -650,25 +663,21 @@ int cifs_closedir(struct inode *inode, struct file *file)
650 xid = GetXid(); 663 xid = GetXid();
651 664
652 if (pCFileStruct) { 665 if (pCFileStruct) {
653 struct cifsTconInfo *pTcon; 666 struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
654 struct cifs_sb_info *cifs_sb =
655 CIFS_SB(file->f_path.dentry->d_sb);
656
657 pTcon = cifs_sb->tcon;
658 667
659 cFYI(1, "Freeing private data in close dir"); 668 cFYI(1, "Freeing private data in close dir");
660 write_lock(&GlobalSMBSeslock); 669 spin_lock(&cifs_file_list_lock);
661 if (!pCFileStruct->srch_inf.endOfSearch && 670 if (!pCFileStruct->srch_inf.endOfSearch &&
662 !pCFileStruct->invalidHandle) { 671 !pCFileStruct->invalidHandle) {
663 pCFileStruct->invalidHandle = true; 672 pCFileStruct->invalidHandle = true;
664 write_unlock(&GlobalSMBSeslock); 673 spin_unlock(&cifs_file_list_lock);
665 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 674 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
666 cFYI(1, "Closing uncompleted readdir with rc %d", 675 cFYI(1, "Closing uncompleted readdir with rc %d",
667 rc); 676 rc);
668 /* not much we can do if it fails anyway, ignore rc */ 677 /* not much we can do if it fails anyway, ignore rc */
669 rc = 0; 678 rc = 0;
670 } else 679 } else
671 write_unlock(&GlobalSMBSeslock); 680 spin_unlock(&cifs_file_list_lock);
672 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 681 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
673 if (ptmp) { 682 if (ptmp) {
674 cFYI(1, "closedir free smb buf in srch struct"); 683 cFYI(1, "closedir free smb buf in srch struct");
@@ -678,6 +687,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
678 else 687 else
679 cifs_buf_release(ptmp); 688 cifs_buf_release(ptmp);
680 } 689 }
690 cifs_put_tlink(pCFileStruct->tlink);
681 kfree(file->private_data); 691 kfree(file->private_data);
682 file->private_data = NULL; 692 file->private_data = NULL;
683 } 693 }
@@ -764,7 +774,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
764 cFYI(1, "Unknown type of lock"); 774 cFYI(1, "Unknown type of lock");
765 775
766 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 776 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
767 tcon = cifs_sb->tcon; 777 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
768 778
769 if (file->private_data == NULL) { 779 if (file->private_data == NULL) {
770 rc = -EBADF; 780 rc = -EBADF;
@@ -863,8 +873,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
863 length, pfLock, 873 length, pfLock,
864 posix_lock_type, wait_flag); 874 posix_lock_type, wait_flag);
865 } else { 875 } else {
866 struct cifsFileInfo *fid = 876 struct cifsFileInfo *fid = file->private_data;
867 (struct cifsFileInfo *)file->private_data;
868 877
869 if (numLock) { 878 if (numLock) {
870 rc = CIFSSMBLock(xid, tcon, netfid, length, 879 rc = CIFSSMBLock(xid, tcon, netfid, length,
@@ -958,14 +967,14 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
958 967
959 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 968 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
960 969
961 pTcon = cifs_sb->tcon;
962
963 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size, 970 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
964 *poffset, file->f_path.dentry->d_name.name); */ 971 *poffset, file->f_path.dentry->d_name.name); */
965 972
966 if (file->private_data == NULL) 973 if (file->private_data == NULL)
967 return -EBADF; 974 return -EBADF;
968 open_file = (struct cifsFileInfo *) file->private_data; 975
976 open_file = file->private_data;
977 pTcon = tlink_tcon(open_file->tlink);
969 978
970 rc = generic_write_checks(file, poffset, &write_size, 0); 979 rc = generic_write_checks(file, poffset, &write_size, 0);
971 if (rc) 980 if (rc)
@@ -986,19 +995,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
986 we blocked so return what we managed to write */ 995 we blocked so return what we managed to write */
987 return total_written; 996 return total_written;
988 } 997 }
989 if (open_file->closePend) {
990 FreeXid(xid);
991 if (total_written)
992 return total_written;
993 else
994 return -EBADF;
995 }
996 if (open_file->invalidHandle) { 998 if (open_file->invalidHandle) {
997 /* we could deadlock if we called 999 /* we could deadlock if we called
998 filemap_fdatawait from here so tell 1000 filemap_fdatawait from here so tell
999 reopen_file not to flush data to server 1001 reopen_file not to flush data to server
1000 now */ 1002 now */
1001 rc = cifs_reopen_file(file, false); 1003 rc = cifs_reopen_file(open_file, false);
1002 if (rc != 0) 1004 if (rc != 0)
1003 break; 1005 break;
1004 } 1006 }
@@ -1046,8 +1048,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1046 return total_written; 1048 return total_written;
1047} 1049}
1048 1050
1049 static ssize_t cifs_write(struct file *file, const char *write_data, 1051 static ssize_t cifs_write(struct cifsFileInfo *open_file,
1050 size_t write_size, loff_t *poffset) 1052 const char *write_data, size_t write_size,
1053 loff_t *poffset)
1051{ 1054{
1052 int rc = 0; 1055 int rc = 0;
1053 unsigned int bytes_written = 0; 1056 unsigned int bytes_written = 0;
@@ -1055,19 +1058,15 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1055 struct cifs_sb_info *cifs_sb; 1058 struct cifs_sb_info *cifs_sb;
1056 struct cifsTconInfo *pTcon; 1059 struct cifsTconInfo *pTcon;
1057 int xid, long_op; 1060 int xid, long_op;
1058 struct cifsFileInfo *open_file; 1061 struct dentry *dentry = open_file->dentry;
1059 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 1062 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
1060 1063
1061 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1064 cifs_sb = CIFS_SB(dentry->d_sb);
1062
1063 pTcon = cifs_sb->tcon;
1064 1065
1065 cFYI(1, "write %zd bytes to offset %lld of %s", write_size, 1066 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1066 *poffset, file->f_path.dentry->d_name.name); 1067 *poffset, dentry->d_name.name);
1067 1068
1068 if (file->private_data == NULL) 1069 pTcon = tlink_tcon(open_file->tlink);
1069 return -EBADF;
1070 open_file = (struct cifsFileInfo *)file->private_data;
1071 1070
1072 xid = GetXid(); 1071 xid = GetXid();
1073 1072
@@ -1076,28 +1075,12 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1076 total_written += bytes_written) { 1075 total_written += bytes_written) {
1077 rc = -EAGAIN; 1076 rc = -EAGAIN;
1078 while (rc == -EAGAIN) { 1077 while (rc == -EAGAIN) {
1079 if (file->private_data == NULL) {
1080 /* file has been closed on us */
1081 FreeXid(xid);
1082 /* if we have gotten here we have written some data
1083 and blocked, and the file has been freed on us
1084 while we blocked so return what we managed to
1085 write */
1086 return total_written;
1087 }
1088 if (open_file->closePend) {
1089 FreeXid(xid);
1090 if (total_written)
1091 return total_written;
1092 else
1093 return -EBADF;
1094 }
1095 if (open_file->invalidHandle) { 1078 if (open_file->invalidHandle) {
1096 /* we could deadlock if we called 1079 /* we could deadlock if we called
1097 filemap_fdatawait from here so tell 1080 filemap_fdatawait from here so tell
1098 reopen_file not to flush data to 1081 reopen_file not to flush data to
1099 server now */ 1082 server now */
1100 rc = cifs_reopen_file(file, false); 1083 rc = cifs_reopen_file(open_file, false);
1101 if (rc != 0) 1084 if (rc != 0)
1102 break; 1085 break;
1103 } 1086 }
@@ -1144,43 +1127,41 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1144 1127
1145 cifs_stats_bytes_written(pTcon, total_written); 1128 cifs_stats_bytes_written(pTcon, total_written);
1146 1129
1147 /* since the write may have blocked check these pointers again */ 1130 if (total_written > 0) {
1148 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { 1131 spin_lock(&dentry->d_inode->i_lock);
1149/*BB We could make this contingent on superblock ATIME flag too */ 1132 if (*poffset > dentry->d_inode->i_size)
1150/* file->f_path.dentry->d_inode->i_ctime = 1133 i_size_write(dentry->d_inode, *poffset);
1151 file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;*/ 1134 spin_unlock(&dentry->d_inode->i_lock);
1152 if (total_written > 0) {
1153 spin_lock(&file->f_path.dentry->d_inode->i_lock);
1154 if (*poffset > file->f_path.dentry->d_inode->i_size)
1155 i_size_write(file->f_path.dentry->d_inode,
1156 *poffset);
1157 spin_unlock(&file->f_path.dentry->d_inode->i_lock);
1158 }
1159 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1160 } 1135 }
1136 mark_inode_dirty_sync(dentry->d_inode);
1161 FreeXid(xid); 1137 FreeXid(xid);
1162 return total_written; 1138 return total_written;
1163} 1139}
1164 1140
1165#ifdef CONFIG_CIFS_EXPERIMENTAL 1141#ifdef CONFIG_CIFS_EXPERIMENTAL
1166 struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) 1142 struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1143 bool fsuid_only)
1167{ 1144{
1168 struct cifsFileInfo *open_file = NULL; 1145 struct cifsFileInfo *open_file = NULL;
1146 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1147
1148 /* only filter by fsuid on multiuser mounts */
1149 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1150 fsuid_only = false;
1169 1151
1170 read_lock(&GlobalSMBSeslock); 1152 spin_lock(&cifs_file_list_lock);
1171 /* we could simply get the first_list_entry since write-only entries 1153 /* we could simply get the first_list_entry since write-only entries
1172 are always at the end of the list but since the first entry might 1154 are always at the end of the list but since the first entry might
1173 have a close pending, we go through the whole list */ 1155 have a close pending, we go through the whole list */
1174 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1156 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1175 if (open_file->closePend) 1157 if (fsuid_only && open_file->uid != current_fsuid())
1176 continue; 1158 continue;
1177 if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || 1159 if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
1178 (open_file->pfile->f_flags & O_RDONLY))) {
1179 if (!open_file->invalidHandle) { 1160 if (!open_file->invalidHandle) {
1180 /* found a good file */ 1161 /* found a good file */
1181 /* lock it so it will not be closed on us */ 1162 /* lock it so it will not be closed on us */
1182 cifsFileInfo_get(open_file); 1163 cifsFileInfo_get(open_file);
1183 read_unlock(&GlobalSMBSeslock); 1164 spin_unlock(&cifs_file_list_lock);
1184 return open_file; 1165 return open_file;
1185 } /* else might as well continue, and look for 1166 } /* else might as well continue, and look for
1186 another, or simply have the caller reopen it 1167 another, or simply have the caller reopen it
@@ -1188,14 +1169,16 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1188 } else /* write only file */ 1169 } else /* write only file */
1189 break; /* write only files are last so must be done */ 1170 break; /* write only files are last so must be done */
1190 } 1171 }
1191 read_unlock(&GlobalSMBSeslock); 1172 spin_unlock(&cifs_file_list_lock);
1192 return NULL; 1173 return NULL;
1193} 1174}
1194#endif 1175#endif
1195 1176
1196 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) 1177 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1178 bool fsuid_only)
1197{ 1179{
1198 struct cifsFileInfo *open_file; 1180 struct cifsFileInfo *open_file;
1181 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1199 bool any_available = false; 1182 bool any_available = false;
1200 int rc; 1183 int rc;
1201 1184
@@ -1209,53 +1192,39 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1209 return NULL; 1192 return NULL;
1210 } 1193 }
1211 1194
1212 read_lock(&GlobalSMBSeslock); 1195 /* only filter by fsuid on multiuser mounts */
1196 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1197 fsuid_only = false;
1198
1199 spin_lock(&cifs_file_list_lock);
1213 refind_writable: 1200 refind_writable:
1214 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1201 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1215 if (open_file->closePend || 1202 if (!any_available && open_file->pid != current->tgid)
1216 (!any_available && open_file->pid != current->tgid))
1217 continue; 1203 continue;
1218 1204 if (fsuid_only && open_file->uid != current_fsuid())
1219 if (open_file->pfile && 1205 continue;
1220 ((open_file->pfile->f_flags & O_RDWR) || 1206 if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
1221 (open_file->pfile->f_flags & O_WRONLY))) {
1222 cifsFileInfo_get(open_file); 1207 cifsFileInfo_get(open_file);
1223 1208
1224 if (!open_file->invalidHandle) { 1209 if (!open_file->invalidHandle) {
1225 /* found a good writable file */ 1210 /* found a good writable file */
1226 read_unlock(&GlobalSMBSeslock); 1211 spin_unlock(&cifs_file_list_lock);
1227 return open_file; 1212 return open_file;
1228 } 1213 }
1229 1214
1230 read_unlock(&GlobalSMBSeslock); 1215 spin_unlock(&cifs_file_list_lock);
1216
1231 /* Had to unlock since following call can block */ 1217 /* Had to unlock since following call can block */
1232 rc = cifs_reopen_file(open_file->pfile, false); 1218 rc = cifs_reopen_file(open_file, false);
1233 if (!rc) { 1219 if (!rc)
1234 if (!open_file->closePend) 1220 return open_file;
1235 return open_file;
1236 else { /* start over in case this was deleted */
1237 /* since the list could be modified */
1238 read_lock(&GlobalSMBSeslock);
1239 cifsFileInfo_put(open_file);
1240 goto refind_writable;
1241 }
1242 }
1243 1221
1244 /* if it fails, try another handle if possible - 1222 /* if it fails, try another handle if possible */
1245 (we can not do this if closePending since
1246 loop could be modified - in which case we
1247 have to start at the beginning of the list
1248 again. Note that it would be bad
1249 to hold up writepages here (rather than
1250 in caller) with continuous retries */
1251 cFYI(1, "wp failed on reopen file"); 1223 cFYI(1, "wp failed on reopen file");
1252 read_lock(&GlobalSMBSeslock);
1253 /* can not use this handle, no write
1254 pending on this one after all */
1255 cifsFileInfo_put(open_file); 1224 cifsFileInfo_put(open_file);
1256 1225
1257 if (open_file->closePend) /* list could have changed */ 1226 spin_lock(&cifs_file_list_lock);
1258 goto refind_writable; 1227
1259 /* else we simply continue to the next entry. Thus 1228 /* else we simply continue to the next entry. Thus
1260 we do not loop on reopen errors. If we 1229 we do not loop on reopen errors. If we
1261 can not reopen the file, for example if we 1230 can not reopen the file, for example if we
@@ -1270,7 +1239,7 @@ refind_writable:
1270 any_available = true; 1239 any_available = true;
1271 goto refind_writable; 1240 goto refind_writable;
1272 } 1241 }
1273 read_unlock(&GlobalSMBSeslock); 1242 spin_unlock(&cifs_file_list_lock);
1274 return NULL; 1243 return NULL;
1275} 1244}
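
find_writable_file() is now a two-pass search: the first pass only considers handles opened by the current tgid (and, on multiuser mounts, the current fsuid); once any_available is set, any writable handle will do. Reopen attempts happen with the list lock dropped, and a failed candidate is simply put back rather than retried. The control flow, reduced to a sketch:

    bool any_available = false;

    spin_lock(&cifs_file_list_lock);
    refind:
    list_for_each_entry(f, &cinode->openFileList, flist) {
            if (!any_available && f->pid != current->tgid)
                    continue;               /* pass 1: prefer our own handles */
            if (!(OPEN_FMODE(f->f_flags) & FMODE_WRITE))
                    continue;
            cifsFileInfo_get(f);
            spin_unlock(&cifs_file_list_lock);
            if (!f->invalidHandle || cifs_reopen_file(f, false) == 0)
                    return f;               /* caller drops the reference */
            cifsFileInfo_put(f);            /* reopen failed: try the next one */
            spin_lock(&cifs_file_list_lock);
    }
    if (!any_available) {
            any_available = true;           /* pass 2: anyone's handle will do */
            goto refind;
    }
    spin_unlock(&cifs_file_list_lock);
    return NULL;
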
1276 1245
@@ -1282,7 +1251,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1282 int rc = -EFAULT; 1251 int rc = -EFAULT;
1283 int bytes_written = 0; 1252 int bytes_written = 0;
1284 struct cifs_sb_info *cifs_sb; 1253 struct cifs_sb_info *cifs_sb;
1285 struct cifsTconInfo *pTcon;
1286 struct inode *inode; 1254 struct inode *inode;
1287 struct cifsFileInfo *open_file; 1255 struct cifsFileInfo *open_file;
1288 1256
@@ -1291,7 +1259,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1291 1259
1292 inode = page->mapping->host; 1260 inode = page->mapping->host;
1293 cifs_sb = CIFS_SB(inode->i_sb); 1261 cifs_sb = CIFS_SB(inode->i_sb);
1294 pTcon = cifs_sb->tcon;
1295 1262
1296 offset += (loff_t)from; 1263 offset += (loff_t)from;
1297 write_data = kmap(page); 1264 write_data = kmap(page);
@@ -1312,10 +1279,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1312 if (mapping->host->i_size - offset < (loff_t)to) 1279 if (mapping->host->i_size - offset < (loff_t)to)
1313 to = (unsigned)(mapping->host->i_size - offset); 1280 to = (unsigned)(mapping->host->i_size - offset);
1314 1281
1315 open_file = find_writable_file(CIFS_I(mapping->host)); 1282 open_file = find_writable_file(CIFS_I(mapping->host), false);
1316 if (open_file) { 1283 if (open_file) {
1317 bytes_written = cifs_write(open_file->pfile, write_data, 1284 bytes_written = cifs_write(open_file, write_data,
1318 to-from, &offset); 1285 to - from, &offset);
1319 cifsFileInfo_put(open_file); 1286 cifsFileInfo_put(open_file);
1320 /* Does mm or vfs already set times? */ 1287 /* Does mm or vfs already set times? */
1321 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); 1288 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1335,7 +1302,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1335static int cifs_writepages(struct address_space *mapping, 1302static int cifs_writepages(struct address_space *mapping,
1336 struct writeback_control *wbc) 1303 struct writeback_control *wbc)
1337{ 1304{
1338 struct backing_dev_info *bdi = mapping->backing_dev_info;
1339 unsigned int bytes_to_write; 1305 unsigned int bytes_to_write;
1340 unsigned int bytes_written; 1306 unsigned int bytes_written;
1341 struct cifs_sb_info *cifs_sb; 1307 struct cifs_sb_info *cifs_sb;
@@ -1350,6 +1316,7 @@ static int cifs_writepages(struct address_space *mapping,
1350 int nr_pages; 1316 int nr_pages;
1351 __u64 offset = 0; 1317 __u64 offset = 0;
1352 struct cifsFileInfo *open_file; 1318 struct cifsFileInfo *open_file;
1319 struct cifsTconInfo *tcon;
1353 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host); 1320 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
1354 struct page *page; 1321 struct page *page;
1355 struct pagevec pvec; 1322 struct pagevec pvec;
@@ -1366,26 +1333,29 @@ static int cifs_writepages(struct address_space *mapping,
1366 if (cifs_sb->wsize < PAGE_CACHE_SIZE) 1333 if (cifs_sb->wsize < PAGE_CACHE_SIZE)
1367 return generic_writepages(mapping, wbc); 1334 return generic_writepages(mapping, wbc);
1368 1335
1369 if ((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
1370 if (cifs_sb->tcon->ses->server->secMode &
1371 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
1372 if (!experimEnabled)
1373 return generic_writepages(mapping, wbc);
1374
1375 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL); 1336 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
1376 if (iov == NULL) 1337 if (iov == NULL)
1377 return generic_writepages(mapping, wbc); 1338 return generic_writepages(mapping, wbc);
1378 1339
1379
1380 /* 1340 /*
1381 * BB: Is this meaningful for a non-block-device file system? 1341 * if there's no open file, then this is likely to fail too,
1382 * If it is, we should test it again after we do I/O 1342 * but it'll at least handle the return. Maybe it should be
1343 * a BUG() instead?
1383 */ 1344 */
1384 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1345 open_file = find_writable_file(CIFS_I(mapping->host), false);
1385 wbc->encountered_congestion = 1; 1346 if (!open_file) {
1386 kfree(iov); 1347 kfree(iov);
1387 return 0; 1348 return generic_writepages(mapping, wbc);
1349 }
1350
1351 tcon = tlink_tcon(open_file->tlink);
1352 if (!experimEnabled && tcon->ses->server->secMode &
1353 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1354 cifsFileInfo_put(open_file);
1355 kfree(iov);
1356 return generic_writepages(mapping, wbc);
1388 } 1357 }
1358 cifsFileInfo_put(open_file);
1389 1359
1390 xid = GetXid(); 1360 xid = GetXid();
1391 1361
@@ -1490,38 +1460,29 @@ retry:
1490 break; 1460 break;
1491 } 1461 }
1492 if (n_iov) { 1462 if (n_iov) {
1493 /* Search for a writable handle every time we call 1463 open_file = find_writable_file(CIFS_I(mapping->host),
1494 * CIFSSMBWrite2. We can't rely on the last handle 1464 false);
1495 * we used to still be valid
1496 */
1497 open_file = find_writable_file(CIFS_I(mapping->host));
1498 if (!open_file) { 1465 if (!open_file) {
1499 cERROR(1, "No writable handles for inode"); 1466 cERROR(1, "No writable handles for inode");
1500 rc = -EBADF; 1467 rc = -EBADF;
1501 } else { 1468 } else {
1502 long_op = cifs_write_timeout(cifsi, offset); 1469 long_op = cifs_write_timeout(cifsi, offset);
1503 rc = CIFSSMBWrite2(xid, cifs_sb->tcon, 1470 rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
1504 open_file->netfid,
1505 bytes_to_write, offset, 1471 bytes_to_write, offset,
1506 &bytes_written, iov, n_iov, 1472 &bytes_written, iov, n_iov,
1507 long_op); 1473 long_op);
1508 cifsFileInfo_put(open_file); 1474 cifsFileInfo_put(open_file);
1509 cifs_update_eof(cifsi, offset, bytes_written); 1475 cifs_update_eof(cifsi, offset, bytes_written);
1476 }
1510 1477
1511 if (rc || bytes_written < bytes_to_write) { 1478 if (rc || bytes_written < bytes_to_write) {
1512 cERROR(1, "Write2 ret %d, wrote %d", 1479 cERROR(1, "Write2 ret %d, wrote %d",
1513 rc, bytes_written); 1480 rc, bytes_written);
1514 /* BB what if continued retry is 1481 mapping_set_error(mapping, rc);
1515 requested via mount flags? */ 1482 } else {
1516 if (rc == -ENOSPC) 1483 cifs_stats_bytes_written(tcon, bytes_written);
1517 set_bit(AS_ENOSPC, &mapping->flags);
1518 else
1519 set_bit(AS_EIO, &mapping->flags);
1520 } else {
1521 cifs_stats_bytes_written(cifs_sb->tcon,
1522 bytes_written);
1523 }
1524 } 1484 }
1485
1525 for (i = 0; i < n_iov; i++) { 1486 for (i = 0; i < n_iov; i++) {
1526 page = pvec.pages[first + i]; 1487 page = pvec.pages[first + i];
1527 /* Should we also set page error on 1488 /* Should we also set page error on
@@ -1622,7 +1583,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	/* BB check if anything else missing out of ppw
 	   such as updating last write time */
 	page_data = kmap(page);
-	rc = cifs_write(file, page_data + offset, copied, &pos);
+	rc = cifs_write(file->private_data, page_data + offset,
+			copied, &pos);
 	/* if (rc < 0) should we set writebehind rc? */
 	kunmap(page);
 
@@ -1651,8 +1613,7 @@ int cifs_fsync(struct file *file, int datasync)
 	int xid;
 	int rc = 0;
 	struct cifsTconInfo *tcon;
-	struct cifsFileInfo *smbfile =
-		(struct cifsFileInfo *)file->private_data;
+	struct cifsFileInfo *smbfile = file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	xid = GetXid();
@@ -1662,11 +1623,10 @@ int cifs_fsync(struct file *file, int datasync)
 
 	rc = filemap_write_and_wait(inode->i_mapping);
 	if (rc == 0) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
-		tcon = CIFS_SB(inode->i_sb)->tcon;
-		if (!rc && tcon && smbfile &&
-		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+		tcon = tlink_tcon(smbfile->tlink);
+		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
 			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 	}
 
@@ -1711,21 +1671,8 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int rc = 0;
 
-	/* Rather than do the steps manually:
-	   lock the inode for writing
-	   loop through pages looking for write behind data (dirty pages)
-	   coalesce into contiguous 16K (or smaller) chunks to write to server
-	   send to server (prefer in parallel)
-	   deal with writebehind errors
-	   unlock inode for writing
-	   filemapfdatawrite appears easier for the time being */
-
-	rc = filemap_fdatawrite(inode->i_mapping);
-	/* reset wb rc if we were able to write out dirty pages */
-	if (!rc) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
-	}
+	if (file->f_mode & FMODE_WRITE)
+		rc = filemap_write_and_wait(inode->i_mapping);
 
 	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
 
@@ -1749,14 +1696,14 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
 		FreeXid(xid);
 		return rc;
 	}
-	open_file = (struct cifsFileInfo *)file->private_data;
+	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1770,9 +1717,8 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 	smb_read_data = NULL;
 	while (rc == -EAGAIN) {
 		int buf_type = CIFS_NO_BUFFER;
-		if ((open_file->invalidHandle) &&
-		    (!open_file->closePend)) {
-			rc = cifs_reopen_file(file, true);
+		if (open_file->invalidHandle) {
+			rc = cifs_reopen_file(open_file, true);
 			if (rc != 0)
 				break;
 		}
@@ -1830,14 +1776,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
 		FreeXid(xid);
 		return rc;
 	}
-	open_file = (struct cifsFileInfo *)file->private_data;
+	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1856,9 +1802,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	}
 	rc = -EAGAIN;
 	while (rc == -EAGAIN) {
-		if ((open_file->invalidHandle) &&
-		    (!open_file->closePend)) {
-			rc = cifs_reopen_file(file, true);
+		if (open_file->invalidHandle) {
+			rc = cifs_reopen_file(open_file, true);
 			if (rc != 0)
 				break;
 		}
@@ -1942,6 +1887,9 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		SetPageUptodate(page);
 		unlock_page(page);
 		data += PAGE_CACHE_SIZE;
+
+		/* add page to FS-Cache */
+		cifs_readpage_to_fscache(mapping->host, page);
 	}
 	return;
 }
@@ -1968,9 +1916,18 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		FreeXid(xid);
 		return rc;
 	}
-	open_file = (struct cifsFileInfo *)file->private_data;
+	open_file = file->private_data;
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = tlink_tcon(open_file->tlink);
+
+	/*
+	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
+	 * immediately if the cookie is negative
+	 */
+	rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
+					 &num_pages);
+	if (rc == 0)
+		goto read_complete;
 
 	cFYI(DBG2, "rpages: num pages %d", num_pages);
 	for (i = 0; i < num_pages; ) {
@@ -2009,9 +1966,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 				read_size, contig_pages);
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -2082,6 +2038,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		smb_read_data = NULL;
 	}
 
+read_complete:
 	FreeXid(xid);
 	return rc;
 }
@@ -2092,6 +2049,11 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	char *read_data;
 	int rc;
 
+	/* Is the page cached? */
+	rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
+	if (rc == 0)
+		goto read_complete;
+
 	page_cache_get(page);
 	read_data = kmap(page);
 	/* for reads over a certain size could initiate async read ahead */
@@ -2111,11 +2073,17 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 
 	flush_dcache_page(page);
 	SetPageUptodate(page);
+
+	/* send this page to the cache */
+	cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
+
 	rc = 0;
 
io_error:
 	kunmap(page);
 	page_cache_release(page);
+
+read_complete:
 	return rc;
 }
 
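The two readpage hunks above establish the cache-aside pattern used throughout this series: consult FS-Cache before issuing a network read, and feed pages fetched from the server back into the cache. In outline (hypothetical helper name and simplified signature; the real code operates on struct file and handles kmap and page locking):

	static int readpage_cache_aside(struct inode *inode, struct page *page)
	{
		/* 0 = read submitted from the cache backend; the page is
		 * unlocked later by the completion callback */
		if (cifs_readpage_from_fscache(inode, page) == 0)
			return 0;

		/* -ENOBUFS/-ENODATA: miss -- read from the server instead */
		/* ... network read fills and marks the page up to date ... */

		/* make the page available to future cache lookups */
		cifs_readpage_to_fscache(inode, page);
		return 0;
	}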
@@ -2148,18 +2116,14 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
 {
 	struct cifsFileInfo *open_file;
 
-	read_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
-			continue;
-		if (open_file->pfile &&
-		    ((open_file->pfile->f_flags & O_RDWR) ||
-		     (open_file->pfile->f_flags & O_WRONLY))) {
-			read_unlock(&GlobalSMBSeslock);
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
+			spin_unlock(&cifs_file_list_lock);
 			return 1;
 		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	return 0;
 }
 
@@ -2265,15 +2229,29 @@ out:
 	return rc;
 }
 
-static void
-cifs_oplock_break(struct slow_work *work)
+static int cifs_release_page(struct page *page, gfp_t gfp)
+{
+	if (PagePrivate(page))
+		return 0;
+
+	return cifs_fscache_release_page(page, gfp);
+}
+
+static void cifs_invalidate_page(struct page *page, unsigned long offset)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
+
+	if (offset == 0)
+		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
+}
+
+void cifs_oplock_break(struct work_struct *work)
 {
 	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
 						  oplock_break);
-	struct inode *inode = cfile->pInode;
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
-	int rc, waitrc = 0;
+	int rc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
 		if (cinode->clientCanCacheRead)
@@ -2282,13 +2260,10 @@ cifs_oplock_break(struct slow_work *work)
 			break_lease(inode, O_WRONLY);
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
-			waitrc = filemap_fdatawait(inode->i_mapping);
+			rc = filemap_fdatawait(inode->i_mapping);
+			mapping_set_error(inode->i_mapping, rc);
 			invalidate_remote_inode(inode);
 		}
-		if (!rc)
-			rc = waitrc;
-		if (rc)
-			cinode->write_behind_rc = rc;
 		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
 	}
 
@@ -2298,38 +2273,36 @@ cifs_oplock_break(struct slow_work *work)
 	 * not bother sending an oplock release if session to server still is
 	 * disconnected since oplock already released by the server
 	 */
-	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
-		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
-				 LOCKING_ANDX_OPLOCK_RELEASE, false);
+	if (!cfile->oplock_break_cancelled) {
+		rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
+				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
+
+	/*
+	 * We might have kicked in before is_valid_oplock_break()
+	 * finished grabbing reference for us. Make sure it's done by
+	 * waiting for cifs_file_list_lock.
+	 */
+	spin_lock(&cifs_file_list_lock);
+	spin_unlock(&cifs_file_list_lock);
+
+	cifs_oplock_break_put(cfile);
 }
 
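The empty lock/unlock pair above is a lock-as-barrier idiom: acquiring cifs_file_list_lock cannot succeed until is_valid_oplock_break() has dropped it, so once the work item gets past this point the reference taken on its behalf is guaranteed to be in place. The shape of the idiom, reduced to a hypothetical producer/consumer pair (list_lock and oplock_wq are illustrative names, not part of this patch):

	/* producer: publishes a reference under the lock */
	spin_lock(&list_lock);
	cifs_oplock_break_get(cfile);		/* ref for the queued work */
	queue_work(oplock_wq, &cfile->oplock_break);
	spin_unlock(&list_lock);

	/* consumer (work function): if it ran early, wait for the unlock */
	spin_lock(&list_lock);
	spin_unlock(&list_lock);		/* empty section acts as a barrier */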
-static int
-cifs_oplock_break_get(struct slow_work *work)
+/* must be called while holding cifs_file_list_lock */
+void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 {
-	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
-						  oplock_break);
-	mntget(cfile->mnt);
+	cifs_sb_active(cfile->dentry->d_sb);
 	cifsFileInfo_get(cfile);
-	return 0;
 }
 
-static void
-cifs_oplock_break_put(struct slow_work *work)
+void cifs_oplock_break_put(struct cifsFileInfo *cfile)
 {
-	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
-						  oplock_break);
-	mntput(cfile->mnt);
 	cifsFileInfo_put(cfile);
+	cifs_sb_deactive(cfile->dentry->d_sb);
 }
 
-const struct slow_work_ops cifs_oplock_break_ops = {
-	.get_ref = cifs_oplock_break_get,
-	.put_ref = cifs_oplock_break_put,
-	.execute = cifs_oplock_break,
-};
-
 const struct address_space_operations cifs_addr_ops = {
 	.readpage = cifs_readpage,
 	.readpages = cifs_readpages,
@@ -2338,6 +2311,8 @@ const struct address_space_operations cifs_addr_ops = {
 	.write_begin = cifs_write_begin,
 	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
+	.releasepage = cifs_release_page,
+	.invalidatepage = cifs_invalidate_page,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
 };
@@ -2354,6 +2329,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.write_begin = cifs_write_begin,
 	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
+	.releasepage = cifs_release_page,
+	.invalidatepage = cifs_invalidate_page,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
 };
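This is the last of the fs/cifs/file.c hunks; the same conversion continues in fs/cifs/inode.c below. The recurring shape: where code used to dereference cifs_sb->tcon directly, it now takes a counted reference on a tree-connection link, borrows the tcon, and drops the reference when the SMB calls are done. A distilled sketch of the pattern as used in this series:

	struct tcon_link *tlink;
	struct cifsTconInfo *tcon;

	tlink = cifs_sb_tlink(cifs_sb);		/* take a tlink reference */
	if (IS_ERR(tlink))
		return PTR_ERR(tlink);
	tcon = tlink_tcon(tlink);		/* borrow the tcon it wraps */

	/* ... issue SMB requests against tcon ... */

	cifs_put_tlink(tlink);			/* drop the reference */

Paths that already hold an open file skip the lookup and use tlink_tcon(open_file->tlink) instead, since the open file pins its own tlink.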
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
new file mode 100644
index 00000000000..a2ad94efcfe
--- /dev/null
+++ b/fs/cifs/fscache.c
@@ -0,0 +1,237 @@
+/*
+ *   fs/cifs/fscache.c - CIFS filesystem cache interface
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Author(s): Suresh Jayaraman <sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "fscache.h"
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+
+void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
+{
+	server->fscache =
+		fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
+				&cifs_fscache_server_index_def, server);
+	cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server,
+				server->fscache);
+}
+
+void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
+{
+	cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server,
+				server->fscache);
+	fscache_relinquish_cookie(server->fscache, 0);
+	server->fscache = NULL;
+}
+
+void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
+{
+	struct TCP_Server_Info *server = tcon->ses->server;
+
+	tcon->fscache =
+		fscache_acquire_cookie(server->fscache,
+				&cifs_fscache_super_index_def, tcon);
+	cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)",
+				server->fscache, tcon->fscache);
+}
+
+void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon)
+{
+	cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache);
+	fscache_relinquish_cookie(tcon->fscache, 0);
+	tcon->fscache = NULL;
+}
+
+static void cifs_fscache_enable_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+
+	if (cifsi->fscache)
+		return;
+
+	cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
+				&cifs_fscache_inode_object_def, cifsi);
+	cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
+				cifsi->fscache);
+}
+
+void cifs_fscache_release_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+	if (cifsi->fscache) {
+		cFYI(1, "CIFS releasing inode cookie (0x%p)",
+				cifsi->fscache);
+		fscache_relinquish_cookie(cifsi->fscache, 0);
+		cifsi->fscache = NULL;
+	}
+}
+
+static void cifs_fscache_disable_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+	if (cifsi->fscache) {
+		cFYI(1, "CIFS disabling inode cookie (0x%p)",
+				cifsi->fscache);
+		fscache_relinquish_cookie(cifsi->fscache, 1);
+		cifsi->fscache = NULL;
+	}
+}
+
+void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
+{
+	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+		cifs_fscache_disable_inode_cookie(inode);
+	else {
+		cifs_fscache_enable_inode_cookie(inode);
+		cFYI(1, "CIFS: fscache inode cookie set");
+	}
+}
+
+void cifs_fscache_reset_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct fscache_cookie *old = cifsi->fscache;
+
+	if (cifsi->fscache) {
+		/* retire the current fscache cache and get a new one */
+		fscache_relinquish_cookie(cifsi->fscache, 1);
+
+		cifsi->fscache = fscache_acquire_cookie(
+					cifs_sb_master_tcon(cifs_sb)->fscache,
+					&cifs_fscache_inode_object_def,
+					cifsi);
+		cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
+				cifsi->fscache, old);
+	}
+}
+
+int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	if (PageFsCache(page)) {
+		struct inode *inode = page->mapping->host;
+		struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+		cFYI(1, "CIFS: fscache release page (0x%p/0x%p)",
+				page, cifsi->fscache);
+		if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
+			return 0;
+	}
+
+	return 1;
+}
+
+static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
+						int error)
+{
+	cFYI(1, "CIFS: readpage_from_fscache_complete (0x%p/%d)",
+			page, error);
+	if (!error)
+		SetPageUptodate(page);
+	unlock_page(page);
+}
+
+/*
+ * Retrieve a page from FS-Cache
+ */
+int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+
+	cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p)",
+			CIFS_I(inode)->fscache, page, inode);
+	ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
+					 cifs_readpage_from_fscache_complete,
+					 NULL,
+					 GFP_KERNEL);
+	switch (ret) {
+
+	case 0: /* page found in fscache, read submitted */
+		cFYI(1, "CIFS: readpage_from_fscache: submitted");
+		return ret;
+	case -ENOBUFS:	/* page won't be cached */
+	case -ENODATA:	/* page not in cache */
+		cFYI(1, "CIFS: readpage_from_fscache %d", ret);
+		return 1;
+
+	default:
+		cERROR(1, "unknown error ret = %d", ret);
+	}
+	return ret;
+}
+
+/*
+ * Retrieve a set of pages from FS-Cache
+ */
+int __cifs_readpages_from_fscache(struct inode *inode,
+				struct address_space *mapping,
+				struct list_head *pages,
+				unsigned *nr_pages)
+{
+	int ret;
+
+	cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)",
+			CIFS_I(inode)->fscache, *nr_pages, inode);
+	ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
+					  pages, nr_pages,
+					  cifs_readpage_from_fscache_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+	switch (ret) {
+	case 0:	/* read submitted to the cache for all pages */
+		cFYI(1, "CIFS: readpages_from_fscache: submitted");
+		return ret;
+
+	case -ENOBUFS:	/* some pages are not cached and can't be */
+	case -ENODATA:	/* some pages are not cached */
+		cFYI(1, "CIFS: readpages_from_fscache: no page");
+		return 1;
+
+	default:
+		cFYI(1, "unknown error ret = %d", ret);
+	}
+
+	return ret;
+}
+
+void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+
+	cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p)",
+			CIFS_I(inode)->fscache, page, inode);
+	ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
+	if (ret != 0)
+		fscache_uncache_page(CIFS_I(inode)->fscache, page);
+}
+
+void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct fscache_cookie *cookie = cifsi->fscache;
+
+	cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie);
+	fscache_wait_on_page_write(cookie, page);
+	fscache_uncache_page(cookie, page);
+}
+
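fs/cifs/fscache.c above builds a cookie hierarchy mirroring the protocol objects: the cifs netfs index holds one cookie per TCP server, each server cookie indexes per-share (tcon) cookies, and each share cookie indexes per-inode data cookies. Acquisition therefore follows connection setup; roughly (a sketch, error handling and locking elided):

	cifs_fscache_get_client_cookie(server);		/* at TCP session setup */
	cifs_fscache_get_super_cookie(tcon);		/* at tree connect */
	cifs_fscache_set_inode_cookie(inode, filp);	/* at open(), read-only opens only */

Note that cifs_fscache_set_inode_cookie() deliberately retires the cookie for any open that permits writes, so only read-only traffic is cached.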
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
new file mode 100644
index 00000000000..31b88ec2341
--- /dev/null
+++ b/fs/cifs/fscache.h
@@ -0,0 +1,136 @@
+/*
+ *   fs/cifs/fscache.h - CIFS filesystem cache interface definitions
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Author(s): Suresh Jayaraman <sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _CIFS_FSCACHE_H
+#define _CIFS_FSCACHE_H
+
+#include <linux/fscache.h>
+
+#include "cifsglob.h"
+
+#ifdef CONFIG_CIFS_FSCACHE
+
+extern struct fscache_netfs cifs_fscache_netfs;
+extern const struct fscache_cookie_def cifs_fscache_server_index_def;
+extern const struct fscache_cookie_def cifs_fscache_super_index_def;
+extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
+
+extern int cifs_fscache_register(void);
+extern void cifs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *);
+extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *);
+
+extern void cifs_fscache_release_inode_cookie(struct inode *);
+extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
+extern void cifs_fscache_reset_inode_cookie(struct inode *);
+
+extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
+extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
+extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
+extern int __cifs_readpages_from_fscache(struct inode *,
+					 struct address_space *,
+					 struct list_head *,
+					 unsigned *);
+
+extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
+
+static inline void cifs_fscache_invalidate_page(struct page *page,
+						struct inode *inode)
+{
+	if (PageFsCache(page))
+		__cifs_fscache_invalidate_page(page, inode);
+}
+
+static inline int cifs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	if (CIFS_I(inode)->fscache)
+		return __cifs_readpage_from_fscache(inode, page);
+
+	return -ENOBUFS;
+}
+
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	if (CIFS_I(inode)->fscache)
+		return __cifs_readpages_from_fscache(inode, mapping, pages,
+						     nr_pages);
+	return -ENOBUFS;
+}
+
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{
+	if (PageFsCache(page))
+		__cifs_readpage_to_fscache(inode, page);
+}
+
+#else /* CONFIG_CIFS_FSCACHE */
+static inline int cifs_fscache_register(void) { return 0; }
+static inline void cifs_fscache_unregister(void) {}
+
+static inline void
+cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
+static inline void
+cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
+static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {}
+static inline void
+cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {}
+
+static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
+						 struct file *filp) {}
+static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
+static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	return 1; /* May release page */
+}
+
+static inline void cifs_fscache_invalidate_page(struct page *page,
+						struct inode *inode) {}
+static inline int
+cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	return -ENOBUFS;
+}
+
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+					    struct page *page) {}
+
+#endif /* CONFIG_CIFS_FSCACHE */
+
+#endif /* _CIFS_FSCACHE_H */
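The header wraps every entry point in a static inline that either forwards to the __-prefixed implementation or degrades to a no-op/-ENOBUFS stub when CONFIG_CIFS_FSCACHE is off, so callers in file.c and inode.c stay free of #ifdefs and treat "no cache compiled in", "no cookie", and "cache miss" identically. The pattern in miniature (all names here are illustrative, not part of the patch):

	#ifdef CONFIG_EXAMPLE_CACHE
	extern int __example_readpage_from_cache(struct inode *, struct page *);

	static inline int example_readpage_from_cache(struct inode *inode,
						      struct page *page)
	{
		if (EXAMPLE_I(inode)->cookie)
			return __example_readpage_from_cache(inode, page);
		return -ENOBUFS;	/* no cookie: same as a miss */
	}
	#else
	static inline int example_readpage_from_cache(struct inode *inode,
						      struct page *page)
	{
		return -ENOBUFS;	/* caching compiled out */
	}
	#endif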
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6f0683c6895..39869c3c3ef 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -29,6 +29,7 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "fscache.h"
 
 
 static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
@@ -51,7 +52,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 
 
 	/* check if server can support readpages */
-	if (cifs_sb->tcon->ses->server->maxBuf <
+	if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 		inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 	else
@@ -287,8 +288,8 @@ int cifs_get_file_info_unix(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
-	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -312,15 +313,21 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	FILE_UNIX_BASIC_INFO find_data;
 	struct cifs_fattr fattr;
 	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	tcon = cifs_sb->tcon;
 	cFYI(1, "Getting info on %s", full_path);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
 				  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
@@ -331,6 +338,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 		return rc;
 	}
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (*pinode == NULL) {
 		/* get new inode */
 		cifs_fill_uniqueid(sb, &fattr);
@@ -352,7 +366,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	int rc;
 	int oplock = 0;
 	__u16 netfid;
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	char buf[24];
 	unsigned int bytes_read;
 	char *pbuf;
@@ -371,7 +386,12 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 		return -EINVAL;	 /* EOPNOTSUPP? */
 	}
 
-	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
 			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
 			 cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags &
@@ -379,7 +399,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	if (rc == 0) {
 		int buf_type = CIFS_NO_BUFFER;
 		/* Read header */
-		rc = CIFSSMBRead(xid, pTcon, netfid,
+		rc = CIFSSMBRead(xid, tcon, netfid,
 				 24 /* length */, 0 /* offset */,
 				 &bytes_read, &pbuf, &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
@@ -421,8 +441,9 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 			fattr->cf_dtype = DT_REG;
 			rc = -EOPNOTSUPP; /* or some unknown SFU type */
 		}
-		CIFSSMBClose(xid, pTcon, netfid);
+		CIFSSMBClose(xid, tcon, netfid);
 	}
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -440,11 +461,19 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	ssize_t rc;
 	char ea_value[4];
 	__u32 mode;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
-	rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
+	rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
 			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			    cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 	if (rc < 0)
 		return (int)rc;
 	else if (rc > 3) {
@@ -467,6 +496,8 @@ static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		       struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
 	if (info->DeletePending)
@@ -481,8 +512,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
 
 	if (adjust_tz) {
-		fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
-		fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
+		fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
+		fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
 	}
 
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
@@ -514,8 +545,8 @@ int cifs_get_file_info(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
-	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -553,26 +584,33 @@ int cifs_get_inode_info(struct inode **pinode,
 {
 	int rc = 0, tmprc;
 	struct cifsTconInfo *pTcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	char *buf = NULL;
 	bool adjustTZ = false;
 	struct cifs_fattr fattr;
 
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
 		if (CIFS_I(*pinode)->clientCanCacheRead) {
 			cFYI(1, "No need to revalidate cached inode sizes");
-			return rc;
+			goto cgii_exit;
 		}
 	}
 
 	/* if file info not passed in then get it from server */
 	if (pfindData == NULL) {
 		buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-		if (buf == NULL)
-			return -ENOMEM;
+		if (buf == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 		pfindData = (FILE_ALL_INFO *)buf;
 
 		/* could do find first instead but this returns more info */
@@ -660,6 +698,13 @@ int cifs_get_inode_info(struct inode **pinode,
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
 		cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (!*pinode) {
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode)
@@ -670,6 +715,7 @@ int cifs_get_inode_info(struct inode **pinode,
 
 cgii_exit:
 	kfree(buf);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -682,6 +728,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 	int pplen = cifs_sb->prepathlen;
 	int dfsplen;
 	char *full_path = NULL;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	/* if no prefix path, simply set path to the root of share to "" */
 	if (pplen == 0) {
@@ -691,8 +738,8 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 	}
 
-	if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
-		dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
 	else
 		dfsplen = 0;
 
@@ -701,7 +748,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 
 	if (dfsplen) {
-		strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+		strncpy(full_path, tcon->treeName, dfsplen);
 		/* switch slash direction in prepath depending on whether
 		 * windows or posix style path names
 		 */
@@ -723,18 +770,17 @@ cifs_find_inode(struct inode *inode, void *opaque)
 {
 	struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
 
+	/* don't match inode with different uniqueid */
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
-	/*
-	 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
-	 * verboten. Disable serverino and return it as if it were found, the
-	 * caller can discard it, generate a uniqueid and retry the find
-	 */
-	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+	/* don't match inode of different type */
+	if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
+		return 0;
+
+	/* if it's not a directory or has no dentries, then flag it */
+	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry))
 		fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
-		cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
-	}
 
 	return 1;
 }
@@ -748,6 +794,27 @@ cifs_init_inode(struct inode *inode, void *opaque)
 	return 0;
 }
 
+/*
+ * walk dentry list for an inode and report whether it has aliases that
+ * are hashed. We use this to determine if a directory inode can actually
+ * be used.
+ */
+static bool
+inode_has_hashed_dentries(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
+			spin_unlock(&dcache_lock);
+			return true;
+		}
+	}
+	spin_unlock(&dcache_lock);
+	return false;
+}
+
 /* Given fattrs, get a corresponding inode */
 struct inode *
 cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
@@ -763,12 +830,16 @@ retry_iget5_locked:
 
 	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
 	if (inode) {
-		/* was there a problematic inode number collision? */
+		/* was there a potentially problematic inode collision? */
 		if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
-			iput(inode);
-			fattr->cf_uniqueid = iunique(sb, ROOT_I);
 			fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
-			goto retry_iget5_locked;
+
+			if (inode_has_hashed_dentries(inode)) {
+				cifs_autodisable_serverino(CIFS_SB(sb));
+				iput(inode);
+				fattr->cf_uniqueid = iunique(sb, ROOT_I);
+				goto retry_iget5_locked;
+			}
 		}
 
 		cifs_fattr_to_inode(inode, fattr);
@@ -776,6 +847,12 @@ retry_iget5_locked:
 		inode->i_flags |= S_NOATIME | S_NOCMTIME;
 		if (inode->i_state & I_NEW) {
 			inode->i_ino = hash;
+			if (S_ISREG(inode->i_mode))
+				inode->i_data.backing_dev_info = sb->s_bdi;
+#ifdef CONFIG_CIFS_FSCACHE
+			/* initialize per-inode cache cookie pointer */
+			CIFS_I(inode)->fscache = NULL;
+#endif
 			unlock_new_inode(inode);
 		}
 	}
@@ -787,27 +864,32 @@ retry_iget5_locked:
struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
 	int xid;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct inode *inode = NULL;
 	long rc;
 	char *full_path;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
-	cifs_sb = CIFS_SB(sb);
 	full_path = cifs_build_path_to_root(cifs_sb);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
 	xid = GetXid();
-	if (cifs_sb->tcon->unix_ext)
+	if (tcon->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
 						xid, NULL);
 
 	if (!inode)
-		return ERR_PTR(-ENOMEM);
+		return ERR_PTR(rc);
+
+#ifdef CONFIG_CIFS_FSCACHE
+	/* populate tcon->resource_id */
+	tcon->resource_id = CIFS_I(inode)->uniqueid;
+#endif
 
-	if (rc && cifs_sb->tcon->ipc) {
+	if (rc && tcon->ipc) {
 		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
@@ -843,7 +925,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink = NULL;
+	struct cifsTconInfo *pTcon;
 	FILE_BASIC_INFO info_buf;
 
 	if (attrs == NULL)
@@ -882,13 +965,22 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	/*
 	 * If the file is already open for write, just use that fileid
 	 */
-	open_file = find_writable_file(cifsInode);
+	open_file = find_writable_file(cifsInode, true);
 	if (open_file) {
 		netfid = open_file->netfid;
 		netpid = open_file->pid;
+		pTcon = tlink_tcon(open_file->tlink);
 		goto set_via_filehandle;
 	}
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		tlink = NULL;
+		goto out;
+	}
+	pTcon = tlink_tcon(tlink);
+
 	/*
 	 * NT4 apparently returns success on this call, but it doesn't
 	 * really work.
@@ -932,6 +1024,8 @@ set_via_filehandle:
 	else
 		cifsFileInfo_put(open_file);
 out:
+	if (tlink != NULL)
+		cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -949,10 +1043,16 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
 	struct inode *inode = dentry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	__u32 dosattr, origattr;
 	FILE_BASIC_INFO *info_buf = NULL;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
 			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
 			 &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -1021,6 +1121,7 @@ out_close:
 	CIFSSMBClose(xid, tcon, netfid);
out:
 	kfree(info_buf);
+	cifs_put_tlink(tlink);
 	return rc;
 
 	/*
@@ -1060,12 +1161,18 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct cifsInodeInfo *cifs_inode;
 	struct super_block *sb = dir->i_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
 	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	xid = GetXid();
 
 	/* Unlink can be called from rename so we can not take the
@@ -1073,8 +1180,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto unlink_out;
 	}
 
 	if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1140,10 +1246,11 @@ out_reval:
 	dir->i_ctime = dir->i_mtime = current_fs_time(sb);
 	cifs_inode = CIFS_I(dir);
 	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
-
+unlink_out:
 	kfree(full_path);
 	kfree(attrs);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1152,6 +1259,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	int rc = 0, tmprc;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
@@ -1159,16 +1267,18 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 
 	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
 
-	xid = GetXid();
-
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto mkdir_out;
 	}
 
 	if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1326,6 +1436,7 @@ mkdir_get_info:
mkdir_out:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1334,6 +1445,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	int rc = 0;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
@@ -1342,18 +1454,23 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
-
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto rmdir_exit;
+	}
+
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto rmdir_exit;
 	}
+	pTcon = tlink_tcon(tlink);
 
 	rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		drop_nlink(inode);
@@ -1374,6 +1491,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
 		current_fs_time(inode->i_sb);
 
+rmdir_exit:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1384,10 +1502,16 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 		struct dentry *to_dentry, const char *toPath)
{
 	struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *pTcon;
 	__u16 srcfid;
 	int oplock, rc;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	/* try path-based rename first */
 	rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
 			   cifs_sb->mnt_cifs_flags &
@@ -1399,11 +1523,11 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 	 * rename by filehandle to various Windows servers.
 	 */
 	if (rc == 0 || rc != -ETXTBSY)
-		return rc;
+		goto do_rename_exit;
 
 	/* open-file renames don't work across directories */
 	if (to_dentry->d_parent != from_dentry->d_parent)
-		return rc;
+		goto do_rename_exit;
 
 	/* open the file to be renamed -- we need DELETE perms */
 	rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
@@ -1419,7 +1543,8 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 
 		CIFSSMBClose(xid, pTcon, srcfid);
 	}
-
+do_rename_exit:
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1428,29 +1553,22 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
{
 	char *fromName = NULL;
 	char *toName = NULL;
-	struct cifs_sb_info *cifs_sb_source;
-	struct cifs_sb_info *cifs_sb_target;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *tcon;
 	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
 	FILE_UNIX_BASIC_INFO *info_buf_target;
 	int xid, rc, tmprc;
 
-	cifs_sb_target = CIFS_SB(target_dir->i_sb);
-	cifs_sb_source = CIFS_SB(source_dir->i_sb);
-	tcon = cifs_sb_source->tcon;
+	cifs_sb = CIFS_SB(source_dir->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
 	xid = GetXid();
 
 	/*
-	 * BB: this might be allowed if same server, but different share.
-	 * Consider adding support for this
-	 */
-	if (tcon != cifs_sb_target->tcon) {
-		rc = -EXDEV;
-		goto cifs_rename_exit;
-	}
-
-	/*
 	 * we already have the rename sem so we do not need to
 	 * grab it again here to protect the path integrity
 	 */
@@ -1485,17 +1603,16 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 		info_buf_target = info_buf_source + 1;
 		tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
 					     info_buf_source,
-					     cifs_sb_source->local_nls,
-					     cifs_sb_source->mnt_cifs_flags &
+					     cifs_sb->local_nls,
+					     cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 		if (tmprc != 0)
 			goto unlink_target;
 
-		tmprc = CIFSSMBUnixQPathInfo(xid, tcon,
-					     toName, info_buf_target,
-					     cifs_sb_target->local_nls,
-					     /* remap based on source sb */
-					     cifs_sb_source->mnt_cifs_flags &
+		tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName,
					     info_buf_target,
+					     cifs_sb->local_nls,
+					     cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 		if (tmprc == 0 && (info_buf_source->UniqueId ==
@@ -1523,6 +1640,7 @@ cifs_rename_exit:
 	kfree(fromName);
 	kfree(toName);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1564,21 +1682,22 @@ cifs_invalidate_mapping(struct inode *inode)
 	/* write back any cached data */
 	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
 		rc = filemap_write_and_wait(inode->i_mapping);
-		if (rc)
-			cifs_i->write_behind_rc = rc;
+		mapping_set_error(inode->i_mapping, rc);
 	}
 	invalidate_remote_inode(inode);
+	cifs_fscache_reset_inode_cookie(inode);
 }
 
int cifs_revalidate_file(struct file *filp)
 {
 	int rc = 0;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
 
 	if (!cifs_inode_needs_reval(inode))
 		goto check_inval;
 
-	if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+	if (tlink_tcon(cfile->tlink)->unix_ext)
 		rc = cifs_get_file_info_unix(filp);
 	else
 		rc = cifs_get_file_info(filp);
@@ -1619,7 +1738,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
1619 "jiffies %ld", full_path, inode, inode->i_count.counter, 1738 "jiffies %ld", full_path, inode, inode->i_count.counter,
1620 dentry, dentry->d_time, jiffies); 1739 dentry, dentry->d_time, jiffies);
1621 1740
1622 if (CIFS_SB(sb)->tcon->unix_ext) 1741 if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
1623 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 1742 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1624 else 1743 else
1625 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 1744 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -1635,13 +1754,29 @@ check_inval:
1635} 1754}
1636 1755
1637int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1756int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1638 struct kstat *stat) 1757 struct kstat *stat)
1639{ 1758{
1759 struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
1760 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
1640 int err = cifs_revalidate_dentry(dentry); 1761 int err = cifs_revalidate_dentry(dentry);
1762
1641 if (!err) { 1763 if (!err) {
1642 generic_fillattr(dentry->d_inode, stat); 1764 generic_fillattr(dentry->d_inode, stat);
1643 stat->blksize = CIFS_MAX_MSGSIZE; 1765 stat->blksize = CIFS_MAX_MSGSIZE;
1644 stat->ino = CIFS_I(dentry->d_inode)->uniqueid; 1766 stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
1767
1768 /*
1769 * If on a multiuser mount without unix extensions, and the
1770 * admin hasn't overridden them, set the ownership to the
1771 * fsuid/fsgid of the current process.
1772 */
1773 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
1774 !tcon->unix_ext) {
1775 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
1776 stat->uid = current_fsuid();
1777 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
1778 stat->gid = current_fsgid();
1779 }
1645 } 1780 }
1646 return err; 1781 return err;
1647} 1782}
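
The new branch in cifs_getattr() overrides the reported owner only when all three conditions hold: multiuser mount, no unix extensions on the tcon, and no admin-forced uid/gid. Condensed into a predicate; the flag bits below are illustrative placeholders, not the kernel's CIFS_MOUNT_* values:

    #include <stdbool.h>
    #include <stdio.h>

    #define MNT_MULTIUSER 0x1    /* placeholder for CIFS_MOUNT_MULTIUSER */
    #define MNT_OVERR_UID 0x2    /* placeholder for CIFS_MOUNT_OVERR_UID */

    /* report the caller's fsuid only on multiuser mounts without unix
     * extensions, and only when the admin did not force an owner */
    static bool use_caller_uid(unsigned flags, bool unix_ext)
    {
        return (flags & MNT_MULTIUSER) && !unix_ext && !(flags & MNT_OVERR_UID);
    }

    int main(void)
    {
        printf("%d\n", use_caller_uid(MNT_MULTIUSER, false));                 /* 1 */
        printf("%d\n", use_caller_uid(MNT_MULTIUSER | MNT_OVERR_UID, false)); /* 0 */
        printf("%d\n", use_caller_uid(MNT_MULTIUSER, true));                  /* 0 */
        return 0;
    }
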
@@ -1663,26 +1798,16 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
1663 return rc; 1798 return rc;
1664} 1799}
1665 1800
1666static int cifs_vmtruncate(struct inode *inode, loff_t offset) 1801static void cifs_setsize(struct inode *inode, loff_t offset)
1667{ 1802{
1668 loff_t oldsize; 1803 loff_t oldsize;
1669 int err;
1670 1804
1671 spin_lock(&inode->i_lock); 1805 spin_lock(&inode->i_lock);
1672 err = inode_newsize_ok(inode, offset);
1673 if (err) {
1674 spin_unlock(&inode->i_lock);
1675 goto out;
1676 }
1677
1678 oldsize = inode->i_size; 1806 oldsize = inode->i_size;
1679 i_size_write(inode, offset); 1807 i_size_write(inode, offset);
1680 spin_unlock(&inode->i_lock); 1808 spin_unlock(&inode->i_lock);
1809
1681 truncate_pagecache(inode, oldsize, offset); 1810 truncate_pagecache(inode, oldsize, offset);
1682 if (inode->i_op->truncate)
1683 inode->i_op->truncate(inode);
1684out:
1685 return err;
1686} 1811}
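
cifs_vmtruncate() becomes cifs_setsize(): the inode_newsize_ok() check and the ->truncate() callback are gone (those duties move into the setattr paths), leaving only publish-the-new-size-under-the-lock followed by page-cache pruning. A rough userspace model of that ordering, with a pthread mutex standing in for i_lock and a printf standing in for truncate_pagecache(); build with -pthread:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t i_lock = PTHREAD_MUTEX_INITIALIZER;
    static long long i_size = 4096;

    /* publish the new size atomically, then prune cached pages beyond it;
     * validity checks happen earlier, in the caller */
    static void setsize(long long offset)
    {
        long long oldsize;

        pthread_mutex_lock(&i_lock);
        oldsize = i_size;
        i_size = offset;
        pthread_mutex_unlock(&i_lock);

        printf("prune pagecache: %lld -> %lld\n", oldsize, offset);
    }

    int main(void)
    {
        setsize(1024);
        return 0;
    }
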
1687 1812
1688static int 1813static int
@@ -1693,7 +1818,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1693 struct cifsFileInfo *open_file; 1818 struct cifsFileInfo *open_file;
1694 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1819 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1695 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1820 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1696 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1821 struct tcon_link *tlink = NULL;
1822 struct cifsTconInfo *pTcon = NULL;
1697 1823
1698 /* 1824 /*
1699 * To avoid spurious oplock breaks from server, in the case of 1825 * To avoid spurious oplock breaks from server, in the case of
@@ -1704,10 +1830,11 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1704 * writebehind data than the SMB timeout for the SetPathInfo 1830 * writebehind data than the SMB timeout for the SetPathInfo
1705 * request would allow 1831 * request would allow
1706 */ 1832 */
1707 open_file = find_writable_file(cifsInode); 1833 open_file = find_writable_file(cifsInode, true);
1708 if (open_file) { 1834 if (open_file) {
1709 __u16 nfid = open_file->netfid; 1835 __u16 nfid = open_file->netfid;
1710 __u32 npid = open_file->pid; 1836 __u32 npid = open_file->pid;
1837 pTcon = tlink_tcon(open_file->tlink);
1711 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1838 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1712 npid, false); 1839 npid, false);
1713 cifsFileInfo_put(open_file); 1840 cifsFileInfo_put(open_file);
@@ -1722,6 +1849,13 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1722 rc = -EINVAL; 1849 rc = -EINVAL;
1723 1850
1724 if (rc != 0) { 1851 if (rc != 0) {
1852 if (pTcon == NULL) {
1853 tlink = cifs_sb_tlink(cifs_sb);
1854 if (IS_ERR(tlink))
1855 return PTR_ERR(tlink);
1856 pTcon = tlink_tcon(tlink);
1857 }
1858
1725 /* Set file size by pathname rather than by handle 1859 /* Set file size by pathname rather than by handle
1726 either because no valid, writeable file handle for 1860 either because no valid, writeable file handle for
1727 it was found or because there was an error setting 1861 it was found or because there was an error setting
@@ -1751,11 +1885,13 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1751 CIFSSMBClose(xid, pTcon, netfid); 1885 CIFSSMBClose(xid, pTcon, netfid);
1752 } 1886 }
1753 } 1887 }
1888 if (tlink)
1889 cifs_put_tlink(tlink);
1754 } 1890 }
1755 1891
1756 if (rc == 0) { 1892 if (rc == 0) {
1757 cifsInode->server_eof = attrs->ia_size; 1893 cifsInode->server_eof = attrs->ia_size;
1758 rc = cifs_vmtruncate(inode, attrs->ia_size); 1894 cifs_setsize(inode, attrs->ia_size);
1759 cifs_truncate_page(inode->i_mapping, inode->i_size); 1895 cifs_truncate_page(inode->i_mapping, inode->i_size);
1760 } 1896 }
1761 1897
@@ -1771,7 +1907,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1771 struct inode *inode = direntry->d_inode; 1907 struct inode *inode = direntry->d_inode;
1772 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1908 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1773 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1909 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1774 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1910 struct tcon_link *tlink;
1911 struct cifsTconInfo *pTcon;
1775 struct cifs_unix_set_info_args *args = NULL; 1912 struct cifs_unix_set_info_args *args = NULL;
1776 struct cifsFileInfo *open_file; 1913 struct cifsFileInfo *open_file;
1777 1914
@@ -1780,14 +1917,12 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1780 1917
1781 xid = GetXid(); 1918 xid = GetXid();
1782 1919
1783 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 1920 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
1784 /* check if we have permission to change attrs */ 1921 attrs->ia_valid |= ATTR_FORCE;
1785 rc = inode_change_ok(inode, attrs); 1922
1786 if (rc < 0) 1923 rc = inode_change_ok(inode, attrs);
1787 goto out; 1924 if (rc < 0)
1788 else 1925 goto out;
1789 rc = 0;
1790 }
1791 1926
1792 full_path = build_path_from_dentry(direntry); 1927 full_path = build_path_from_dentry(direntry);
1793 if (full_path == NULL) { 1928 if (full_path == NULL) {
@@ -1807,10 +1942,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1807 * the flush returns error? 1942 * the flush returns error?
1808 */ 1943 */
1809 rc = filemap_write_and_wait(inode->i_mapping); 1944 rc = filemap_write_and_wait(inode->i_mapping);
1810 if (rc != 0) { 1945 mapping_set_error(inode->i_mapping, rc);
1811 cifsInode->write_behind_rc = rc; 1946 rc = 0;
1812 rc = 0;
1813 }
1814 1947
1815 if (attrs->ia_valid & ATTR_SIZE) { 1948 if (attrs->ia_valid & ATTR_SIZE) {
1816 rc = cifs_set_file_size(inode, attrs, xid, full_path); 1949 rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -1860,31 +1993,45 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1860 args->ctime = NO_CHANGE_64; 1993 args->ctime = NO_CHANGE_64;
1861 1994
1862 args->device = 0; 1995 args->device = 0;
1863 open_file = find_writable_file(cifsInode); 1996 open_file = find_writable_file(cifsInode, true);
1864 if (open_file) { 1997 if (open_file) {
1865 u16 nfid = open_file->netfid; 1998 u16 nfid = open_file->netfid;
1866 u32 npid = open_file->pid; 1999 u32 npid = open_file->pid;
2000 pTcon = tlink_tcon(open_file->tlink);
1867 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); 2001 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1868 cifsFileInfo_put(open_file); 2002 cifsFileInfo_put(open_file);
1869 } else { 2003 } else {
2004 tlink = cifs_sb_tlink(cifs_sb);
2005 if (IS_ERR(tlink)) {
2006 rc = PTR_ERR(tlink);
2007 goto out;
2008 }
2009 pTcon = tlink_tcon(tlink);
1870 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, 2010 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1871 cifs_sb->local_nls, 2011 cifs_sb->local_nls,
1872 cifs_sb->mnt_cifs_flags & 2012 cifs_sb->mnt_cifs_flags &
1873 CIFS_MOUNT_MAP_SPECIAL_CHR); 2013 CIFS_MOUNT_MAP_SPECIAL_CHR);
2014 cifs_put_tlink(tlink);
1874 } 2015 }
1875 2016
1876 if (!rc) { 2017 if (rc)
1877 rc = inode_setattr(inode, attrs); 2018 goto out;
1878 2019
1879 /* force revalidate when any of these times are set since some 2020 if ((attrs->ia_valid & ATTR_SIZE) &&
1880 of the fs types (eg ext3, fat) do not have fine enough 2021 attrs->ia_size != i_size_read(inode))
1881 time granularity to match protocol, and we do not have 2022 truncate_setsize(inode, attrs->ia_size);
1882 a way (yet) to query the server fs's time granularity (and 2023
1883 whether it rounds times down). 2024 setattr_copy(inode, attrs);
1884 */ 2025 mark_inode_dirty(inode);
1885 if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))) 2026
1886 cifsInode->time = 0; 2027 /* force revalidate when any of these times are set since some
1887 } 2028 of the fs types (eg ext3, fat) do not have fine enough
2029 time granularity to match protocol, and we do not have
2030 a way (yet) to query the server fs's time granularity (and
2031 whether it rounds times down).
2032 */
2033 if (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))
2034 cifsInode->time = 0;
1888out: 2035out:
1889 kfree(args); 2036 kfree(args);
1890 kfree(full_path); 2037 kfree(full_path);
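
Two changes in this function are worth separating. First, NO_PERM mounts now set ATTR_FORCE instead of skipping inode_change_ok() outright, so size and limit validation still runs while ownership checks are waived. Second, the removed inode_setattr() call is replaced by its 2.6.36-era split: truncate_setsize() for the size, then setattr_copy() plus mark_inode_dirty() for the rest. A toy sketch of the ATTR_FORCE flow; the constants are illustrative, not the kernel's values:

    #include <stdio.h>

    #define ATTR_SIZE  0x1    /* illustrative values */
    #define ATTR_FORCE 0x2

    struct iattr { unsigned ia_valid; long long ia_size; };

    /* with ATTR_FORCE set, the ownership check is waived; size sanity
     * checks would still run in the real inode_change_ok() */
    static int change_ok(const struct iattr *a, int is_owner)
    {
        if ((a->ia_valid & ATTR_FORCE) || is_owner)
            return 0;
        return -1;  /* -EPERM in the kernel */
    }

    int main(void)
    {
        struct iattr a = { ATTR_SIZE | ATTR_FORCE, 0 };
        printf("rc=%d\n", change_ok(&a, 0));  /* rc=0: server enforces perms */
        return 0;
    }
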
@@ -1909,14 +2056,13 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1909 cFYI(1, "setattr on file %s attrs->ia_valid 0x%x", 2056 cFYI(1, "setattr on file %s attrs->ia_valid 0x%x",
1910 direntry->d_name.name, attrs->ia_valid); 2057 direntry->d_name.name, attrs->ia_valid);
1911 2058
1912 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 2059 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
1913 /* check if we have permission to change attrs */ 2060 attrs->ia_valid |= ATTR_FORCE;
1914 rc = inode_change_ok(inode, attrs); 2061
1915 if (rc < 0) { 2062 rc = inode_change_ok(inode, attrs);
1916 FreeXid(xid); 2063 if (rc < 0) {
1917 return rc; 2064 FreeXid(xid);
1918 } else 2065 return rc;
1919 rc = 0;
1920 } 2066 }
1921 2067
1922 full_path = build_path_from_dentry(direntry); 2068 full_path = build_path_from_dentry(direntry);
@@ -1938,10 +2084,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1938 * the flush returns error? 2084 * the flush returns error?
1939 */ 2085 */
1940 rc = filemap_write_and_wait(inode->i_mapping); 2086 rc = filemap_write_and_wait(inode->i_mapping);
1941 if (rc != 0) { 2087 mapping_set_error(inode->i_mapping, rc);
1942 cifsInode->write_behind_rc = rc; 2088 rc = 0;
1943 rc = 0;
1944 }
1945 2089
1946 if (attrs->ia_valid & ATTR_SIZE) { 2090 if (attrs->ia_valid & ATTR_SIZE) {
1947 rc = cifs_set_file_size(inode, attrs, xid, full_path); 2091 rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -2024,8 +2168,17 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2024 2168
2025 /* do not need local check to inode_change_ok since the server does 2169 /* do not need local check to inode_change_ok since the server does
2026 that */ 2170 that */
2027 if (!rc) 2171 if (rc)
2028 rc = inode_setattr(inode, attrs); 2172 goto cifs_setattr_exit;
2173
2174 if ((attrs->ia_valid & ATTR_SIZE) &&
2175 attrs->ia_size != i_size_read(inode))
2176 truncate_setsize(inode, attrs->ia_size);
2177
2178 setattr_copy(inode, attrs);
2179 mark_inode_dirty(inode);
2180 return 0;
2181
2029cifs_setattr_exit: 2182cifs_setattr_exit:
2030 kfree(full_path); 2183 kfree(full_path);
2031 FreeXid(xid); 2184 FreeXid(xid);
@@ -2037,7 +2190,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2037{ 2190{
2038 struct inode *inode = direntry->d_inode; 2191 struct inode *inode = direntry->d_inode;
2039 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 2192 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2040 struct cifsTconInfo *pTcon = cifs_sb->tcon; 2193 struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
2041 2194
2042 if (pTcon->unix_ext) 2195 if (pTcon->unix_ext)
2043 return cifs_setattr_unix(direntry, attrs); 2196 return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 505926f1ee6..077bf756f34 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -37,12 +37,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
37 int xid; 37 int xid;
38 struct cifs_sb_info *cifs_sb; 38 struct cifs_sb_info *cifs_sb;
39#ifdef CONFIG_CIFS_POSIX 39#ifdef CONFIG_CIFS_POSIX
40 struct cifsFileInfo *pSMBFile = filep->private_data;
41 struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink);
40 __u64 ExtAttrBits = 0; 42 __u64 ExtAttrBits = 0;
41 __u64 ExtAttrMask = 0; 43 __u64 ExtAttrMask = 0;
42 __u64 caps; 44 __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
43 struct cifsTconInfo *tcon;
44 struct cifsFileInfo *pSMBFile =
45 (struct cifsFileInfo *)filep->private_data;
46#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
47 46
48 xid = GetXid(); 47 xid = GetXid();
@@ -51,17 +50,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
51 50
52 cifs_sb = CIFS_SB(inode->i_sb); 51 cifs_sb = CIFS_SB(inode->i_sb);
53 52
54#ifdef CONFIG_CIFS_POSIX
55 tcon = cifs_sb->tcon;
56 if (tcon)
57 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
58 else {
59 rc = -EIO;
60 FreeXid(xid);
61 return -EIO;
62 }
63#endif /* CONFIG_CIFS_POSIX */
64
65 switch (command) { 53 switch (command) {
66 case CIFS_IOC_CHECKUMOUNT: 54 case CIFS_IOC_CHECKUMOUNT:
67 cFYI(1, "User unmount attempted"); 55 cFYI(1, "User unmount attempted");
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 473ca803365..85cdbf831e7 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,6 +28,296 @@
28#include "cifsproto.h" 28#include "cifsproto.h"
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31#include "md5.h"
32
33#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
34#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
35#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
36#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024)
37#define CIFS_MF_SYMLINK_FILE_SIZE \
38 (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
39
40#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
41#define CIFS_MF_SYMLINK_MD5_FORMAT \
42 "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
43#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
44 md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \
45 md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \
46 md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\
47 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
48
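
These macros pin down the Minshall+French symlink container: a regular file of exactly 1067 bytes beginning "XSym\n", then a four-digit decimal target length, then the target's MD5 in hex, then the target itself, newline-terminated and space-padded. The offset arithmetic checks out in a few lines of userspace C (same expressions, renamed macros):

    #include <stdio.h>

    #define LEN_OFFSET  (4 + 1)                  /* past "XSym\n"               */
    #define MD5_OFFSET  (LEN_OFFSET + (4 + 1))   /* past "%04u\n"               */
    #define LINK_OFFSET (MD5_OFFSET + (32 + 1))  /* past 32 hex digits + '\n'   */
    #define FILE_SIZE   (LINK_OFFSET + 1024)     /* room for a 1024-byte target */

    int main(void)
    {
        /* prints: len@5 md5@10 link@43 total=1067 */
        printf("len@%d md5@%d link@%d total=%d\n",
               LEN_OFFSET, MD5_OFFSET, LINK_OFFSET, FILE_SIZE);
        return 0;
    }
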
49static int
50CIFSParseMFSymlink(const u8 *buf,
51 unsigned int buf_len,
52 unsigned int *_link_len,
53 char **_link_str)
54{
55 int rc;
56 unsigned int link_len;
57 const char *md5_str1;
58 const char *link_str;
59 struct MD5Context md5_ctx;
60 u8 md5_hash[16];
61 char md5_str2[34];
62
63 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
64 return -EINVAL;
65
66 md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET];
67 link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET];
68
69 rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len);
70 if (rc != 1)
71 return -EINVAL;
72
73 cifs_MD5_init(&md5_ctx);
74 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
75 cifs_MD5_final(md5_hash, &md5_ctx);
76
77 snprintf(md5_str2, sizeof(md5_str2),
78 CIFS_MF_SYMLINK_MD5_FORMAT,
79 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
80
81 if (strncmp(md5_str1, md5_str2, 17) != 0)
82 return -EINVAL;
83
84 if (_link_str) {
85 *_link_str = kstrndup(link_str, link_len, GFP_KERNEL);
86 if (!*_link_str)
87 return -ENOMEM;
88 }
89
90 *_link_len = link_len;
91 return 0;
92}
93
94static int
95CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
96{
97 unsigned int link_len;
98 unsigned int ofs;
99 struct MD5Context md5_ctx;
100 u8 md5_hash[16];
101
102 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
103 return -EINVAL;
104
105 link_len = strlen(link_str);
106
107 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
108 return -ENAMETOOLONG;
109
110 cifs_MD5_init(&md5_ctx);
111 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
112 cifs_MD5_final(md5_hash, &md5_ctx);
113
114 snprintf(buf, buf_len,
115 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
116 link_len,
117 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
118
119 ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
120 memcpy(buf + ofs, link_str, link_len);
121
122 ofs += link_len;
123 if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
124 buf[ofs] = '\n';
125 ofs++;
126 }
127
128 while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
129 buf[ofs] = ' ';
130 ofs++;
131 }
132
133 return 0;
134}
135
136static int
137CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
138 const char *fromName, const char *toName,
139 const struct nls_table *nls_codepage, int remap)
140{
141 int rc;
142 int oplock = 0;
143 __u16 netfid = 0;
144 u8 *buf;
145 unsigned int bytes_written = 0;
146
147 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
148 if (!buf)
149 return -ENOMEM;
150
151 rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
152 if (rc != 0) {
153 kfree(buf);
154 return rc;
155 }
156
157 rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
158 CREATE_NOT_DIR, &netfid, &oplock, NULL,
159 nls_codepage, remap);
160 if (rc != 0) {
161 kfree(buf);
162 return rc;
163 }
164
165 rc = CIFSSMBWrite(xid, tcon, netfid,
166 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
167 0 /* offset */,
168 &bytes_written, buf, NULL, 0);
169 CIFSSMBClose(xid, tcon, netfid);
170 kfree(buf);
171 if (rc != 0)
172 return rc;
173
174 if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
175 return -EIO;
176
177 return 0;
178}
179
180static int
181CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
182 const unsigned char *searchName, char **symlinkinfo,
183 const struct nls_table *nls_codepage, int remap)
184{
185 int rc;
186 int oplock = 0;
187 __u16 netfid = 0;
188 u8 *buf;
189 char *pbuf;
190 unsigned int bytes_read = 0;
191 int buf_type = CIFS_NO_BUFFER;
192 unsigned int link_len = 0;
193 FILE_ALL_INFO file_info;
194
195 rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
196 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
197 nls_codepage, remap);
198 if (rc != 0)
199 return rc;
200
201 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
202 CIFSSMBClose(xid, tcon, netfid);
203 /* it's not a symlink */
204 return -EINVAL;
205 }
206
207 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
208 if (!buf)
209 return -ENOMEM;
210 pbuf = buf;
211
212 rc = CIFSSMBRead(xid, tcon, netfid,
213 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
214 0 /* offset */,
215 &bytes_read, &pbuf, &buf_type);
216 CIFSSMBClose(xid, tcon, netfid);
217 if (rc != 0) {
218 kfree(buf);
219 return rc;
220 }
221
222 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
223 kfree(buf);
224 if (rc != 0)
225 return rc;
226
227 return 0;
228}
229
230bool
231CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
232{
233 if (!(fattr->cf_mode & S_IFREG))
234 /* it's not a symlink */
235 return false;
236
237 if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
238 /* it's not a symlink */
239 return false;
240
241 return true;
242}
243
244int
245CIFSCheckMFSymlink(struct cifs_fattr *fattr,
246 const unsigned char *path,
247 struct cifs_sb_info *cifs_sb, int xid)
248{
249 int rc;
250 int oplock = 0;
251 __u16 netfid = 0;
252 struct tcon_link *tlink;
253 struct cifsTconInfo *pTcon;
254 u8 *buf;
255 char *pbuf;
256 unsigned int bytes_read = 0;
257 int buf_type = CIFS_NO_BUFFER;
258 unsigned int link_len = 0;
259 FILE_ALL_INFO file_info;
260
261 if (!CIFSCouldBeMFSymlink(fattr))
262 /* it's not a symlink */
263 return 0;
264
265 tlink = cifs_sb_tlink(cifs_sb);
266 if (IS_ERR(tlink))
267 return PTR_ERR(tlink);
268 pTcon = tlink_tcon(tlink);
269
270 rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
271 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
272 cifs_sb->local_nls,
273 cifs_sb->mnt_cifs_flags &
274 CIFS_MOUNT_MAP_SPECIAL_CHR);
275 if (rc != 0)
276 goto out;
277
278 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
279 CIFSSMBClose(xid, pTcon, netfid);
280 /* it's not a symlink */
281 goto out;
282 }
283
284 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
285 if (!buf) {
286 rc = -ENOMEM;
287 goto out;
288 }
289 pbuf = buf;
290
291 rc = CIFSSMBRead(xid, pTcon, netfid,
292 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
293 0 /* offset */,
294 &bytes_read, &pbuf, &buf_type);
295 CIFSSMBClose(xid, pTcon, netfid);
296 if (rc != 0) {
297 kfree(buf);
298 goto out;
299 }
300
301 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
302 kfree(buf);
303 if (rc == -EINVAL) {
304 /* it's not a symlink */
305 rc = 0;
306 goto out;
307 }
308
309 if (rc != 0)
310 goto out;
311
312 /* it is a symlink */
313 fattr->cf_eof = link_len;
314 fattr->cf_mode &= ~S_IFMT;
315 fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
316 fattr->cf_dtype = DT_LNK;
317out:
318 cifs_put_tlink(tlink);
319 return rc;
320}
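
Both callers gate the expensive open/read/parse sequence behind CIFSCouldBeMFSymlink(), which costs nothing: only a regular file of exactly the container size can be such a symlink. The filter reduces to two comparisons:

    #include <stdbool.h>
    #include <stdio.h>

    #define MF_FILE_SIZE 1067  /* CIFS_MF_SYMLINK_FILE_SIZE */

    /* cheap pre-filter: anything else cannot be an MF symlink, so no
     * extra network round trip is spent on it */
    static bool could_be_mf_symlink(bool is_regular, long long eof)
    {
        return is_regular && eof == MF_FILE_SIZE;
    }

    int main(void)
    {
        printf("%d %d\n", could_be_mf_symlink(true, 1067),
                          could_be_mf_symlink(true, 1066));  /* 1 0 */
        return 0;
    }
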
31 321
32int 322int
33cifs_hardlink(struct dentry *old_file, struct inode *inode, 323cifs_hardlink(struct dentry *old_file, struct inode *inode,
@@ -37,17 +327,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
37 int xid; 327 int xid;
38 char *fromName = NULL; 328 char *fromName = NULL;
39 char *toName = NULL; 329 char *toName = NULL;
40 struct cifs_sb_info *cifs_sb_target; 330 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
331 struct tcon_link *tlink;
41 struct cifsTconInfo *pTcon; 332 struct cifsTconInfo *pTcon;
42 struct cifsInodeInfo *cifsInode; 333 struct cifsInodeInfo *cifsInode;
43 334
44 xid = GetXid(); 335 tlink = cifs_sb_tlink(cifs_sb);
45 336 if (IS_ERR(tlink))
46 cifs_sb_target = CIFS_SB(inode->i_sb); 337 return PTR_ERR(tlink);
47 pTcon = cifs_sb_target->tcon; 338 pTcon = tlink_tcon(tlink);
48 339
49/* No need to check for cross device links since server will do that 340 xid = GetXid();
50 BB note DFS case in future though (when we may have to check) */
51 341
52 fromName = build_path_from_dentry(old_file); 342 fromName = build_path_from_dentry(old_file);
53 toName = build_path_from_dentry(direntry); 343 toName = build_path_from_dentry(direntry);
@@ -56,16 +346,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
56 goto cifs_hl_exit; 346 goto cifs_hl_exit;
57 } 347 }
58 348
59/* if (cifs_sb_target->tcon->ses->capabilities & CAP_UNIX)*/
60 if (pTcon->unix_ext) 349 if (pTcon->unix_ext)
61 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName, 350 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
62 cifs_sb_target->local_nls, 351 cifs_sb->local_nls,
63 cifs_sb_target->mnt_cifs_flags & 352 cifs_sb->mnt_cifs_flags &
64 CIFS_MOUNT_MAP_SPECIAL_CHR); 353 CIFS_MOUNT_MAP_SPECIAL_CHR);
65 else { 354 else {
66 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName, 355 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName,
67 cifs_sb_target->local_nls, 356 cifs_sb->local_nls,
68 cifs_sb_target->mnt_cifs_flags & 357 cifs_sb->mnt_cifs_flags &
69 CIFS_MOUNT_MAP_SPECIAL_CHR); 358 CIFS_MOUNT_MAP_SPECIAL_CHR);
70 if ((rc == -EIO) || (rc == -EINVAL)) 359 if ((rc == -EIO) || (rc == -EINVAL))
71 rc = -EOPNOTSUPP; 360 rc = -EOPNOTSUPP;
@@ -101,6 +390,7 @@ cifs_hl_exit:
101 kfree(fromName); 390 kfree(fromName);
102 kfree(toName); 391 kfree(toName);
103 FreeXid(xid); 392 FreeXid(xid);
393 cifs_put_tlink(tlink);
104 return rc; 394 return rc;
105} 395}
106 396
@@ -113,10 +403,19 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
113 char *full_path = NULL; 403 char *full_path = NULL;
114 char *target_path = NULL; 404 char *target_path = NULL;
115 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 405 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
116 struct cifsTconInfo *tcon = cifs_sb->tcon; 406 struct tcon_link *tlink = NULL;
407 struct cifsTconInfo *tcon;
117 408
118 xid = GetXid(); 409 xid = GetXid();
119 410
411 tlink = cifs_sb_tlink(cifs_sb);
412 if (IS_ERR(tlink)) {
413 rc = PTR_ERR(tlink);
414 tlink = NULL;
415 goto out;
416 }
417 tcon = tlink_tcon(tlink);
418
120 /* 419 /*
121 * For now, we just handle symlinks with unix extensions enabled. 420 * For now, we just handle symlinks with unix extensions enabled.
122 * Eventually we should handle NTFS reparse points, and MacOS 421 * Eventually we should handle NTFS reparse points, and MacOS
@@ -130,7 +429,8 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
130 * but there doesn't seem to be any harm in allowing the client to 429 * but there doesn't seem to be any harm in allowing the client to
131 * read them. 430 * read them.
132 */ 431 */
133 if (!(tcon->ses->capabilities & CAP_UNIX)) { 432 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
433 && !(tcon->ses->capabilities & CAP_UNIX)) {
134 rc = -EACCES; 434 rc = -EACCES;
135 goto out; 435 goto out;
136 } 436 }
@@ -141,8 +441,21 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
141 441
142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode); 442 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
143 443
144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 444 rc = -EACCES;
145 cifs_sb->local_nls); 445 /*
446 * First try Minshall+French Symlinks, if configured
447 * and fallback to UNIX Extensions Symlinks.
448 */
449 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
450 rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
451 cifs_sb->local_nls,
452 cifs_sb->mnt_cifs_flags &
453 CIFS_MOUNT_MAP_SPECIAL_CHR);
454
455 if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX))
456 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
457 cifs_sb->local_nls);
458
146 kfree(full_path); 459 kfree(full_path);
147out: 460out:
148 if (rc != 0) { 461 if (rc != 0) {
@@ -151,6 +464,8 @@ out:
151 } 464 }
152 465
153 FreeXid(xid); 466 FreeXid(xid);
467 if (tlink)
468 cifs_put_tlink(tlink);
154 nd_set_link(nd, target_path); 469 nd_set_link(nd, target_path);
155 return NULL; 470 return NULL;
156} 471}
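
The rewritten lookup order in cifs_follow_link() is: start from -EACCES, try the Minshall+French store when the mount enabled it, and fall back to UNIX-extensions symlinks when the server advertises CAP_UNIX; only if both are unavailable does the -EACCES survive. A self-contained sketch of that chain, with stubs approximating the two CIFS query calls:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* stand-ins for CIFSQueryMFSymLink() and CIFSSMBUnixQuerySymLink() */
    static int query_mf_symlink(const char *path)   { (void)path; return -EINVAL; }
    static int query_unix_symlink(const char *path) { (void)path; return 0; }

    static int resolve_link(const char *path, bool mf_enabled, bool cap_unix)
    {
        int rc = -EACCES;  /* neither method available */

        if (mf_enabled)
            rc = query_mf_symlink(path);
        if (rc != 0 && cap_unix)
            rc = query_unix_symlink(path);
        return rc;
    }

    int main(void)
    {
        /* MF parse fails, UNIX-extensions path succeeds */
        printf("rc=%d\n", resolve_link("/mnt/share/l", true, true));  /* rc=0 */
        return 0;
    }
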
@@ -160,29 +475,37 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
160{ 475{
161 int rc = -EOPNOTSUPP; 476 int rc = -EOPNOTSUPP;
162 int xid; 477 int xid;
163 struct cifs_sb_info *cifs_sb; 478 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
479 struct tcon_link *tlink;
164 struct cifsTconInfo *pTcon; 480 struct cifsTconInfo *pTcon;
165 char *full_path = NULL; 481 char *full_path = NULL;
166 struct inode *newinode = NULL; 482 struct inode *newinode = NULL;
167 483
168 xid = GetXid(); 484 xid = GetXid();
169 485
170 cifs_sb = CIFS_SB(inode->i_sb); 486 tlink = cifs_sb_tlink(cifs_sb);
171 pTcon = cifs_sb->tcon; 487 if (IS_ERR(tlink)) {
488 rc = PTR_ERR(tlink);
489 goto symlink_exit;
490 }
491 pTcon = tlink_tcon(tlink);
172 492
173 full_path = build_path_from_dentry(direntry); 493 full_path = build_path_from_dentry(direntry);
174
175 if (full_path == NULL) { 494 if (full_path == NULL) {
176 rc = -ENOMEM; 495 rc = -ENOMEM;
177 FreeXid(xid); 496 goto symlink_exit;
178 return rc;
179 } 497 }
180 498
181 cFYI(1, "Full path: %s", full_path); 499 cFYI(1, "Full path: %s", full_path);
182 cFYI(1, "symname is %s", symname); 500 cFYI(1, "symname is %s", symname);
183 501
184 /* BB what if DFS and this volume is on different share? BB */ 502 /* BB what if DFS and this volume is on different share? BB */
185 if (pTcon->unix_ext) 503 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
504 rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
505 cifs_sb->local_nls,
506 cifs_sb->mnt_cifs_flags &
507 CIFS_MOUNT_MAP_SPECIAL_CHR);
508 else if (pTcon->unix_ext)
186 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, 509 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
187 cifs_sb->local_nls); 510 cifs_sb->local_nls);
188 /* else 511 /* else
@@ -208,8 +531,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
208 d_instantiate(direntry, newinode); 531 d_instantiate(direntry, newinode);
209 } 532 }
210 } 533 }
211 534symlink_exit:
212 kfree(full_path); 535 kfree(full_path);
536 cifs_put_tlink(tlink);
213 FreeXid(xid); 537 FreeXid(xid);
214 return rc; 538 return rc;
215} 539}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1394aa37f26..c4e296fe351 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -347,7 +347,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
347 if (current_fsuid() != treeCon->ses->linux_uid) { 347 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, "Multiuser mode and UID " 348 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid"); 349 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 350 spin_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 353 if (ses->linux_uid == current_fsuid()) {
@@ -361,7 +361,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
361 } 361 }
362 } 362 }
363 } 363 }
364 read_unlock(&cifs_tcp_ses_lock); 364 spin_unlock(&cifs_tcp_ses_lock);
365 } 365 }
366 } 366 }
367 } 367 }
@@ -498,7 +498,6 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
498 struct cifsTconInfo *tcon; 498 struct cifsTconInfo *tcon;
499 struct cifsInodeInfo *pCifsInode; 499 struct cifsInodeInfo *pCifsInode;
500 struct cifsFileInfo *netfile; 500 struct cifsFileInfo *netfile;
501 int rc;
502 501
503 cFYI(1, "Checking for oplock break or dnotify response"); 502 cFYI(1, "Checking for oplock break or dnotify response");
504 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && 503 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
@@ -552,7 +551,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
552 return false; 551 return false;
553 552
554 /* look up tcon based on tid & uid */ 553 /* look up tcon based on tid & uid */
555 read_lock(&cifs_tcp_ses_lock); 554 spin_lock(&cifs_tcp_ses_lock);
556 list_for_each(tmp, &srv->smb_ses_list) { 555 list_for_each(tmp, &srv->smb_ses_list) {
557 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 556 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
558 list_for_each(tmp1, &ses->tcon_list) { 557 list_for_each(tmp1, &ses->tcon_list) {
@@ -561,46 +560,41 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
561 continue; 560 continue;
562 561
563 cifs_stats_inc(&tcon->num_oplock_brks); 562 cifs_stats_inc(&tcon->num_oplock_brks);
564 read_lock(&GlobalSMBSeslock); 563 spin_lock(&cifs_file_list_lock);
565 list_for_each(tmp2, &tcon->openFileList) { 564 list_for_each(tmp2, &tcon->openFileList) {
566 netfile = list_entry(tmp2, struct cifsFileInfo, 565 netfile = list_entry(tmp2, struct cifsFileInfo,
567 tlist); 566 tlist);
568 if (pSMB->Fid != netfile->netfid) 567 if (pSMB->Fid != netfile->netfid)
569 continue; 568 continue;
570 569
571 /*
572 * don't do anything if file is about to be
573 * closed anyway.
574 */
575 if (netfile->closePend) {
576 read_unlock(&GlobalSMBSeslock);
577 read_unlock(&cifs_tcp_ses_lock);
578 return true;
579 }
580
581 cFYI(1, "file id match, oplock break"); 570 cFYI(1, "file id match, oplock break");
582 pCifsInode = CIFS_I(netfile->pInode); 571 pCifsInode = CIFS_I(netfile->dentry->d_inode);
583 pCifsInode->clientCanCacheAll = false; 572 pCifsInode->clientCanCacheAll = false;
584 if (pSMB->OplockLevel == 0) 573 if (pSMB->OplockLevel == 0)
585 pCifsInode->clientCanCacheRead = false; 574 pCifsInode->clientCanCacheRead = false;
586 rc = slow_work_enqueue(&netfile->oplock_break); 575
587 if (rc) { 576 /*
588 cERROR(1, "failed to enqueue oplock " 577 * cifs_oplock_break_put() can't be called
589 "break: %d\n", rc); 578 * from here. Get reference after queueing
590 } else { 579 * succeeded. cifs_oplock_break() will
591 netfile->oplock_break_cancelled = false; 580 * synchronize using cifs_file_list_lock.
592 } 581 */
593 read_unlock(&GlobalSMBSeslock); 582 if (queue_work(system_nrt_wq,
594 read_unlock(&cifs_tcp_ses_lock); 583 &netfile->oplock_break))
584 cifs_oplock_break_get(netfile);
585 netfile->oplock_break_cancelled = false;
586
587 spin_unlock(&cifs_file_list_lock);
588 spin_unlock(&cifs_tcp_ses_lock);
595 return true; 589 return true;
596 } 590 }
597 read_unlock(&GlobalSMBSeslock); 591 spin_unlock(&cifs_file_list_lock);
598 read_unlock(&cifs_tcp_ses_lock); 592 spin_unlock(&cifs_tcp_ses_lock);
599 cFYI(1, "No matching file for oplock break"); 593 cFYI(1, "No matching file for oplock break");
600 return true; 594 return true;
601 } 595 }
602 } 596 }
603 read_unlock(&cifs_tcp_ses_lock); 597 spin_unlock(&cifs_tcp_ses_lock);
604 cFYI(1, "Can not process oplock break for non-existent connection"); 598 cFYI(1, "Can not process oplock break for non-existent connection");
605 return true; 599 return true;
606} 600}
@@ -725,6 +719,6 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
725 "properly. Hardlinks will not be recognized on this " 719 "properly. Hardlinks will not be recognized on this "
726 "mount. Consider mounting with the \"noserverino\" " 720 "mount. Consider mounting with the \"noserverino\" "
727 "option to silence this message.", 721 "option to silence this message.",
728 cifs_sb->tcon->treeName); 722 cifs_sb_master_tcon(cifs_sb)->treeName);
729 } 723 }
730} 724}
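
The oplock-break hunk above drops slow_work for queue_work() and pins the file only when queue_work() reports the item was really queued; if a second break races in while one is pending, no extra reference is taken and the single put in the worker still balances. Modeled with a boolean pending flag:

    #include <stdbool.h>
    #include <stdio.h>

    struct fhandle { int refcount; bool pending; };

    /* models queue_work(): false means the work item was already queued */
    static bool queue_work_once(struct fhandle *f)
    {
        if (f->pending)
            return false;
        f->pending = true;
        return true;
    }

    /* take the reference only on a successful queue, so the worker's
     * single put always balances exactly one get */
    static void oplock_break(struct fhandle *f)
    {
        if (queue_work_once(f))
            f->refcount++;
    }

    int main(void)
    {
        struct fhandle f = { 1, false };
        oplock_break(&f);
        oplock_break(&f);                 /* raced-in duplicate break */
        printf("refs=%d\n", f.refcount);  /* refs=2, not 3 */
        return 0;
    }
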
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d35d52889cb..9aad47a2d62 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -61,6 +61,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
61 {ERRremcd, -EACCES}, 61 {ERRremcd, -EACCES},
62 {ERRdiffdevice, -EXDEV}, 62 {ERRdiffdevice, -EXDEV},
63 {ERRnofiles, -ENOENT}, 63 {ERRnofiles, -ENOENT},
64 {ERRwriteprot, -EROFS},
64 {ERRbadshare, -ETXTBSY}, 65 {ERRbadshare, -ETXTBSY},
65 {ERRlock, -EACCES}, 66 {ERRlock, -EACCES},
66 {ERRunsup, -EINVAL}, 67 {ERRunsup, -EINVAL},
@@ -139,17 +140,18 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
139 * Returns 0 on failure. 140 * Returns 0 on failure.
140 */ 141 */
141static int 142static int
142cifs_inet_pton(const int address_family, const char *cp, void *dst) 143cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
143{ 144{
144 int ret = 0; 145 int ret = 0;
145 146
146 /* calculate length by finding first slash or NULL */ 147 /* calculate length by finding first slash or NULL */
147 if (address_family == AF_INET) 148 if (address_family == AF_INET)
148 ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL); 149 ret = in4_pton(cp, len, dst, '\\', NULL);
149 else if (address_family == AF_INET6) 150 else if (address_family == AF_INET6)
150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); 151 ret = in6_pton(cp, len, dst , '\\', NULL);
151 152
152 cFYI(DBG2, "address conversion returned %d for %s", ret, cp); 153 cFYI(DBG2, "address conversion returned %d for %*.*s",
154 ret, len, len, cp);
153 if (ret > 0) 155 if (ret > 0)
154 ret = 1; 156 ret = 1;
155 return ret; 157 return ret;
@@ -164,43 +166,70 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
164 * Returns 0 on failure. 166 * Returns 0 on failure.
165 */ 167 */
166int 168int
167cifs_convert_address(char *src, void *dst) 169cifs_convert_address(struct sockaddr *dst, const char *src, int len)
168{ 170{
169 int rc; 171 int rc, alen, slen;
170 char *pct, *endp; 172 const char *pct;
173 char *endp, scope_id[13];
171 struct sockaddr_in *s4 = (struct sockaddr_in *) dst; 174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
172 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; 175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
173 176
174 /* IPv4 address */ 177 /* IPv4 address */
175 if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) { 178 if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) {
176 s4->sin_family = AF_INET; 179 s4->sin_family = AF_INET;
177 return 1; 180 return 1;
178 } 181 }
179 182
180 /* temporarily terminate string */ 183 /* attempt to exclude the scope ID from the address part */
181 pct = strchr(src, '%'); 184 pct = memchr(src, '%', len);
182 if (pct) 185 alen = pct ? pct - src : len;
183 *pct = '\0';
184
185 rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr);
186
187 /* repair temp termination (if any) and make pct point to scopeid */
188 if (pct)
189 *pct++ = '%';
190 186
187 rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr);
191 if (!rc) 188 if (!rc)
192 return rc; 189 return rc;
193 190
194 s6->sin6_family = AF_INET6; 191 s6->sin6_family = AF_INET6;
195 if (pct) { 192 if (pct) {
193 /* grab the scope ID */
194 slen = len - (alen + 1);
195 if (slen <= 0 || slen > 12)
196 return 0;
197 memcpy(scope_id, pct + 1, slen);
198 scope_id[slen] = '\0';
199
196 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); 200 s6->sin6_scope_id = (u32) simple_strtoul(scope_id, &endp, 0);
197 if (!*pct || *endp) 201 if (endp != scope_id + slen)
198 return 0; 202 return 0;
199 } 203 }
200 204
201 return rc; 205 return rc;
202} 206}
203 207
208int
209cifs_set_port(struct sockaddr *addr, const unsigned short int port)
210{
211 switch (addr->sa_family) {
212 case AF_INET:
213 ((struct sockaddr_in *)addr)->sin_port = htons(port);
214 break;
215 case AF_INET6:
216 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
217 break;
218 default:
219 return 0;
220 }
221 return 1;
222}
223
224int
225cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
226 const unsigned short int port)
227{
228 if (!cifs_convert_address(dst, src, len))
229 return 0;
230 return cifs_set_port(dst, port);
231}
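
cifs_convert_address() now takes an explicit length, splits the scope ID off at '%' instead of mutating the source string, and insists that the numeric parse consume the entire scope field. The same shape in portable userspace C; inet_pton() stands in for the kernel's in6_pton() and the fixed buffer is an arbitrary simplification:

    #include <arpa/inet.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* split "addr%scope" on '%', convert the address part, then parse the
     * scope ID separately and require full consumption of the digits */
    static int parse_v6(const char *src, struct sockaddr_in6 *s6)
    {
        char buf[64], *pct, *endp;

        if (strlen(src) >= sizeof(buf))
            return 0;
        strcpy(buf, src);
        pct = strchr(buf, '%');
        if (pct)
            *pct++ = '\0';
        if (inet_pton(AF_INET6, buf, &s6->sin6_addr) != 1)
            return 0;
        s6->sin6_family = AF_INET6;
        if (pct) {
            s6->sin6_scope_id = (unsigned) strtoul(pct, &endp, 0);
            if (!*pct || *endp)          /* empty or trailing junk */
                return 0;
        }
        return 1;
    }

    int main(void)
    {
        struct sockaddr_in6 s6;
        printf("%d scope=%u\n", parse_v6("fe80::1%2", &s6), s6.sin6_scope_id);
        return 0;
    }
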
232
204/***************************************************************************** 233/*****************************************************************************
205convert a NT status code to a dos class/code 234convert a NT status code to a dos class/code
206 *****************************************************************************/ 235 *****************************************************************************/
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e7531..5d52e4a3b1e 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,21 @@
61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
62#define NTLMSSP_NEGOTIATE_56 0x80000000 62#define NTLMSSP_NEGOTIATE_56 0x80000000
63 63
64/* Define AV Pair Field IDs */
65enum av_field_type {
66 NTLMSSP_AV_EOL = 0,
67 NTLMSSP_AV_NB_COMPUTER_NAME,
68 NTLMSSP_AV_NB_DOMAIN_NAME,
69 NTLMSSP_AV_DNS_COMPUTER_NAME,
70 NTLMSSP_AV_DNS_DOMAIN_NAME,
71 NTLMSSP_AV_DNS_TREE_NAME,
72 NTLMSSP_AV_FLAGS,
73 NTLMSSP_AV_TIMESTAMP,
74 NTLMSSP_AV_RESTRICTION,
75 NTLMSSP_AV_TARGET_NAME,
76 NTLMSSP_AV_CHANNEL_BINDINGS
77};
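
On the wire these AV pairs form the challenge's target info block: little-endian TLVs of u16 type and u16 length followed by the value, terminated by NTLMSSP_AV_EOL. A hedged parser sketch; it assumes a little-endian host, which real code must not:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct av_pair { uint16_t type; uint16_t len; };

    static void walk_av_pairs(const uint8_t *blob, size_t blob_len)
    {
        size_t off = 0;

        while (off + sizeof(struct av_pair) <= blob_len) {
            struct av_pair av;

            memcpy(&av, blob + off, sizeof(av));  /* unaligned-safe read */
            off += sizeof(av);
            if (av.type == 0)                     /* NTLMSSP_AV_EOL */
                break;
            if (av.len > blob_len - off)          /* truncated value */
                break;
            printf("av type=%u len=%u\n", av.type, av.len);
            off += av.len;
        }
    }

    int main(void)
    {
        /* one NB_DOMAIN_NAME pair ("AB"), then EOL */
        const uint8_t blob[] = { 2, 0, 2, 0, 'A', 'B', 0, 0, 0, 0 };
        walk_av_pairs(blob, sizeof(blob));
        return 0;
    }
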
78
64/* Although typedefs are not commonly used for structure definitions */ 79/* Although typedefs are not commonly used for structure definitions */
65/* in the Linux kernel, in this particular case they are useful */ 80/* in the Linux kernel, in this particular case they are useful */
66/* to more closely match the standards document for NTLMSSP from */ 81/* to more closely match the standards document for NTLMSSP from */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index daf1753af67..ef7bb7b50f5 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,7 +102,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
102 return NULL; 102 return NULL;
103 } 103 }
104 104
105 if (CIFS_SB(sb)->tcon->nocase) 105 if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
106 dentry->d_op = &cifs_ci_dentry_ops; 106 dentry->d_op = &cifs_ci_dentry_ops;
107 else 107 else
108 dentry->d_op = &cifs_dentry_ops; 108 dentry->d_op = &cifs_dentry_ops;
@@ -171,7 +171,7 @@ static void
171cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, 171cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
172 struct cifs_sb_info *cifs_sb) 172 struct cifs_sb_info *cifs_sb)
173{ 173{
174 int offset = cifs_sb->tcon->ses->server->timeAdj; 174 int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj;
175 175
176 memset(fattr, 0, sizeof(*fattr)); 176 memset(fattr, 0, sizeof(*fattr));
177 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate, 177 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
@@ -199,7 +199,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
199 int len; 199 int len;
200 int oplock = 0; 200 int oplock = 0;
201 int rc; 201 int rc;
202 struct cifsTconInfo *ptcon = cifs_sb->tcon; 202 struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
203 char *tmpbuffer; 203 char *tmpbuffer;
204 204
205 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, 205 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -223,34 +223,35 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
223static int initiate_cifs_search(const int xid, struct file *file) 223static int initiate_cifs_search(const int xid, struct file *file)
224{ 224{
225 int rc = 0; 225 int rc = 0;
226 char *full_path; 226 char *full_path = NULL;
227 struct cifsFileInfo *cifsFile; 227 struct cifsFileInfo *cifsFile;
228 struct cifs_sb_info *cifs_sb; 228 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
229 struct tcon_link *tlink;
229 struct cifsTconInfo *pTcon; 230 struct cifsTconInfo *pTcon;
230 231
231 if (file->private_data == NULL) { 232 tlink = cifs_sb_tlink(cifs_sb);
233 if (IS_ERR(tlink))
234 return PTR_ERR(tlink);
235 pTcon = tlink_tcon(tlink);
236
237 if (file->private_data == NULL)
232 file->private_data = 238 file->private_data =
233 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 239 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
240 if (file->private_data == NULL) {
241 rc = -ENOMEM;
242 goto error_exit;
234 } 243 }
235 244
236 if (file->private_data == NULL)
237 return -ENOMEM;
238 cifsFile = file->private_data; 245 cifsFile = file->private_data;
239 cifsFile->invalidHandle = true; 246 cifsFile->invalidHandle = true;
240 cifsFile->srch_inf.endOfSearch = false; 247 cifsFile->srch_inf.endOfSearch = false;
241 248 cifsFile->tlink = cifs_get_tlink(tlink);
242 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
243 if (cifs_sb == NULL)
244 return -EINVAL;
245
246 pTcon = cifs_sb->tcon;
247 if (pTcon == NULL)
248 return -EINVAL;
249 249
250 full_path = build_path_from_dentry(file->f_path.dentry); 250 full_path = build_path_from_dentry(file->f_path.dentry);
251 251 if (full_path == NULL) {
252 if (full_path == NULL) 252 rc = -ENOMEM;
253 return -ENOMEM; 253 goto error_exit;
254 }
254 255
255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos); 256 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
256 257
@@ -283,7 +284,9 @@ ffirst_retry:
283 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 284 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
284 goto ffirst_retry; 285 goto ffirst_retry;
285 } 286 }
287error_exit:
286 kfree(full_path); 288 kfree(full_path);
289 cifs_put_tlink(tlink);
287 return rc; 290 return rc;
288} 291}
289 292
@@ -525,14 +528,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
525 (index_to_find < first_entry_in_buffer)) { 528 (index_to_find < first_entry_in_buffer)) {
526 /* close and restart search */ 529 /* close and restart search */
527 cFYI(1, "search backing up - close and restart search"); 530 cFYI(1, "search backing up - close and restart search");
528 write_lock(&GlobalSMBSeslock); 531 spin_lock(&cifs_file_list_lock);
529 if (!cifsFile->srch_inf.endOfSearch && 532 if (!cifsFile->srch_inf.endOfSearch &&
530 !cifsFile->invalidHandle) { 533 !cifsFile->invalidHandle) {
531 cifsFile->invalidHandle = true; 534 cifsFile->invalidHandle = true;
532 write_unlock(&GlobalSMBSeslock); 535 spin_unlock(&cifs_file_list_lock);
533 CIFSFindClose(xid, pTcon, cifsFile->netfid); 536 CIFSFindClose(xid, pTcon, cifsFile->netfid);
534 } else 537 } else
535 write_unlock(&GlobalSMBSeslock); 538 spin_unlock(&cifs_file_list_lock);
536 if (cifsFile->srch_inf.ntwrk_buf_start) { 539 if (cifsFile->srch_inf.ntwrk_buf_start) {
537 cFYI(1, "freeing SMB ff cache buf on search rewind"); 540 cFYI(1, "freeing SMB ff cache buf on search rewind");
538 if (cifsFile->srch_inf.smallBuf) 541 if (cifsFile->srch_inf.smallBuf)
@@ -738,6 +741,15 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
738 cifs_autodisable_serverino(cifs_sb); 741 cifs_autodisable_serverino(cifs_sb);
739 } 742 }
740 743
744 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
745 CIFSCouldBeMFSymlink(&fattr))
746 /*
747 * trying to get the type and mode can be slow,
748 * so just treat those as regular files for now, and mark
749 * them for reval
750 */
751 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
752
741 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 753 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
742 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); 754 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
743 755
@@ -777,9 +789,17 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
777 xid = GetXid(); 789 xid = GetXid();
778 790
779 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 791 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
780 pTcon = cifs_sb->tcon; 792
781 if (pTcon == NULL) 793 /*
782 return -EINVAL; 794 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
795 * '..'. Otherwise we won't be able to notify VFS in case of failure.
796 */
797 if (file->private_data == NULL) {
798 rc = initiate_cifs_search(xid, file);
799 cFYI(1, "initiate cifs search rc %d", rc);
800 if (rc)
801 goto rddir2_exit;
802 }
783 803
784 switch ((int) file->f_pos) { 804 switch ((int) file->f_pos) {
785 case 0: 805 case 0:
@@ -805,14 +825,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
805 if after then keep searching till find it */ 825 if after then keep searching till find it */
806 826
807 if (file->private_data == NULL) { 827 if (file->private_data == NULL) {
808 rc = initiate_cifs_search(xid, file);
809 cFYI(1, "initiate cifs search rc %d", rc);
810 if (rc) {
811 FreeXid(xid);
812 return rc;
813 }
814 }
815 if (file->private_data == NULL) {
816 rc = -EINVAL; 828 rc = -EINVAL;
817 FreeXid(xid); 829 FreeXid(xid);
818 return rc; 830 return rc;
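
Moving initiate_cifs_search() ahead of the '.'/'..' entries means a failed FindFirst is reported to the VFS before anything has been emitted. The control-flow change in miniature:

    #include <errno.h>
    #include <stdio.h>

    static int find_first(int fail) { return fail ? -ENOENT : 0; }

    /* issue FindFirst before emitting "." and "..", so a server-side
     * failure reaches the caller instead of surfacing after two entries
     * were already returned */
    static int readdir_start(int fail)
    {
        int rc = find_first(fail);
        if (rc)
            return rc;      /* VFS sees the error immediately */
        printf(".\n..\n");  /* only now emit the synthetic entries */
        return 0;
    }

    int main(void)
    {
        printf("rc=%d\n", readdir_start(1));  /* rc=-2, nothing emitted */
        return readdir_start(0) ? 1 : 0;
    }
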
@@ -829,6 +841,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
829 CIFSFindClose(xid, pTcon, cifsFile->netfid); 841 CIFSFindClose(xid, pTcon, cifsFile->netfid);
830 } */ 842 } */
831 843
844 pTcon = tlink_tcon(cifsFile->tlink);
832 rc = find_cifs_entry(xid, pTcon, file, 845 rc = find_cifs_entry(xid, pTcon, file,
833 &current_entry, &num_to_fill); 846 &current_entry, &num_to_fill);
834 if (rc) { 847 if (rc) {
@@ -847,6 +860,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
847 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; 860 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
848 861
849 tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); 862 tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
863 if (tmp_buf == NULL) {
864 rc = -ENOMEM;
865 break;
866 }
867
850 for (i = 0; (i < num_to_fill) && (rc == 0); i++) { 868 for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
851 if (current_entry == NULL) { 869 if (current_entry == NULL) {
852 /* evaluate whether this case is an error */ 870 /* evaluate whether this case is an error */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5d..7b01d3f6eed 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,9 +32,6 @@
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include "cifs_spnego.h" 33#include "cifs_spnego.h"
34 34
35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
36 unsigned char *p24);
37
38/* 35/*
39 * Checks if this is the first smb session to be reconnected after 36 * Checks if this is the first smb session to be reconnected after
40 * the socket has been reestablished (so we know whether to use vc 0). 37 * the socket has been reestablished (so we know whether to use vc 0).
@@ -80,7 +77,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
80 if (max_vcs < 2) 77 if (max_vcs < 2)
81 max_vcs = 0xFFFF; 78 max_vcs = 0xFFFF;
82 79
83 write_lock(&cifs_tcp_ses_lock); 80 spin_lock(&cifs_tcp_ses_lock);
84 if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) 81 if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
85 goto get_vc_num_exit; /* vcnum will be zero */ 82 goto get_vc_num_exit; /* vcnum will be zero */
86 for (i = ses->server->srv_count - 1; i < max_vcs; i++) { 83 for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
@@ -112,7 +109,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
112 vcnum = i; 109 vcnum = i;
113 ses->vcnum = vcnum; 110 ses->vcnum = vcnum;
114get_vc_num_exit: 111get_vc_num_exit:
115 write_unlock(&cifs_tcp_ses_lock); 112 spin_unlock(&cifs_tcp_ses_lock);
116 113
117 return cpu_to_le16(vcnum); 114 return cpu_to_le16(vcnum);
118} 115}
@@ -383,6 +380,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
383static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, 380static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
384 struct cifsSesInfo *ses) 381 struct cifsSesInfo *ses)
385{ 382{
383 unsigned int tioffset; /* challenge message target info area */
384 unsigned int tilen; /* challenge message target info area length */
385
386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
387 387
388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -399,11 +399,23 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
399 return -EINVAL; 399 return -EINVAL;
400 } 400 }
401 401
402 memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); 402 memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
403 /* BB we could decode pblob->NegotiateFlags; some may be useful */ 403 /* BB we could decode pblob->NegotiateFlags; some may be useful */
404 /* In particular we can examine sign flags */ 404 /* In particular we can examine sign flags */
405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then 405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
406 we must set the MIC field of the AUTHENTICATE_MESSAGE */ 406 we must set the MIC field of the AUTHENTICATE_MESSAGE */
407 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
 408 tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
 409 tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
410 if (tilen) {
411 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
412 if (!ses->auth_key.response) {
413 cERROR(1, "Challenge target info allocation failure");
414 return -ENOMEM;
415 }
416 memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
417 ses->auth_key.len = tilen;
418 }
407 419
408 return 0; 420 return 0;
409} 421}
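
decode_ntlmssp_challenge() now keeps a copy of the target info block, located by (offset, length) inside the security blob, for the NTLMv2 computation that follows. Note the hunk itself never range-checks tioffset/tilen against blob_len; the sketch below adds that check, which production code needs too:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* stash a bounds-checked copy of the (offset, length) region */
    static uint8_t *copy_target_info(const uint8_t *blob, size_t blob_len,
                                     uint32_t off, uint16_t len)
    {
        uint8_t *ti;

        if (!len || off > blob_len || len > blob_len - off)
            return NULL;
        ti = malloc(len);
        if (ti)
            memcpy(ti, blob + off, len);
        return ti;
    }

    int main(void)
    {
        const uint8_t blob[] = "....TARGETINFO";
        uint8_t *ti = copy_target_info(blob, sizeof(blob) - 1, 4, 10);
        printf("%.10s\n", ti ? (char *)ti : "(null)");
        free(ti);
        return 0;
    }
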
@@ -425,12 +437,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
425 /* BB is NTLMV2 session security format easier to use here? */ 437 /* BB is NTLMV2 session security format easier to use here? */
426 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 438 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
427 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 439 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
428 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 440 NTLMSSP_NEGOTIATE_NTLM;
429 if (ses->server->secMode & 441 if (ses->server->secMode &
430 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 442 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
431 flags |= NTLMSSP_NEGOTIATE_SIGN; 443 flags |= NTLMSSP_NEGOTIATE_SIGN;
432 if (ses->server->secMode & SECMODE_SIGN_REQUIRED) 444 if (!ses->server->session_estab)
433 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 445 flags |= NTLMSSP_NEGOTIATE_KEY_XCH |
446 NTLMSSP_NEGOTIATE_EXTENDED_SEC;
447 }
434 448
435 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 449 sec_blob->NegotiateFlags |= cpu_to_le32(flags);
436 450
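
The negotiate-flags change drops NTLMSSP_NEGOTIATE_NT_ONLY and, when signing is in play on a not-yet-established session, additionally requests key exchange and extended session security. The selection logic isolated; the bit values here are illustrative, not the protocol's:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NEG_SIGN  0x00000010u   /* illustrative bit values */
    #define NEG_KXCH  0x40000000u
    #define NEG_ESEC  0x00080000u

    /* request signing whenever the server enables or requires it, and ask
     * for key exchange + extended session security only while no session
     * is established yet */
    static uint32_t pick_flags(bool sign, bool session_estab)
    {
        uint32_t flags = 0;

        if (sign) {
            flags |= NEG_SIGN;
            if (!session_estab)
                flags |= NEG_KXCH | NEG_ESEC;
        }
        return flags;
    }

    int main(void)
    {
        printf("%#x\n", (unsigned) pick_flags(true, false));
        return 0;
    }
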
@@ -448,13 +462,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
448 maximum possible size is fixed and small, making this approach cleaner. 462 maximum possible size is fixed and small, making this approach cleaner.
449 This function returns the length of the data in the blob */ 463 This function returns the length of the data in the blob */
450static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 464static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
465 u16 *buflen,
451 struct cifsSesInfo *ses, 466 struct cifsSesInfo *ses,
452 const struct nls_table *nls_cp, bool first) 467 const struct nls_table *nls_cp)
453{ 468{
469 int rc;
454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 470 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
455 __u32 flags; 471 __u32 flags;
456 unsigned char *tmp; 472 unsigned char *tmp;
457 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
458 473
459 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 474 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
460 sec_blob->MessageType = NtLmAuthenticate; 475 sec_blob->MessageType = NtLmAuthenticate;
@@ -462,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
462 flags = NTLMSSP_NEGOTIATE_56 | 477 flags = NTLMSSP_NEGOTIATE_56 |
463 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | 478 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
464 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 479 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
465 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 480 NTLMSSP_NEGOTIATE_NTLM;
466 if (ses->server->secMode & 481 if (ses->server->secMode &
467 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 482 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
468 flags |= NTLMSSP_NEGOTIATE_SIGN; 483 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -477,19 +492,20 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
477 sec_blob->LmChallengeResponse.Length = 0; 492 sec_blob->LmChallengeResponse.Length = 0;
478 sec_blob->LmChallengeResponse.MaximumLength = 0; 493 sec_blob->LmChallengeResponse.MaximumLength = 0;
479 494
480 /* calculate session key, BB what about adding similar ntlmv2 path? */
481 SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
482 if (first)
483 cifs_calculate_mac_key(&ses->server->mac_signing_key,
484 ntlm_session_key, ses->password);
485
486 memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
487 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); 495 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
488 sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE); 496 rc = setup_ntlmv2_rsp(ses, nls_cp);
489 sec_blob->NtChallengeResponse.MaximumLength = 497 if (rc) {
490 cpu_to_le16(CIFS_SESS_KEY_SIZE); 498 cERROR(1, "Error %d during NTLMSSP authentication", rc);
499 goto setup_ntlmv2_ret;
500 }
501 memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
502 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
503 tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
491 504
492 tmp += CIFS_SESS_KEY_SIZE; 505 sec_blob->NtChallengeResponse.Length =
506 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
507 sec_blob->NtChallengeResponse.MaximumLength =
508 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
493 509
494 if (ses->domainName == NULL) { 510 if (ses->domainName == NULL) {
495 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 511 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +517,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
501 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, 517 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
502 MAX_USERNAME_SIZE, nls_cp); 518 MAX_USERNAME_SIZE, nls_cp);
503 len *= 2; /* unicode is 2 bytes each */ 519 len *= 2; /* unicode is 2 bytes each */
504 len += 2; /* trailing null */
505 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 520 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
506 sec_blob->DomainName.Length = cpu_to_le16(len); 521 sec_blob->DomainName.Length = cpu_to_le16(len);
507 sec_blob->DomainName.MaximumLength = cpu_to_le16(len); 522 sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +533,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
518 len = cifs_strtoUCS((__le16 *)tmp, ses->userName, 533 len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
519 MAX_USERNAME_SIZE, nls_cp); 534 MAX_USERNAME_SIZE, nls_cp);
520 len *= 2; /* unicode is 2 bytes each */ 535 len *= 2; /* unicode is 2 bytes each */
521 len += 2; /* trailing null */
522 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 536 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
523 sec_blob->UserName.Length = cpu_to_le16(len); 537 sec_blob->UserName.Length = cpu_to_le16(len);
524 sec_blob->UserName.MaximumLength = cpu_to_le16(len); 538 sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,10 +544,23 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
530 sec_blob->WorkstationName.MaximumLength = 0; 544 sec_blob->WorkstationName.MaximumLength = 0;
531 tmp += 2; 545 tmp += 2;
532 546
533 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); 547 if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
534 sec_blob->SessionKey.Length = 0; 548 !calc_seckey(ses)) {
535 sec_blob->SessionKey.MaximumLength = 0; 549 memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
536 return tmp - pbuffer; 550 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
551 sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
552 sec_blob->SessionKey.MaximumLength =
553 cpu_to_le16(CIFS_CPHTXT_SIZE);
554 tmp += CIFS_CPHTXT_SIZE;
555 } else {
556 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
557 sec_blob->SessionKey.Length = 0;
558 sec_blob->SessionKey.MaximumLength = 0;
559 }
560
561setup_ntlmv2_ret:
562 *buflen = tmp - pbuffer;
563 return rc;
537} 564}
538 565
539 566
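The offset/length bookkeeping in build_ntlmssp_auth_blob() follows the usual NTLMSSP security-buffer pattern: each variable-length field is described by a {Length, MaximumLength, BufferOffset} triple while the payload bytes accumulate at a write cursor. A minimal sketch of that pattern in isolation (struct layout per MS-NLMP; the helper name is an assumption for illustration):

#include <endian.h>
#include <stdint.h>
#include <string.h>

/* Little-endian descriptor for one variable-length message field. */
struct security_buffer {
	uint16_t length;
	uint16_t maximum_length;
	uint32_t buffer_offset;		/* from the start of the message */
};

/* Append payload at the write cursor and describe it in *sb. */
static uint8_t *append_sec_buf(struct security_buffer *sb, uint8_t *base,
			       uint8_t *cursor, const void *payload,
			       uint16_t len)
{
	sb->buffer_offset = htole32((uint32_t)(cursor - base));
	sb->length = htole16(len);
	sb->maximum_length = htole16(len);
	memcpy(cursor, payload, len);
	return cursor + len;		/* new write cursor */
}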
@@ -545,19 +572,6 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
545 572
546 return; 573 return;
547} 574}
548
549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
550 struct cifsSesInfo *ses,
551 const struct nls_table *nls, bool first_time)
552{
553 int bloblen;
554
555 bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
556 first_time);
557 pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
558
559 return bloblen;
560}
561#endif 575#endif
562 576
563int 577int
@@ -579,18 +593,23 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
579 int bytes_remaining; 593 int bytes_remaining;
580 struct key *spnego_key = NULL; 594 struct key *spnego_key = NULL;
581 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 595 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
582 bool first_time; 596 u16 blob_len;
597 char *ntlmsspblob = NULL;
583 598
584 if (ses == NULL) 599 if (ses == NULL)
585 return -EINVAL; 600 return -EINVAL;
586 601
587 read_lock(&cifs_tcp_ses_lock);
588 first_time = is_first_ses_reconnect(ses);
589 read_unlock(&cifs_tcp_ses_lock);
590
591 type = ses->server->secType; 602 type = ses->server->secType;
592
593 cFYI(1, "sess setup type %d", type); 603 cFYI(1, "sess setup type %d", type);
604 if (type == RawNTLMSSP) {
605	/* if the memory allocation is successful, the caller of this
606	 * function frees it.
607	 */
608 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
609 if (!ses->ntlmssp)
610 return -ENOMEM;
611 }
612
594ssetup_ntlmssp_authenticate: 613ssetup_ntlmssp_authenticate:
595 if (phase == NtLmChallenge) 614 if (phase == NtLmChallenge)
596 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 615 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -655,10 +674,14 @@ ssetup_ntlmssp_authenticate:
655 /* no capabilities flags in old lanman negotiation */ 674 /* no capabilities flags in old lanman negotiation */
656 675
657 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 676 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
658 /* BB calculate hash with password */
659 /* and copy into bcc */
660 677
661 calc_lanman_hash(ses->password, ses->server->cryptKey, 678 /* Calculate hash with password and copy into bcc_ptr.
679	 * The Encryption Key (stored in cryptkey) is used if the
680	 * security mode bit in the Negotiate Protocol response
681	 * indicates the challenge/response method (i.e. Password bit is 1).
682 */
683
684 calc_lanman_hash(ses->password, ses->server->cryptkey,
662 ses->server->secMode & SECMODE_PW_ENCRYPT ? 685 ses->server->secMode & SECMODE_PW_ENCRYPT ?
663 true : false, lnm_session_key); 686 true : false, lnm_session_key);
664 687
@@ -676,28 +699,27 @@ ssetup_ntlmssp_authenticate:
676 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 699 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
677#endif 700#endif
678 } else if (type == NTLM) { 701 } else if (type == NTLM) {
679 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
680
681 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 702 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
682 pSMB->req_no_secext.CaseInsensitivePasswordLength = 703 pSMB->req_no_secext.CaseInsensitivePasswordLength =
683 cpu_to_le16(CIFS_SESS_KEY_SIZE); 704 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
684 pSMB->req_no_secext.CaseSensitivePasswordLength = 705 pSMB->req_no_secext.CaseSensitivePasswordLength =
685 cpu_to_le16(CIFS_SESS_KEY_SIZE); 706 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
686 707
687 /* calculate session key */ 708 /* calculate ntlm response and session key */
688 SMBNTencrypt(ses->password, ses->server->cryptKey, 709 rc = setup_ntlm_response(ses);
689 ntlm_session_key); 710 if (rc) {
711 cERROR(1, "Error %d during NTLM authentication", rc);
712 goto ssetup_exit;
713 }
690 714
691 if (first_time) /* should this be moved into common code 715 /* copy ntlm response */
692 with similar ntlmv2 path? */ 716 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
693 cifs_calculate_mac_key(&ses->server->mac_signing_key, 717 CIFS_AUTH_RESP_SIZE);
694 ntlm_session_key, ses->password); 718 bcc_ptr += CIFS_AUTH_RESP_SIZE;
695 /* copy session key */ 719 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
720 CIFS_AUTH_RESP_SIZE);
721 bcc_ptr += CIFS_AUTH_RESP_SIZE;
696 722
697 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
698 bcc_ptr += CIFS_SESS_KEY_SIZE;
699 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
700 bcc_ptr += CIFS_SESS_KEY_SIZE;
701 if (ses->capabilities & CAP_UNICODE) { 723 if (ses->capabilities & CAP_UNICODE) {
702 /* unicode strings must be word aligned */ 724 /* unicode strings must be word aligned */
703 if (iov[0].iov_len % 2) { 725 if (iov[0].iov_len % 2) {
@@ -708,33 +730,27 @@ ssetup_ntlmssp_authenticate:
708 } else 730 } else
709 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 731 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
710 } else if (type == NTLMv2) { 732 } else if (type == NTLMv2) {
711 char *v2_sess_key =
712 kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
713
714 /* BB FIXME change all users of v2_sess_key to
715 struct ntlmv2_resp */
716
717 if (v2_sess_key == NULL) {
718 rc = -ENOMEM;
719 goto ssetup_exit;
720 }
721
722 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 733 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
723 734
724 /* LM2 password would be here if we supported it */ 735 /* LM2 password would be here if we supported it */
725 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; 736 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
726 /* cpu_to_le16(LM2_SESS_KEY_SIZE); */
727 737
738	/* calculate ntlmv2 response and session key */
739 rc = setup_ntlmv2_rsp(ses, nls_cp);
740 if (rc) {
741 cERROR(1, "Error %d during NTLMv2 authentication", rc);
742 goto ssetup_exit;
743 }
744 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
745 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
746 bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
747
748	/* set the case sensitive password length after tilen has had a
749	 * chance to be assigned; tilen is 0 otherwise.
750	 */
728 pSMB->req_no_secext.CaseSensitivePasswordLength = 751 pSMB->req_no_secext.CaseSensitivePasswordLength =
729 cpu_to_le16(sizeof(struct ntlmv2_resp)); 752 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
730 753
731 /* calculate session key */
732 setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
733 /* FIXME: calculate MAC key */
734 memcpy(bcc_ptr, (char *)v2_sess_key,
735 sizeof(struct ntlmv2_resp));
736 bcc_ptr += sizeof(struct ntlmv2_resp);
737 kfree(v2_sess_key);
738 if (ses->capabilities & CAP_UNICODE) { 754 if (ses->capabilities & CAP_UNICODE) {
739 if (iov[0].iov_len % 2) { 755 if (iov[0].iov_len % 2) {
740 *bcc_ptr = 0; 756 *bcc_ptr = 0;
@@ -746,6 +762,7 @@ ssetup_ntlmssp_authenticate:
746 } else if (type == Kerberos) { 762 } else if (type == Kerberos) {
747#ifdef CONFIG_CIFS_UPCALL 763#ifdef CONFIG_CIFS_UPCALL
748 struct cifs_spnego_msg *msg; 764 struct cifs_spnego_msg *msg;
765
749 spnego_key = cifs_get_spnego_key(ses); 766 spnego_key = cifs_get_spnego_key(ses);
750 if (IS_ERR(spnego_key)) { 767 if (IS_ERR(spnego_key)) {
751 rc = PTR_ERR(spnego_key); 768 rc = PTR_ERR(spnego_key);
@@ -763,19 +780,17 @@ ssetup_ntlmssp_authenticate:
763 rc = -EKEYREJECTED; 780 rc = -EKEYREJECTED;
764 goto ssetup_exit; 781 goto ssetup_exit;
765 } 782 }
766 /* bail out if key is too long */ 783
767 if (msg->sesskey_len > 784 ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 785 if (!ses->auth_key.response) {
769 cERROR(1, "Kerberos signing key too long (%u bytes)", 786 cERROR(1, "Kerberos can't allocate (%u bytes) memory",
770 msg->sesskey_len); 787 msg->sesskey_len);
771 rc = -EOVERFLOW; 788 rc = -ENOMEM;
772 goto ssetup_exit; 789 goto ssetup_exit;
773 } 790 }
774 if (first_time) { 791 memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
775 ses->server->mac_signing_key.len = msg->sesskey_len; 792 ses->auth_key.len = msg->sesskey_len;
776 memcpy(ses->server->mac_signing_key.data.krb5, 793
777 msg->data, msg->sesskey_len);
778 }
779 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 794 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
780 capabilities |= CAP_EXTENDED_SECURITY; 795 capabilities |= CAP_EXTENDED_SECURITY;
781 pSMB->req.Capabilities = cpu_to_le32(capabilities); 796 pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -815,12 +830,30 @@ ssetup_ntlmssp_authenticate:
815 if (phase == NtLmNegotiate) { 830 if (phase == NtLmNegotiate) {
816 setup_ntlmssp_neg_req(pSMB, ses); 831 setup_ntlmssp_neg_req(pSMB, ses);
817 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); 832 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
818 } else if (phase == NtLmAuthenticate) { 834 } else if (phase == NtLmAuthenticate) {
819 int blob_len; 835 /* 5 is an empirical value, large enough to
820 blob_len = setup_ntlmssp_auth_req(pSMB, ses, 836 * hold the authenticate message: max 10
821 nls_cp, 837 * av pairs, domain, user, and workstation names,
822 first_time); 838 * flags, etc.
839 */
840 ntlmsspblob = kmalloc(
841 5*sizeof(struct _AUTHENTICATE_MESSAGE),
842 GFP_KERNEL);
843 if (!ntlmsspblob) {
844 cERROR(1, "Can't allocate NTLMSSP");
845 rc = -ENOMEM;
846 goto ssetup_exit;
847 }
848
849 rc = build_ntlmssp_auth_blob(ntlmsspblob,
850 &blob_len, ses, nls_cp);
851 if (rc)
852 goto ssetup_exit;
823 iov[1].iov_len = blob_len; 853 iov[1].iov_len = blob_len;
854 iov[1].iov_base = ntlmsspblob;
855 pSMB->req.SecurityBlobLength =
856 cpu_to_le16(blob_len);
824 /* Make sure that we tell the server that we 857 /* Make sure that we tell the server that we
825 are using the uid that it just gave us back 858 are using the uid that it just gave us back
826 on the response (challenge) */ 859 on the response (challenge) */
@@ -830,7 +863,6 @@ ssetup_ntlmssp_authenticate:
830 rc = -ENOSYS; 863 rc = -ENOSYS;
831 goto ssetup_exit; 864 goto ssetup_exit;
832 } 865 }
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
834 /* unicode strings must be word aligned */ 866 /* unicode strings must be word aligned */
835 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 867 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
836 *bcc_ptr = 0; 868 *bcc_ptr = 0;
@@ -861,8 +893,6 @@ ssetup_ntlmssp_authenticate:
861 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); 893 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
862 /* SMB request buf freed in SendReceive2 */ 894 /* SMB request buf freed in SendReceive2 */
863 895
864 cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
865
866 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 896 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
867 smb_buf = (struct smb_hdr *)iov[0].iov_base; 897 smb_buf = (struct smb_hdr *)iov[0].iov_base;
868 898
@@ -895,7 +925,6 @@ ssetup_ntlmssp_authenticate:
895 bcc_ptr = pByteArea(smb_buf); 925 bcc_ptr = pByteArea(smb_buf);
896 926
897 if (smb_buf->WordCount == 4) { 927 if (smb_buf->WordCount == 4) {
898 __u16 blob_len;
899 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 928 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
900 if (blob_len > bytes_remaining) { 929 if (blob_len > bytes_remaining) {
901 cERROR(1, "bad security blob length %d", blob_len); 930 cERROR(1, "bad security blob length %d", blob_len);
@@ -931,6 +960,8 @@ ssetup_exit:
931 key_put(spnego_key); 960 key_put(spnego_key);
932 } 961 }
933 kfree(str_area); 962 kfree(str_area);
963 kfree(ntlmsspblob);
964 ntlmsspblob = NULL;
934 if (resp_buf_type == CIFS_SMALL_BUFFER) { 965 if (resp_buf_type == CIFS_SMALL_BUFFER) {
935 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base); 966 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
936 cifs_small_buf_release(iov[0].iov_base); 967 cifs_small_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index c5084d27db7..7f16cb825fe 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -76,6 +76,7 @@
76#define ERRnofiles 18 /* A File Search command can find no 76#define ERRnofiles 18 /* A File Search command can find no
77 more files matching the specified 77 more files matching the specified
78 criteria. */ 78 criteria. */
79#define ERRwriteprot 19 /* media is write protected */
79#define ERRgeneral 31 80#define ERRgeneral 31
80#define ERRbadshare 32 /* The sharing mode specified for an 81#define ERRbadshare 32 /* The sharing mode specified for an
81 Open conflicts with existing FIDs on 82 Open conflicts with existing FIDs on
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d697..e0588cdf4cc 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
544 SECMODE_SIGN_ENABLED))) { 544 SECMODE_SIGN_ENABLED))) {
545 rc = cifs_verify_signature(midQ->resp_buf, 545 rc = cifs_verify_signature(midQ->resp_buf,
546 &ses->server->mac_signing_key, 546 ses->server,
547 midQ->sequence_number+1); 547 midQ->sequence_number+1);
548 if (rc) { 548 if (rc) {
549 cERROR(1, "Unexpected SMB signature"); 549 cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
732 SECMODE_SIGN_ENABLED))) { 732 SECMODE_SIGN_ENABLED))) {
733 rc = cifs_verify_signature(out_buf, 733 rc = cifs_verify_signature(out_buf,
734 &ses->server->mac_signing_key, 734 ses->server,
735 midQ->sequence_number+1); 735 midQ->sequence_number+1);
736 if (rc) { 736 if (rc) {
737 cERROR(1, "Unexpected SMB signature"); 737 cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
982 SECMODE_SIGN_ENABLED))) { 982 SECMODE_SIGN_ENABLED))) {
983 rc = cifs_verify_signature(out_buf, 983 rc = cifs_verify_signature(out_buf,
984 &ses->server->mac_signing_key, 984 ses->server,
985 midQ->sequence_number+1); 985 midQ->sequence_number+1);
986 if (rc) { 986 if (rc) {
987 cERROR(1, "Unexpected SMB signature"); 987 cERROR(1, "Unexpected SMB signature");
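All three call sites now hand cifs_verify_signature() the server struct so it can reach the per-session key rather than a standalone mac_signing_key. Conceptually the check recomputes the expected MAC over the received packet and compares. A hedged userspace-style sketch of that shape; md5() here is a stub standing in for a real digest, and the layout details are illustrative rather than the kernel implementation:

#include <endian.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define SMB_SIG_SIZE 8

/* Stub digest for the sketch; substitute a real MD5 implementation. */
static void md5(const void *data, size_t len, uint8_t out[16])
{
	(void)data; (void)len;
	memset(out, 0, 16);
}

/*
 * Verify an SMB1-style MAC: save the received signature, overwrite the
 * signature field with the expected sequence number, then compare the
 * first 8 bytes of MD5(session_key || packet) against what was saved.
 */
static int verify_signature(uint8_t *pkt, size_t pkt_len, size_t sig_off,
			    const uint8_t *key, size_t key_len,
			    uint32_t expected_seq)
{
	uint8_t received[SMB_SIG_SIZE], digest[16];
	uint32_t seq_le = htole32(expected_seq); /* little-endian on the wire */
	uint8_t *buf;
	int ok;

	if (sig_off + SMB_SIG_SIZE > pkt_len)
		return -1;

	memcpy(received, pkt + sig_off, SMB_SIG_SIZE);
	memset(pkt + sig_off, 0, SMB_SIG_SIZE);
	memcpy(pkt + sig_off, &seq_le, sizeof(seq_le));

	buf = malloc(key_len + pkt_len);
	if (!buf)
		return -1;
	memcpy(buf, key, key_len);
	memcpy(buf + key_len, pkt, pkt_len);
	md5(buf, key_len + pkt_len, digest);
	free(buf);

	ok = memcmp(received, digest, SMB_SIG_SIZE) == 0;
	return ok ? 0 : -1;
}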
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a1509207bfa..a264b744bb4 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -47,9 +47,10 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
47#ifdef CONFIG_CIFS_XATTR 47#ifdef CONFIG_CIFS_XATTR
48 int xid; 48 int xid;
49 struct cifs_sb_info *cifs_sb; 49 struct cifs_sb_info *cifs_sb;
50 struct tcon_link *tlink;
50 struct cifsTconInfo *pTcon; 51 struct cifsTconInfo *pTcon;
51 struct super_block *sb; 52 struct super_block *sb;
52 char *full_path; 53 char *full_path = NULL;
53 54
54 if (direntry == NULL) 55 if (direntry == NULL)
55 return -EIO; 56 return -EIO;
@@ -58,16 +59,19 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
58 sb = direntry->d_inode->i_sb; 59 sb = direntry->d_inode->i_sb;
59 if (sb == NULL) 60 if (sb == NULL)
60 return -EIO; 61 return -EIO;
61 xid = GetXid();
62 62
63 cifs_sb = CIFS_SB(sb); 63 cifs_sb = CIFS_SB(sb);
64 pTcon = cifs_sb->tcon; 64 tlink = cifs_sb_tlink(cifs_sb);
65 if (IS_ERR(tlink))
66 return PTR_ERR(tlink);
67 pTcon = tlink_tcon(tlink);
68
69 xid = GetXid();
65 70
66 full_path = build_path_from_dentry(direntry); 71 full_path = build_path_from_dentry(direntry);
67 if (full_path == NULL) { 72 if (full_path == NULL) {
68 rc = -ENOMEM; 73 rc = -ENOMEM;
69 FreeXid(xid); 74 goto remove_ea_exit;
70 return rc;
71 } 75 }
72 if (ea_name == NULL) { 76 if (ea_name == NULL) {
73 cFYI(1, "Null xattr names not supported"); 77 cFYI(1, "Null xattr names not supported");
@@ -91,6 +95,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
91remove_ea_exit: 95remove_ea_exit:
92 kfree(full_path); 96 kfree(full_path);
93 FreeXid(xid); 97 FreeXid(xid);
98 cifs_put_tlink(tlink);
94#endif 99#endif
95 return rc; 100 return rc;
96} 101}
@@ -102,6 +107,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
102#ifdef CONFIG_CIFS_XATTR 107#ifdef CONFIG_CIFS_XATTR
103 int xid; 108 int xid;
104 struct cifs_sb_info *cifs_sb; 109 struct cifs_sb_info *cifs_sb;
110 struct tcon_link *tlink;
105 struct cifsTconInfo *pTcon; 111 struct cifsTconInfo *pTcon;
106 struct super_block *sb; 112 struct super_block *sb;
107 char *full_path; 113 char *full_path;
@@ -113,16 +119,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
113 sb = direntry->d_inode->i_sb; 119 sb = direntry->d_inode->i_sb;
114 if (sb == NULL) 120 if (sb == NULL)
115 return -EIO; 121 return -EIO;
116 xid = GetXid();
117 122
118 cifs_sb = CIFS_SB(sb); 123 cifs_sb = CIFS_SB(sb);
119 pTcon = cifs_sb->tcon; 124 tlink = cifs_sb_tlink(cifs_sb);
125 if (IS_ERR(tlink))
126 return PTR_ERR(tlink);
127 pTcon = tlink_tcon(tlink);
128
129 xid = GetXid();
120 130
121 full_path = build_path_from_dentry(direntry); 131 full_path = build_path_from_dentry(direntry);
122 if (full_path == NULL) { 132 if (full_path == NULL) {
123 rc = -ENOMEM; 133 rc = -ENOMEM;
124 FreeXid(xid); 134 goto set_ea_exit;
125 return rc;
126 } 135 }
127 /* return dos attributes as pseudo xattr */ 136 /* return dos attributes as pseudo xattr */
128 /* return alt name if available as pseudo attr */ 137 /* return alt name if available as pseudo attr */
@@ -132,9 +141,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
132 returns as xattrs */ 141 returns as xattrs */
133 if (value_size > MAX_EA_VALUE_SIZE) { 142 if (value_size > MAX_EA_VALUE_SIZE) {
134 cFYI(1, "size of EA value too large"); 143 cFYI(1, "size of EA value too large");
135 kfree(full_path); 144 rc = -EOPNOTSUPP;
136 FreeXid(xid); 145 goto set_ea_exit;
137 return -EOPNOTSUPP;
138 } 146 }
139 147
140 if (ea_name == NULL) { 148 if (ea_name == NULL) {
@@ -198,6 +206,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
198set_ea_exit: 206set_ea_exit:
199 kfree(full_path); 207 kfree(full_path);
200 FreeXid(xid); 208 FreeXid(xid);
209 cifs_put_tlink(tlink);
201#endif 210#endif
202 return rc; 211 return rc;
203} 212}
@@ -209,6 +218,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
209#ifdef CONFIG_CIFS_XATTR 218#ifdef CONFIG_CIFS_XATTR
210 int xid; 219 int xid;
211 struct cifs_sb_info *cifs_sb; 220 struct cifs_sb_info *cifs_sb;
221 struct tcon_link *tlink;
212 struct cifsTconInfo *pTcon; 222 struct cifsTconInfo *pTcon;
213 struct super_block *sb; 223 struct super_block *sb;
214 char *full_path; 224 char *full_path;
@@ -221,16 +231,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
221 if (sb == NULL) 231 if (sb == NULL)
222 return -EIO; 232 return -EIO;
223 233
224 xid = GetXid();
225
226 cifs_sb = CIFS_SB(sb); 234 cifs_sb = CIFS_SB(sb);
227 pTcon = cifs_sb->tcon; 235 tlink = cifs_sb_tlink(cifs_sb);
236 if (IS_ERR(tlink))
237 return PTR_ERR(tlink);
238 pTcon = tlink_tcon(tlink);
239
240 xid = GetXid();
228 241
229 full_path = build_path_from_dentry(direntry); 242 full_path = build_path_from_dentry(direntry);
230 if (full_path == NULL) { 243 if (full_path == NULL) {
231 rc = -ENOMEM; 244 rc = -ENOMEM;
232 FreeXid(xid); 245 goto get_ea_exit;
233 return rc;
234 } 246 }
235 /* return dos attributes as pseudo xattr */ 247 /* return dos attributes as pseudo xattr */
236 /* return alt name if available as pseudo attr */ 248 /* return alt name if available as pseudo attr */
@@ -323,6 +335,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
323get_ea_exit: 335get_ea_exit:
324 kfree(full_path); 336 kfree(full_path);
325 FreeXid(xid); 337 FreeXid(xid);
338 cifs_put_tlink(tlink);
326#endif 339#endif
327 return rc; 340 return rc;
328} 341}
@@ -333,6 +346,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
333#ifdef CONFIG_CIFS_XATTR 346#ifdef CONFIG_CIFS_XATTR
334 int xid; 347 int xid;
335 struct cifs_sb_info *cifs_sb; 348 struct cifs_sb_info *cifs_sb;
349 struct tcon_link *tlink;
336 struct cifsTconInfo *pTcon; 350 struct cifsTconInfo *pTcon;
337 struct super_block *sb; 351 struct super_block *sb;
338 char *full_path; 352 char *full_path;
@@ -346,18 +360,20 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
346 return -EIO; 360 return -EIO;
347 361
348 cifs_sb = CIFS_SB(sb); 362 cifs_sb = CIFS_SB(sb);
349 pTcon = cifs_sb->tcon;
350
351 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 363 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
352 return -EOPNOTSUPP; 364 return -EOPNOTSUPP;
353 365
366 tlink = cifs_sb_tlink(cifs_sb);
367 if (IS_ERR(tlink))
368 return PTR_ERR(tlink);
369 pTcon = tlink_tcon(tlink);
370
354 xid = GetXid(); 371 xid = GetXid();
355 372
356 full_path = build_path_from_dentry(direntry); 373 full_path = build_path_from_dentry(direntry);
357 if (full_path == NULL) { 374 if (full_path == NULL) {
358 rc = -ENOMEM; 375 rc = -ENOMEM;
359 FreeXid(xid); 376 goto list_ea_exit;
360 return rc;
361 } 377 }
362 /* return dos attributes as pseudo xattr */ 378 /* return dos attributes as pseudo xattr */
363 /* return alt name if available as pseudo attr */ 379 /* return alt name if available as pseudo attr */
@@ -370,8 +386,10 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
370 cifs_sb->mnt_cifs_flags & 386 cifs_sb->mnt_cifs_flags &
371 CIFS_MOUNT_MAP_SPECIAL_CHR); 387 CIFS_MOUNT_MAP_SPECIAL_CHR);
372 388
389list_ea_exit:
373 kfree(full_path); 390 kfree(full_path);
374 FreeXid(xid); 391 FreeXid(xid);
392 cifs_put_tlink(tlink);
375#endif 393#endif
376 return rc; 394 return rc;
377} 395}
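Each xattr entry point above now follows the same acquire/use/release discipline for the tcon link, with a single exit label so every early failure still drops the reference. The pattern in isolation, as a self-contained sketch (plain C; the resource names are hypothetical stand-ins for cifs_sb_tlink()/cifs_put_tlink()):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct resource { int dummy; };

static struct resource *resource_get(void)
{
	return malloc(sizeof(struct resource));	/* like cifs_sb_tlink() */
}

static void resource_put(struct resource *r)
{
	free(r);				/* like cifs_put_tlink() */
}

static int do_work(struct resource *r, const char *path)
{
	(void)r; (void)path;
	return 0;
}

static int op(const char *name)
{
	struct resource *res;
	char *path = NULL;
	int rc;

	res = resource_get();
	if (!res)
		return -ENOMEM;

	path = strdup(name);	/* like build_path_from_dentry() */
	if (!path) {
		rc = -ENOMEM;
		goto out;	/* still drops the reference */
	}

	rc = do_work(res, path);
out:
	free(path);
	resource_put(res);	/* single release point for every path */
	return rc;
}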
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index a5bf5771a22..9060f08e70c 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -17,6 +17,7 @@
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h>
20 21
21#include <linux/coda.h> 22#include <linux/coda.h>
22#include <linux/coda_linux.h> 23#include <linux/coda_linux.h>
@@ -31,19 +32,23 @@ void coda_cache_enter(struct inode *inode, int mask)
31{ 32{
32 struct coda_inode_info *cii = ITOC(inode); 33 struct coda_inode_info *cii = ITOC(inode);
33 34
35 spin_lock(&cii->c_lock);
34 cii->c_cached_epoch = atomic_read(&permission_epoch); 36 cii->c_cached_epoch = atomic_read(&permission_epoch);
35 if (cii->c_uid != current_fsuid()) { 37 if (cii->c_uid != current_fsuid()) {
36 cii->c_uid = current_fsuid(); 38 cii->c_uid = current_fsuid();
37 cii->c_cached_perm = mask; 39 cii->c_cached_perm = mask;
38 } else 40 } else
39 cii->c_cached_perm |= mask; 41 cii->c_cached_perm |= mask;
42 spin_unlock(&cii->c_lock);
40} 43}
41 44
42/* remove cached acl from an inode */ 45/* remove cached acl from an inode */
43void coda_cache_clear_inode(struct inode *inode) 46void coda_cache_clear_inode(struct inode *inode)
44{ 47{
45 struct coda_inode_info *cii = ITOC(inode); 48 struct coda_inode_info *cii = ITOC(inode);
49 spin_lock(&cii->c_lock);
46 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1; 50 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
51 spin_unlock(&cii->c_lock);
47} 52}
48 53
49/* remove all acl caches */ 54/* remove all acl caches */
@@ -57,13 +62,15 @@ void coda_cache_clear_all(struct super_block *sb)
57int coda_cache_check(struct inode *inode, int mask) 62int coda_cache_check(struct inode *inode, int mask)
58{ 63{
59 struct coda_inode_info *cii = ITOC(inode); 64 struct coda_inode_info *cii = ITOC(inode);
60 int hit; 65 int hit;
61 66
62 hit = (mask & cii->c_cached_perm) == mask && 67 spin_lock(&cii->c_lock);
63 cii->c_uid == current_fsuid() && 68 hit = (mask & cii->c_cached_perm) == mask &&
64 cii->c_cached_epoch == atomic_read(&permission_epoch); 69 cii->c_uid == current_fsuid() &&
70 cii->c_cached_epoch == atomic_read(&permission_epoch);
71 spin_unlock(&cii->c_lock);
65 72
66 return hit; 73 return hit;
67} 74}
68 75
69 76
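With the BKL gone, cii->c_lock now serializes every reader and writer of the cached uid/permission/epoch triple. A self-contained sketch of the same check-then-cache shape, with a pthread mutex standing in for the kernel spinlock (names are illustrative; the mutex must be initialized, e.g. with PTHREAD_MUTEX_INITIALIZER):

#include <pthread.h>
#include <stdbool.h>

struct perm_cache {
	pthread_mutex_t lock;	/* plays the role of cii->c_lock */
	unsigned int uid;
	int cached_perm;
	int cached_epoch;
};

static bool cache_check(struct perm_cache *c, unsigned int uid,
			int mask, int epoch)
{
	bool hit;

	pthread_mutex_lock(&c->lock);
	hit = (mask & c->cached_perm) == mask &&
	      c->uid == uid &&
	      c->cached_epoch == epoch;
	pthread_mutex_unlock(&c->lock);
	return hit;
}

static void cache_enter(struct perm_cache *c, unsigned int uid,
			int mask, int epoch)
{
	pthread_mutex_lock(&c->lock);
	c->cached_epoch = epoch;
	if (c->uid != uid) {	/* different user: start a fresh mask */
		c->uid = uid;
		c->cached_perm = mask;
	} else			/* same user: widen the cached rights */
		c->cached_perm |= mask;
	pthread_mutex_unlock(&c->lock);
}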
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index a7a780929ee..602240569c8 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -45,13 +45,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
45static int coda_test_inode(struct inode *inode, void *data) 45static int coda_test_inode(struct inode *inode, void *data)
46{ 46{
47 struct CodaFid *fid = (struct CodaFid *)data; 47 struct CodaFid *fid = (struct CodaFid *)data;
48 return coda_fideq(&(ITOC(inode)->c_fid), fid); 48 struct coda_inode_info *cii = ITOC(inode);
49 return coda_fideq(&cii->c_fid, fid);
49} 50}
50 51
51static int coda_set_inode(struct inode *inode, void *data) 52static int coda_set_inode(struct inode *inode, void *data)
52{ 53{
53 struct CodaFid *fid = (struct CodaFid *)data; 54 struct CodaFid *fid = (struct CodaFid *)data;
54 ITOC(inode)->c_fid = *fid; 55 struct coda_inode_info *cii = ITOC(inode);
56 cii->c_fid = *fid;
55 return 0; 57 return 0;
56} 58}
57 59
@@ -71,6 +73,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
71 cii = ITOC(inode); 73 cii = ITOC(inode);
72 /* we still need to set i_ino for things like stat(2) */ 74 /* we still need to set i_ino for things like stat(2) */
73 inode->i_ino = hash; 75 inode->i_ino = hash;
76 /* inode is locked and unique, no need to grab cii->c_lock */
74 cii->c_mapcount = 0; 77 cii->c_mapcount = 0;
75 unlock_new_inode(inode); 78 unlock_new_inode(inode);
76 } 79 }
@@ -107,14 +110,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc
107} 110}
108 111
109 112
113/* Although we treat Coda file identifiers as immutable, there is one
114 * special case for files created during a disconnection where they may
115 * not be globally unique. When an identifier collision is detected we
116 * first try to flush the cached inode from the kernel and finally
117 * resort to renaming/rehashing in-place. Userspace remembers both old
118 * and new values of the identifier to handle any in-flight upcalls.
119 * The real solution is to use globally unique UUIDs as identifiers, but
120 * retrofitting the existing userspace code for this is non-trivial. */
110void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 121void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid,
111 struct CodaFid *newfid) 122 struct CodaFid *newfid)
112{ 123{
113 struct coda_inode_info *cii; 124 struct coda_inode_info *cii = ITOC(inode);
114 unsigned long hash = coda_f2i(newfid); 125 unsigned long hash = coda_f2i(newfid);
115 126
116 cii = ITOC(inode);
117
118 BUG_ON(!coda_fideq(&cii->c_fid, oldfid)); 127 BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
119 128
120 /* replace fid and rehash inode */ 129 /* replace fid and rehash inode */
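The collision handling described in the comment above boils down to rekeying a cached object in place once the old identifier is confirmed to match. A small illustrative sketch (plain C; fid_hash() stands in for coda_f2i(), and the structures are hypothetical):

#include <stdint.h>
#include <string.h>

struct fid { uint32_t opaque[4]; };

struct cached_obj {
	struct fid fid;
	unsigned long hash;	/* which hash bucket the object lives in */
};

static unsigned long fid_hash(const struct fid *f)
{
	/* any stable mix of the identifier works for the sketch */
	return f->opaque[0] ^ f->opaque[1] ^ f->opaque[2] ^ f->opaque[3];
}

/* Rekey in place, as coda_replace_fid() does for the kernel inode. */
static void replace_fid(struct cached_obj *obj,
			const struct fid *oldfid, const struct fid *newfid)
{
	if (memcmp(&obj->fid, oldfid, sizeof(*oldfid)) != 0)
		return;		/* object already rekeyed or never matched */
	obj->fid = *newfid;
	obj->hash = fid_hash(newfid);	/* move it to the new bucket */
}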
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ccd98b0f2b0..5d8b3553960 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -17,7 +17,7 @@
17#include <linux/stat.h> 17#include <linux/stat.h>
18#include <linux/errno.h> 18#include <linux/errno.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/smp_lock.h> 20#include <linux/spinlock.h>
21 21
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23 23
@@ -116,15 +116,11 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
116 goto exit; 116 goto exit;
117 } 117 }
118 118
119 lock_kernel();
120
121 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, 119 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
122 &type, &resfid); 120 &type, &resfid);
123 if (!error) 121 if (!error)
124 error = coda_cnode_make(&inode, &resfid, dir->i_sb); 122 error = coda_cnode_make(&inode, &resfid, dir->i_sb);
125 123
126 unlock_kernel();
127
128 if (error && error != -ENOENT) 124 if (error && error != -ENOENT)
129 return ERR_PTR(error); 125 return ERR_PTR(error);
130 126
@@ -140,28 +136,24 @@ exit:
140 136
141int coda_permission(struct inode *inode, int mask) 137int coda_permission(struct inode *inode, int mask)
142{ 138{
143 int error = 0; 139 int error;
144 140
145 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 141 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
146 142
147 if (!mask) 143 if (!mask)
148 return 0; 144 return 0;
149 145
150 if ((mask & MAY_EXEC) && !execute_ok(inode)) 146 if ((mask & MAY_EXEC) && !execute_ok(inode))
151 return -EACCES; 147 return -EACCES;
152 148
153 lock_kernel();
154
155 if (coda_cache_check(inode, mask)) 149 if (coda_cache_check(inode, mask))
156 goto out; 150 return 0;
157 151
158 error = venus_access(inode->i_sb, coda_i2f(inode), mask); 152 error = venus_access(inode->i_sb, coda_i2f(inode), mask);
159 153
160 if (!error) 154 if (!error)
161 coda_cache_enter(inode, mask); 155 coda_cache_enter(inode, mask);
162 156
163 out:
164 unlock_kernel();
165 return error; 157 return error;
166} 158}
167 159
@@ -200,41 +192,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
200/* creation routines: create, mknod, mkdir, link, symlink */ 192/* creation routines: create, mknod, mkdir, link, symlink */
201static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) 193static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
202{ 194{
203 int error=0; 195 int error;
204 const char *name=de->d_name.name; 196 const char *name=de->d_name.name;
205 int length=de->d_name.len; 197 int length=de->d_name.len;
206 struct inode *inode; 198 struct inode *inode;
207 struct CodaFid newfid; 199 struct CodaFid newfid;
208 struct coda_vattr attrs; 200 struct coda_vattr attrs;
209 201
210 lock_kernel(); 202 if (coda_isroot(dir) && coda_iscontrol(name, length))
211
212 if (coda_isroot(dir) && coda_iscontrol(name, length)) {
213 unlock_kernel();
214 return -EPERM; 203 return -EPERM;
215 }
216 204
217 error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 205 error = venus_create(dir->i_sb, coda_i2f(dir), name, length,
218 0, mode, &newfid, &attrs); 206 0, mode, &newfid, &attrs);
219 207 if (error)
220 if ( error ) { 208 goto err_out;
221 unlock_kernel();
222 d_drop(de);
223 return error;
224 }
225 209
226 inode = coda_iget(dir->i_sb, &newfid, &attrs); 210 inode = coda_iget(dir->i_sb, &newfid, &attrs);
227 if ( IS_ERR(inode) ) { 211 if (IS_ERR(inode)) {
228 unlock_kernel(); 212 error = PTR_ERR(inode);
229 d_drop(de); 213 goto err_out;
230 return PTR_ERR(inode);
231 } 214 }
232 215
233 /* invalidate the directory cnode's attributes */ 216 /* invalidate the directory cnode's attributes */
234 coda_dir_update_mtime(dir); 217 coda_dir_update_mtime(dir);
235 unlock_kernel();
236 d_instantiate(de, inode); 218 d_instantiate(de, inode);
237 return 0; 219 return 0;
220err_out:
221 d_drop(de);
222 return error;
238} 223}
239 224
240static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) 225static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
@@ -246,36 +231,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
246 int error; 231 int error;
247 struct CodaFid newfid; 232 struct CodaFid newfid;
248 233
249 lock_kernel(); 234 if (coda_isroot(dir) && coda_iscontrol(name, len))
250
251 if (coda_isroot(dir) && coda_iscontrol(name, len)) {
252 unlock_kernel();
253 return -EPERM; 235 return -EPERM;
254 }
255 236
256 attrs.va_mode = mode; 237 attrs.va_mode = mode;
257 error = venus_mkdir(dir->i_sb, coda_i2f(dir), 238 error = venus_mkdir(dir->i_sb, coda_i2f(dir),
258 name, len, &newfid, &attrs); 239 name, len, &newfid, &attrs);
259 240 if (error)
260 if ( error ) { 241 goto err_out;
261 unlock_kernel();
262 d_drop(de);
263 return error;
264 }
265 242
266 inode = coda_iget(dir->i_sb, &newfid, &attrs); 243 inode = coda_iget(dir->i_sb, &newfid, &attrs);
267 if ( IS_ERR(inode) ) { 244 if (IS_ERR(inode)) {
268 unlock_kernel(); 245 error = PTR_ERR(inode);
269 d_drop(de); 246 goto err_out;
270 return PTR_ERR(inode);
271 } 247 }
272 248
273 /* invalidate the directory cnode's attributes */ 249 /* invalidate the directory cnode's attributes */
274 coda_dir_inc_nlink(dir); 250 coda_dir_inc_nlink(dir);
275 coda_dir_update_mtime(dir); 251 coda_dir_update_mtime(dir);
276 unlock_kernel();
277 d_instantiate(de, inode); 252 d_instantiate(de, inode);
278 return 0; 253 return 0;
254err_out:
255 d_drop(de);
256 return error;
279} 257}
280 258
281/* try to make de an entry in dir_inode linked to source_de */ 259
@@ -287,52 +265,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
287 int len = de->d_name.len; 265 int len = de->d_name.len;
288 int error; 266 int error;
289 267
290 lock_kernel(); 268 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
291
292 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
293 unlock_kernel();
294 return -EPERM; 269 return -EPERM;
295 }
296 270
297 error = venus_link(dir_inode->i_sb, coda_i2f(inode), 271 error = venus_link(dir_inode->i_sb, coda_i2f(inode),
298 coda_i2f(dir_inode), (const char *)name, len); 272 coda_i2f(dir_inode), (const char *)name, len);
299
300 if (error) { 273 if (error) {
301 d_drop(de); 274 d_drop(de);
302 goto out; 275 return error;
303 } 276 }
304 277
305 coda_dir_update_mtime(dir_inode); 278 coda_dir_update_mtime(dir_inode);
306 atomic_inc(&inode->i_count); 279 ihold(inode);
307 d_instantiate(de, inode); 280 d_instantiate(de, inode);
308 inc_nlink(inode); 281 inc_nlink(inode);
309 282 return 0;
310out:
311 unlock_kernel();
312 return(error);
313} 283}
314 284
315 285
316static int coda_symlink(struct inode *dir_inode, struct dentry *de, 286static int coda_symlink(struct inode *dir_inode, struct dentry *de,
317 const char *symname) 287 const char *symname)
318{ 288{
319 const char *name = de->d_name.name; 289 const char *name = de->d_name.name;
320 int len = de->d_name.len; 290 int len = de->d_name.len;
321 int symlen; 291 int symlen;
322 int error = 0; 292 int error;
323
324 lock_kernel();
325 293
326 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) { 294 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
327 unlock_kernel();
328 return -EPERM; 295 return -EPERM;
329 }
330 296
331 symlen = strlen(symname); 297 symlen = strlen(symname);
332 if ( symlen > CODA_MAXPATHLEN ) { 298 if (symlen > CODA_MAXPATHLEN)
333 unlock_kernel();
334 return -ENAMETOOLONG; 299 return -ENAMETOOLONG;
335 }
336 300
337 /* 301 /*
338 * This entry is now negative. Since we do not create 302 * This entry is now negative. Since we do not create
@@ -343,10 +307,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
343 symname, symlen); 307 symname, symlen);
344 308
345 /* mtime is no good anymore */ 309 /* mtime is no good anymore */
346 if ( !error ) 310 if (!error)
347 coda_dir_update_mtime(dir_inode); 311 coda_dir_update_mtime(dir_inode);
348 312
349 unlock_kernel();
350 return error; 313 return error;
351} 314}
352 315
@@ -357,17 +320,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
357 const char *name = de->d_name.name; 320 const char *name = de->d_name.name;
358 int len = de->d_name.len; 321 int len = de->d_name.len;
359 322
360 lock_kernel();
361
362 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len); 323 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
363 if ( error ) { 324 if (error)
364 unlock_kernel();
365 return error; 325 return error;
366 }
367 326
368 coda_dir_update_mtime(dir); 327 coda_dir_update_mtime(dir);
369 drop_nlink(de->d_inode); 328 drop_nlink(de->d_inode);
370 unlock_kernel();
371 return 0; 329 return 0;
372} 330}
373 331
@@ -377,8 +335,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
377 int len = de->d_name.len; 335 int len = de->d_name.len;
378 int error; 336 int error;
379 337
380 lock_kernel();
381
382 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); 338 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
383 if (!error) { 339 if (!error) {
384 /* VFS may delete the child */ 340 /* VFS may delete the child */
@@ -389,7 +345,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
389 coda_dir_drop_nlink(dir); 345 coda_dir_drop_nlink(dir);
390 coda_dir_update_mtime(dir); 346 coda_dir_update_mtime(dir);
391 } 347 }
392 unlock_kernel();
393 return error; 348 return error;
394} 349}
395 350
@@ -403,15 +358,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
403 int new_length = new_dentry->d_name.len; 358 int new_length = new_dentry->d_name.len;
404 int error; 359 int error;
405 360
406 lock_kernel();
407
408 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), 361 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
409 coda_i2f(new_dir), old_length, new_length, 362 coda_i2f(new_dir), old_length, new_length,
410 (const char *) old_name, (const char *)new_name); 363 (const char *) old_name, (const char *)new_name);
411 364 if (!error) {
412 if ( !error ) { 365 if (new_dentry->d_inode) {
413 if ( new_dentry->d_inode ) { 366 if (S_ISDIR(new_dentry->d_inode->i_mode)) {
414 if ( S_ISDIR(new_dentry->d_inode->i_mode) ) {
415 coda_dir_drop_nlink(old_dir); 367 coda_dir_drop_nlink(old_dir);
416 coda_dir_inc_nlink(new_dir); 368 coda_dir_inc_nlink(new_dir);
417 } 369 }
@@ -423,8 +375,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
423 coda_flag_inode(new_dir, C_VATTR); 375 coda_flag_inode(new_dir, C_VATTR);
424 } 376 }
425 } 377 }
426 unlock_kernel();
427
428 return error; 378 return error;
429} 379}
430 380
@@ -594,10 +544,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
594 struct inode *inode = de->d_inode; 544 struct inode *inode = de->d_inode;
595 struct coda_inode_info *cii; 545 struct coda_inode_info *cii;
596 546
597 if (!inode) 547 if (!inode || coda_isroot(inode))
598 return 1;
599 lock_kernel();
600 if (coda_isroot(inode))
601 goto out; 548 goto out;
602 if (is_bad_inode(inode)) 549 if (is_bad_inode(inode))
603 goto bad; 550 goto bad;
@@ -617,13 +564,12 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
617 goto out; 564 goto out;
618 565
619 /* clear the flags. */ 566 /* clear the flags. */
567 spin_lock(&cii->c_lock);
620 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 568 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
621 569 spin_unlock(&cii->c_lock);
622bad: 570bad:
623 unlock_kernel();
624 return 0; 571 return 0;
625out: 572out:
626 unlock_kernel();
627 return 1; 573 return 1;
628} 574}
629 575
@@ -656,20 +602,19 @@ static int coda_dentry_delete(struct dentry * dentry)
656int coda_revalidate_inode(struct dentry *dentry) 602int coda_revalidate_inode(struct dentry *dentry)
657{ 603{
658 struct coda_vattr attr; 604 struct coda_vattr attr;
659 int error = 0; 605 int error;
660 int old_mode; 606 int old_mode;
661 ino_t old_ino; 607 ino_t old_ino;
662 struct inode *inode = dentry->d_inode; 608 struct inode *inode = dentry->d_inode;
663 struct coda_inode_info *cii = ITOC(inode); 609 struct coda_inode_info *cii = ITOC(inode);
664 610
665 lock_kernel(); 611 if (!cii->c_flags)
666 if ( !cii->c_flags ) 612 return 0;
667 goto ok;
668 613
669 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) { 614 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
670 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr); 615 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
671 if ( error ) 616 if (error)
672 goto return_bad; 617 return -EIO;
673 618
674 /* this inode may be lost if: 619 /* this inode may be lost if:
675 - it's ino changed 620 - it's ino changed
@@ -688,17 +633,13 @@ int coda_revalidate_inode(struct dentry *dentry)
688 /* the following can happen when a local fid is replaced 633 /* the following can happen when a local fid is replaced
689 with a global one, here we lose and declare the inode bad */ 634 with a global one, here we lose and declare the inode bad */
690 if (inode->i_ino != old_ino) 635 if (inode->i_ino != old_ino)
691 goto return_bad; 636 return -EIO;
692 637
693 coda_flag_inode_children(inode, C_FLUSH); 638 coda_flag_inode_children(inode, C_FLUSH);
639
640 spin_lock(&cii->c_lock);
694 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 641 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
642 spin_unlock(&cii->c_lock);
695 } 643 }
696
697ok:
698 unlock_kernel();
699 return 0; 644 return 0;
700
701return_bad:
702 unlock_kernel();
703 return -EIO;
704} 645}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ad3cd2abeeb..c8b50ba4366 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -15,7 +15,7 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/cred.h> 16#include <linux/cred.h>
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/spinlock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
@@ -109,19 +109,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
109 109
110 coda_inode = coda_file->f_path.dentry->d_inode; 110 coda_inode = coda_file->f_path.dentry->d_inode;
111 host_inode = host_file->f_path.dentry->d_inode; 111 host_inode = host_file->f_path.dentry->d_inode;
112
113 cii = ITOC(coda_inode);
114 spin_lock(&cii->c_lock);
112 coda_file->f_mapping = host_file->f_mapping; 115 coda_file->f_mapping = host_file->f_mapping;
113 if (coda_inode->i_mapping == &coda_inode->i_data) 116 if (coda_inode->i_mapping == &coda_inode->i_data)
114 coda_inode->i_mapping = host_inode->i_mapping; 117 coda_inode->i_mapping = host_inode->i_mapping;
115 118
116 /* only allow additional mmaps as long as userspace isn't changing 119 /* only allow additional mmaps as long as userspace isn't changing
117 * the container file on us! */ 120 * the container file on us! */
118 else if (coda_inode->i_mapping != host_inode->i_mapping) 121 else if (coda_inode->i_mapping != host_inode->i_mapping) {
122 spin_unlock(&cii->c_lock);
119 return -EBUSY; 123 return -EBUSY;
124 }
120 125
121 /* keep track of how often the coda_inode/host_file has been mmapped */ 126 /* keep track of how often the coda_inode/host_file has been mmapped */
122 cii = ITOC(coda_inode);
123 cii->c_mapcount++; 127 cii->c_mapcount++;
124 cfi->cfi_mapcount++; 128 cfi->cfi_mapcount++;
129 spin_unlock(&cii->c_lock);
125 130
126 return host_file->f_op->mmap(host_file, vma); 131 return host_file->f_op->mmap(host_file, vma);
127} 132}
@@ -138,8 +143,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
138 if (!cfi) 143 if (!cfi)
139 return -ENOMEM; 144 return -ENOMEM;
140 145
141 lock_kernel();
142
143 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, 146 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
144 &host_file); 147 &host_file);
145 if (!host_file) 148 if (!host_file)
@@ -147,7 +150,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
147 150
148 if (error) { 151 if (error) {
149 kfree(cfi); 152 kfree(cfi);
150 unlock_kernel();
151 return error; 153 return error;
152 } 154 }
153 155
@@ -159,8 +161,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
159 161
160 BUG_ON(coda_file->private_data != NULL); 162 BUG_ON(coda_file->private_data != NULL);
161 coda_file->private_data = cfi; 163 coda_file->private_data = cfi;
162
163 unlock_kernel();
164 return 0; 164 return 0;
165} 165}
166 166
@@ -171,9 +171,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
171 struct coda_file_info *cfi; 171 struct coda_file_info *cfi;
172 struct coda_inode_info *cii; 172 struct coda_inode_info *cii;
173 struct inode *host_inode; 173 struct inode *host_inode;
174 int err = 0; 174 int err;
175
176 lock_kernel();
177 175
178 cfi = CODA_FTOC(coda_file); 176 cfi = CODA_FTOC(coda_file);
179 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 177 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -185,18 +183,18 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
185 cii = ITOC(coda_inode); 183 cii = ITOC(coda_inode);
186 184
187 /* did we mmap this file? */ 185 /* did we mmap this file? */
186 spin_lock(&cii->c_lock);
188 if (coda_inode->i_mapping == &host_inode->i_data) { 187 if (coda_inode->i_mapping == &host_inode->i_data) {
189 cii->c_mapcount -= cfi->cfi_mapcount; 188 cii->c_mapcount -= cfi->cfi_mapcount;
190 if (!cii->c_mapcount) 189 if (!cii->c_mapcount)
191 coda_inode->i_mapping = &coda_inode->i_data; 190 coda_inode->i_mapping = &coda_inode->i_data;
192 } 191 }
192 spin_unlock(&cii->c_lock);
193 193
194 fput(cfi->cfi_container); 194 fput(cfi->cfi_container);
195 kfree(coda_file->private_data); 195 kfree(coda_file->private_data);
196 coda_file->private_data = NULL; 196 coda_file->private_data = NULL;
197 197
198 unlock_kernel();
199
200 /* VFS fput ignores the return value from file_operations->release, so 198 /* VFS fput ignores the return value from file_operations->release, so
201 * there is no use returning an error here */ 199 * there is no use returning an error here */
202 return 0; 200 return 0;
@@ -207,7 +205,7 @@ int coda_fsync(struct file *coda_file, int datasync)
207 struct file *host_file; 205 struct file *host_file;
208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode; 206 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
209 struct coda_file_info *cfi; 207 struct coda_file_info *cfi;
210 int err = 0; 208 int err;
211 209
212 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) || 210 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
213 S_ISLNK(coda_inode->i_mode))) 211 S_ISLNK(coda_inode->i_mode)))
@@ -218,11 +216,8 @@ int coda_fsync(struct file *coda_file, int datasync)
218 host_file = cfi->cfi_container; 216 host_file = cfi->cfi_container;
219 217
220 err = vfs_fsync(host_file, datasync); 218 err = vfs_fsync(host_file, datasync);
221 if ( !err && !datasync ) { 219 if (!err && !datasync)
222 lock_kernel();
223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 220 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
224 unlock_kernel();
225 }
226 221
227 return err; 222 return err;
228} 223}
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d97f9935a02..5ea57c8c7f9 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -15,7 +15,8 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/unistd.h> 17#include <linux/unistd.h>
18#include <linux/smp_lock.h> 18#include <linux/mutex.h>
19#include <linux/spinlock.h>
19#include <linux/file.h> 20#include <linux/file.h>
20#include <linux/vfs.h> 21#include <linux/vfs.h>
21#include <linux/slab.h> 22#include <linux/slab.h>
@@ -35,7 +36,7 @@
35#include "coda_int.h" 36#include "coda_int.h"
36 37
37/* VFS super_block ops */ 38/* VFS super_block ops */
38static void coda_clear_inode(struct inode *); 39static void coda_evict_inode(struct inode *);
39static void coda_put_super(struct super_block *); 40static void coda_put_super(struct super_block *);
40static int coda_statfs(struct dentry *dentry, struct kstatfs *buf); 41static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
41 42
@@ -51,6 +52,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
51 ei->c_flags = 0; 52 ei->c_flags = 0;
52 ei->c_uid = 0; 53 ei->c_uid = 0;
53 ei->c_cached_perm = 0; 54 ei->c_cached_perm = 0;
55 spin_lock_init(&ei->c_lock);
 	return &ei->vfs_inode;
 }
 
@@ -93,7 +95,7 @@ static const struct super_operations coda_super_operations =
 {
 	.alloc_inode	= coda_alloc_inode,
 	.destroy_inode	= coda_destroy_inode,
-	.clear_inode	= coda_clear_inode,
+	.evict_inode	= coda_evict_inode,
 	.put_super	= coda_put_super,
 	.statfs		= coda_statfs,
 	.remount_fs	= coda_remount,
@@ -143,7 +145,7 @@ static int get_device_index(struct coda_mount_data *data)
 static int coda_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *root = NULL;
-	struct venus_comm *vc = NULL;
+	struct venus_comm *vc;
 	struct CodaFid fid;
 	int error;
 	int idx;
@@ -157,21 +159,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
 
 	vc = &coda_comms[idx];
+	mutex_lock(&vc->vc_mutex);
+
 	if (!vc->vc_inuse) {
 		printk("coda_read_super: No pseudo device\n");
-		return -EINVAL;
+		error = -EINVAL;
+		goto unlock_out;
 	}
 
-	if ( vc->vc_sb ) {
+	if (vc->vc_sb) {
 		printk("coda_read_super: Device already mounted\n");
-		return -EBUSY;
+		error = -EBUSY;
+		goto unlock_out;
 	}
 
 	error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
 	if (error)
-		goto bdi_err;
+		goto unlock_out;
 
 	vc->vc_sb = sb;
+	mutex_unlock(&vc->vc_mutex);
 
 	sb->s_fs_info = vc;
 	sb->s_flags |= MS_NOATIME;
@@ -200,32 +207,41 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk("coda_read_super: rootinode is %ld dev %s\n",
 	       root->i_ino, root->i_sb->s_id);
 	sb->s_root = d_alloc_root(root);
-	if (!sb->s_root)
+	if (!sb->s_root) {
+		error = -EINVAL;
 		goto error;
-	return 0;
+	}
+	return 0;
 
- error:
-	bdi_destroy(&vc->bdi);
- bdi_err:
+error:
 	if (root)
 		iput(root);
-	if (vc)
-		vc->vc_sb = NULL;
 
-	return -EINVAL;
+	mutex_lock(&vc->vc_mutex);
+	bdi_destroy(&vc->bdi);
+	vc->vc_sb = NULL;
+	sb->s_fs_info = NULL;
+unlock_out:
+	mutex_unlock(&vc->vc_mutex);
+	return error;
 }
 
 static void coda_put_super(struct super_block *sb)
 {
-	bdi_destroy(&coda_vcp(sb)->bdi);
-	coda_vcp(sb)->vc_sb = NULL;
+	struct venus_comm *vcp = coda_vcp(sb);
+	mutex_lock(&vcp->vc_mutex);
+	bdi_destroy(&vcp->bdi);
+	vcp->vc_sb = NULL;
 	sb->s_fs_info = NULL;
+	mutex_unlock(&vcp->vc_mutex);
 
 	printk("Coda: Bye bye.\n");
 }
 
-static void coda_clear_inode(struct inode *inode)
+static void coda_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	coda_cache_clear_inode(inode);
 }
 
@@ -243,8 +259,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
 	struct coda_vattr vattr;
 	int error;
 
-	lock_kernel();
-
 	memset(&vattr, 0, sizeof(vattr));
 
 	inode->i_ctime = CURRENT_TIME_SEC;
@@ -254,13 +268,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
 	/* Venus is responsible for truncating the container-file!!! */
 	error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
 
-	if ( !error ) {
+	if (!error) {
 		coda_vattr_to_iattr(inode, &vattr);
 		coda_cache_clear_inode(inode);
 	}
-
-	unlock_kernel();
-
 	return error;
 }
 
@@ -274,12 +285,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int error;
 
-	lock_kernel();
-
 	error = venus_statfs(dentry, buf);
 
-	unlock_kernel();
-
 	if (error) {
 		/* fake something like AFS does */
 		buf->f_blocks = 9000000;
@@ -299,16 +306,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 /* init_coda: used by filesystems.c to register coda */
 
-static int coda_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *coda_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, coda_fill_super);
 }
 
 struct file_system_type coda_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "coda",
-	.get_sb		= coda_get_sb,
+	.mount		= coda_mount,
	.kill_sb	= kill_anon_super,
	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
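The rewritten coda_fill_super and coda_put_super share one locking discipline: take vc->vc_mutex before inspecting vc_inuse/vc_sb, and funnel every failure through a single unlock_out label so the mutex is released exactly once on all paths. A minimal, runnable userspace sketch of that discipline follows; the struct and function names are hypothetical stand-ins, not Coda code.

/* Claim a channel: lock, validate state, publish the owner, and exit
 * through one unlock label. Mirrors the fill_super error flow above. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct channel {
	pthread_mutex_t lock;
	int inuse;	/* plays the role of vc_inuse */
	void *owner;	/* plays the role of vc_sb */
};

static int channel_claim(struct channel *c, void *owner)
{
	int error = 0;

	pthread_mutex_lock(&c->lock);
	if (!c->inuse) {		/* "no pseudo device" */
		error = -EINVAL;
		goto unlock_out;
	}
	if (c->owner) {			/* "device already mounted" */
		error = -EBUSY;
		goto unlock_out;
	}
	c->owner = owner;
unlock_out:
	pthread_mutex_unlock(&c->lock);
	return error;
}

int main(void)
{
	struct channel c = { PTHREAD_MUTEX_INITIALIZER, 1, NULL };

	printf("first claim:  %d\n", channel_claim(&c, &c));	/* 0 */
	printf("second claim: %d\n", channel_claim(&c, &c));	/* -EBUSY */
	return 0;
}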
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ca25d96d45c..2fd89b5c5c7 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -23,8 +23,6 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
 
-#include <linux/smp_lock.h>
-
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
@@ -39,6 +37,7 @@ const struct inode_operations coda_ioctl_inode_operations = {
 const struct file_operations coda_ioctl_operations = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= coda_pioctl,
+	.llseek		= noop_llseek,
 };
 
 /* the coda pioctl inode ops */
@@ -57,13 +56,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 	struct inode *target_inode = NULL;
 	struct coda_inode_info *cnp;
 
-	lock_kernel();
-
 	/* get the Pioctl data arguments from user space */
-	if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
-		error = -EINVAL;
-		goto out;
-	}
+	if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
+		return -EINVAL;
 
 	/*
 	 * Look up the pathname. Note that the pathname is in
@@ -75,13 +70,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 	error = user_lpath(data.path, &path);
 
 	if (error)
-		goto out;
-	else
-		target_inode = path.dentry->d_inode;
+		return error;
+
+	target_inode = path.dentry->d_inode;
 
 	/* return if it is not a Coda inode */
 	if (target_inode->i_sb != inode->i_sb) {
-		path_put(&path);
 		error = -EINVAL;
 		goto out;
 	}
@@ -90,10 +84,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 	cnp = ITOC(target_inode);
 
 	error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
-
-	path_put(&path);
-
 out:
-	unlock_kernel();
+	path_put(&path);
 	return error;
 }
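Alongside the BKL removal, the converted file_operations here (and coda_psdev_fops below) gain an explicit .llseek. With the old implied default seek behaviour going away in this series, drivers that have nothing useful to seek spell it out; noop_llseek lets llseek() succeed without moving the file position. A kernel-style sketch with placeholder names, not a standalone program:

/* Hypothetical driver fops: example_fops/example_ioctl are placeholders. */
static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= example_ioctl,	/* assumed handler */
	.llseek		= noop_llseek,		/* seek "succeeds", f_pos unchanged */
};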
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 66b9cf79c5b..62647a8595e 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -35,7 +35,7 @@
 #include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/device.h>
 #include <asm/io.h>
 #include <asm/system.h>
@@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
 	unsigned int mask = POLLOUT | POLLWRNORM;
 
 	poll_wait(file, &vcp->vc_waitq, wait);
+	mutex_lock(&vcp->vc_mutex);
 	if (!list_empty(&vcp->vc_pending))
 		mask |= POLLIN | POLLRDNORM;
+	mutex_unlock(&vcp->vc_mutex);
 
 	return mask;
 }
@@ -108,16 +110,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		return -EFAULT;
 
 	if (DOWNCALL(hdr.opcode)) {
-		struct super_block *sb = NULL;
-		union outputArgs *dcbuf;
+		union outputArgs *dcbuf;
 		int size = sizeof(*dcbuf);
 
-		sb = vcp->vc_sb;
-		if ( !sb ) {
-			count = nbytes;
-			goto out;
-		}
-
 		if ( nbytes < sizeof(struct coda_out_hdr) ) {
 			printk("coda_downcall opc %d uniq %d, not enough!\n",
 			       hdr.opcode, hdr.unique);
@@ -137,9 +132,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		}
 
 		/* what downcall errors does Venus handle ? */
-		lock_kernel();
-		error = coda_downcall(hdr.opcode, dcbuf, sb);
-		unlock_kernel();
+		error = coda_downcall(vcp, hdr.opcode, dcbuf);
 
 		CODA_FREE(dcbuf, nbytes);
 		if (error) {
@@ -152,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 	}
 
 	/* Look for the message on the processing queue. */
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 	list_for_each(lh, &vcp->vc_processing) {
 		tmp = list_entry(lh, struct upc_req , uc_chain);
 		if (tmp->uc_unique == hdr.unique) {
@@ -161,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 			break;
 		}
 	}
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 
 	if (!req) {
 		printk("psdev_write: msg (%d, %d) not found\n",
@@ -177,15 +170,15 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		nbytes = req->uc_outSize; /* don't have more space! */
 	}
 	if (copy_from_user(req->uc_data, buf, nbytes)) {
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 		retval = -EFAULT;
 		goto out;
 	}
 
 	/* adjust outsize. is this useful ?? */
 	req->uc_outSize = nbytes;
-	req->uc_flags |= REQ_WRITE;
+	req->uc_flags |= CODA_REQ_WRITE;
 	count = nbytes;
 
 	/* Convert filedescriptor into a file handle */
@@ -216,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 	if (nbytes == 0)
 		return 0;
 
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 
 	add_wait_queue(&vcp->vc_waitq, &wait);
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -230,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 			retval = -ERESTARTSYS;
 			break;
 		}
+		mutex_unlock(&vcp->vc_mutex);
 		schedule();
+		mutex_lock(&vcp->vc_mutex);
 	}
 
 	set_current_state(TASK_RUNNING);
@@ -254,8 +249,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 		retval = -EFAULT;
 
 	/* If request was not a signal, enqueue and don't free */
-	if (!(req->uc_flags & REQ_ASYNC)) {
-		req->uc_flags |= REQ_READ;
+	if (!(req->uc_flags & CODA_REQ_ASYNC)) {
+		req->uc_flags |= CODA_REQ_READ;
 		list_add_tail(&(req->uc_chain), &vcp->vc_processing);
 		goto out;
 	}
@@ -263,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 	CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
 	kfree(req);
 out:
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return (count ? count : retval);
 }
 
@@ -276,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
 	if (idx < 0 || idx >= MAX_CODADEVS)
 		return -ENODEV;
 
-	lock_kernel();
-
 	err = -EBUSY;
 	vcp = &coda_comms[idx];
+	mutex_lock(&vcp->vc_mutex);
+
 	if (!vcp->vc_inuse) {
 		vcp->vc_inuse++;
 
@@ -293,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
 		err = 0;
 	}
 
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return err;
 }
 
@@ -308,32 +303,32 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
 		return -1;
 	}
 
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 
 	/* Wakeup clients so they can return. */
 	list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
 		list_del(&req->uc_chain);
 
 		/* Async requests need to be freed here */
-		if (req->uc_flags & REQ_ASYNC) {
+		if (req->uc_flags & CODA_REQ_ASYNC) {
 			CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
 			kfree(req);
 			continue;
 		}
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
 	list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) {
 		list_del(&req->uc_chain);
 
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
 	file->private_data = NULL;
 	vcp->vc_inuse--;
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return 0;
 }
 
@@ -346,6 +341,7 @@ static const struct file_operations coda_psdev_fops = {
	.unlocked_ioctl	= coda_psdev_ioctl,
	.open		= coda_psdev_open,
	.release	= coda_psdev_release,
+	.llseek		= noop_llseek,
 };
 
 static int init_coda_psdev(void)
@@ -361,9 +357,11 @@ static int init_coda_psdev(void)
 		err = PTR_ERR(coda_psdev_class);
 		goto out_chrdev;
 	}
-	for (i = 0; i < MAX_CODADEVS; i++)
+	for (i = 0; i < MAX_CODADEVS; i++) {
+		mutex_init(&(&coda_comms[i])->vc_mutex);
 		device_create(coda_psdev_class, NULL,
 			      MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
+	}
 	coda_sysctl_init();
 	goto out;
 
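Note how coda_psdev_read drops vc_mutex around schedule() and retakes it on wakeup: unlike the old BKL (which was released automatically on sleep), a mutex held across schedule() would block every other task touching the device. This hand-written unlock/sleep/lock sequence is exactly what a condition-variable wait does implicitly in userspace. A runnable sketch of the equivalent pattern, with hypothetical names:

/* pthread_cond_wait releases the mutex while sleeping and reacquires
 * it before returning -- the same shape as the loop in coda_psdev_read. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t nonempty = PTHREAD_COND_INITIALIZER;
static int pending;	/* plays the role of vcp->vc_pending */

static void *producer(void *arg)
{
	pthread_mutex_lock(&lock);
	pending = 1;
	pthread_cond_signal(&nonempty);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);
	pthread_mutex_lock(&lock);
	while (!pending)			/* recheck after every wakeup */
		pthread_cond_wait(&nonempty, &lock);	/* sleeps unlocked */
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	puts("got a request");
	return 0;
}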
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 4513b725845..af78f007a2b 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -14,7 +14,6 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
@@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page)
 	unsigned int len = PAGE_SIZE;
 	char *p = kmap(page);
 
-	lock_kernel();
 	cii = ITOC(inode);
 
 	error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
-	unlock_kernel();
 	if (error)
 		goto fail;
 	SetPageUptodate(page);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index f09c5ed76f6..c3563cab975 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,6 +27,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
@@ -604,9 +605,10 @@ static void coda_unblock_signals(sigset_t *old)
 	(((r)->uc_opcode != CODA_CLOSE && \
 	  (r)->uc_opcode != CODA_STORE && \
 	  (r)->uc_opcode != CODA_RELEASE) || \
-	 (r)->uc_flags & REQ_READ))
+	 (r)->uc_flags & CODA_REQ_READ))
 
-static inline void coda_waitfor_upcall(struct upc_req *req)
+static inline void coda_waitfor_upcall(struct venus_comm *vcp,
+				       struct upc_req *req)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	unsigned long timeout = jiffies + coda_timeout * HZ;
@@ -624,7 +626,7 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
 		set_current_state(TASK_UNINTERRUPTIBLE);
 
 		/* got a reply */
-		if (req->uc_flags & (REQ_WRITE | REQ_ABORT))
+		if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT))
 			break;
 
 		if (blocked && time_after(jiffies, timeout) &&
@@ -639,10 +641,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
 			break;
 		}
 
+		mutex_unlock(&vcp->vc_mutex);
 		if (blocked)
 			schedule_timeout(HZ);
 		else
 			schedule();
+		mutex_lock(&vcp->vc_mutex);
 	}
 	if (blocked)
 		coda_unblock_signals(&old);
@@ -667,18 +671,23 @@ static int coda_upcall(struct venus_comm *vcp,
 {
 	union outputArgs *out;
 	union inputArgs *sig_inputArgs;
-	struct upc_req *req, *sig_req;
-	int error = 0;
+	struct upc_req *req = NULL, *sig_req;
+	int error;
+
+	mutex_lock(&vcp->vc_mutex);
 
 	if (!vcp->vc_inuse) {
 		printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
-		return -ENXIO;
+		error = -ENXIO;
+		goto exit;
 	}
 
 	/* Format the request message. */
 	req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
-	if (!req)
-		return -ENOMEM;
+	if (!req) {
+		error = -ENOMEM;
+		goto exit;
+	}
 
 	req->uc_data = (void *)buffer;
 	req->uc_flags = 0;
@@ -705,10 +714,10 @@ static int coda_upcall(struct venus_comm *vcp,
 	 * ENODEV. */
 
 	/* Go to sleep.  Wake up on signals only after the timeout. */
-	coda_waitfor_upcall(req);
+	coda_waitfor_upcall(vcp, req);
 
 	/* Op went through, interrupt or not... */
-	if (req->uc_flags & REQ_WRITE) {
+	if (req->uc_flags & CODA_REQ_WRITE) {
 		out = (union outputArgs *)req->uc_data;
 		/* here we map positive Venus errors to kernel errors */
 		error = -out->oh.result;
@@ -717,13 +726,13 @@ static int coda_upcall(struct venus_comm *vcp,
 	}
 
 	error = -EINTR;
-	if ((req->uc_flags & REQ_ABORT) || !signal_pending(current)) {
+	if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) {
 		printk(KERN_WARNING "coda: Unexpected interruption.\n");
 		goto exit;
 	}
 
 	/* Interrupted before venus read it. */
-	if (!(req->uc_flags & REQ_READ))
+	if (!(req->uc_flags & CODA_REQ_READ))
 		goto exit;
 
 	/* Venus saw the upcall, make sure we can send interrupt signal */
@@ -747,7 +756,7 @@ static int coda_upcall(struct venus_comm *vcp,
 	sig_inputArgs->ih.opcode = CODA_SIGNAL;
 	sig_inputArgs->ih.unique = req->uc_unique;
 
-	sig_req->uc_flags = REQ_ASYNC;
+	sig_req->uc_flags = CODA_REQ_ASYNC;
 	sig_req->uc_opcode = sig_inputArgs->ih.opcode;
 	sig_req->uc_unique = sig_inputArgs->ih.unique;
 	sig_req->uc_inSize = sizeof(struct coda_in_hdr);
@@ -759,6 +768,7 @@ static int coda_upcall(struct venus_comm *vcp,
 
 exit:
 	kfree(req);
+	mutex_unlock(&vcp->vc_mutex);
 	return error;
 }
 
@@ -796,21 +806,24 @@ exit:
 *
 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
 
-int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
 {
 	struct inode *inode = NULL;
-	struct CodaFid *fid, *newfid;
+	struct CodaFid *fid = NULL, *newfid;
+	struct super_block *sb;
 
 	/* Handle invalidation requests. */
-	if ( !sb || !sb->s_root)
-		return 0;
+	mutex_lock(&vcp->vc_mutex);
+	sb = vcp->vc_sb;
+	if (!sb || !sb->s_root)
+		goto unlock_out;
 
 	switch (opcode) {
 	case CODA_FLUSH:
 		coda_cache_clear_all(sb);
 		shrink_dcache_sb(sb);
 		if (sb->s_root->d_inode)
			coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
 		break;
 
 	case CODA_PURGEUSER:
@@ -819,45 +832,53 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
 
 	case CODA_ZAPDIR:
 		fid = &out->coda_zapdir.CodaFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode) {
-			coda_flag_inode_children(inode, C_PURGE);
-			coda_flag_inode(inode, C_VATTR);
-		}
 		break;
 
 	case CODA_ZAPFILE:
 		fid = &out->coda_zapfile.CodaFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode)
-			coda_flag_inode(inode, C_VATTR);
 		break;
 
 	case CODA_PURGEFID:
 		fid = &out->coda_purgefid.CodaFid;
+		break;
+
+	case CODA_REPLACE:
+		fid = &out->coda_replace.OldFid;
+		break;
+	}
+	if (fid)
 		inode = coda_fid_to_inode(fid, sb);
-		if (inode) {
-			coda_flag_inode_children(inode, C_PURGE);
 
-			/* catch the dentries later if some are still busy */
-			coda_flag_inode(inode, C_PURGE);
-			d_prune_aliases(inode);
+unlock_out:
+	mutex_unlock(&vcp->vc_mutex);
 
-		}
+	if (!inode)
+		return 0;
+
+	switch (opcode) {
+	case CODA_ZAPDIR:
+		coda_flag_inode_children(inode, C_PURGE);
+		coda_flag_inode(inode, C_VATTR);
+		break;
+
+	case CODA_ZAPFILE:
+		coda_flag_inode(inode, C_VATTR);
+		break;
+
+	case CODA_PURGEFID:
+		coda_flag_inode_children(inode, C_PURGE);
+
+		/* catch the dentries later if some are still busy */
+		coda_flag_inode(inode, C_PURGE);
+		d_prune_aliases(inode);
 		break;
 
 	case CODA_REPLACE:
-		fid = &out->coda_replace.OldFid;
 		newfid = &out->coda_replace.NewFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode)
-			coda_replace_fid(inode, fid, newfid);
+		coda_replace_fid(inode, fid, newfid);
 		break;
 	}
-
-	if (inode)
-		iput(inode);
-
+	iput(inode);
 	return 0;
 }
 
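The reworked coda_downcall has a common two-phase shape: resolve the fid to a referenced inode while vc_mutex is held, then do the heavier flagging and dentry pruning after dropping the lock. Holding a reference is what keeps the object alive once the lock is gone. A compact, runnable userspace sketch of the same idea, with hypothetical names and a plain refcount standing in for iget/iput:

#include <pthread.h>
#include <stdio.h>

struct obj { int refs; int flags; };

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj table[16];

static struct obj *lookup_get(int id)
{
	struct obj *o = NULL;

	pthread_mutex_lock(&table_lock);
	if (id >= 0 && id < 16) {
		o = &table[id];
		o->refs++;		/* pin it before unlocking */
	}
	pthread_mutex_unlock(&table_lock);
	return o;
}

static void put(struct obj *o)
{
	pthread_mutex_lock(&table_lock);
	o->refs--;
	pthread_mutex_unlock(&table_lock);
}

int main(void)
{
	struct obj *o = lookup_get(3);

	if (!o)
		return 0;
	o->flags |= 1;		/* the "work" runs outside table_lock */
	put(o);
	printf("flags=%d refs=%d\n", o->flags, o->refs);
	return 0;
}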
diff --git a/fs/compat.c b/fs/compat.c
index 6490d2134ff..c580c322fa6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -8,13 +8,14 @@
  *  Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
  *  Copyright (C) 1998       Eddie C. Dost  (ecd@skynet.be)
  *  Copyright (C) 2001,2002  Andi Kleen, SuSE Labs
- *  Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
+ *  Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2 as
  *  published by the Free Software Foundation.
  */
 
+#include <linux/stddef.h>
 #include <linux/kernel.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
@@ -28,8 +29,6 @@
 #include <linux/vfs.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
-#include <linux/smb.h>
-#include <linux/smb_mount.h>
 #include <linux/ncp_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/syscalls.h>
@@ -50,6 +49,7 @@
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -76,7 +76,8 @@ int compat_printk(const char *fmt, ...)
 * Not all architectures have sys_utime, so implement this in terms
 * of sys_utimes.
 */
-asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __user *t)
+asmlinkage long compat_sys_utime(const char __user *filename,
+				 struct compat_utimbuf __user *t)
 {
 	struct timespec tv[2];
 
@@ -90,7 +91,7 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags)
+asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
 {
 	struct timespec tv[2];
 
@@ -105,7 +106,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
 	return do_utimes(dfd, filename, t ? tv : NULL, flags);
 }
 
-asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
 {
 	struct timespec tv[2];
 
@@ -124,7 +125,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, st
 	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
 {
 	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
@@ -168,7 +169,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 	return err;
 }
 
-asmlinkage long compat_sys_newstat(char __user * filename,
+asmlinkage long compat_sys_newstat(const char __user * filename,
		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
@@ -180,7 +181,7 @@ asmlinkage long compat_sys_newstat(char __user * filename,
 	return cp_compat_stat(&stat, statbuf);
 }
 
-asmlinkage long compat_sys_newlstat(char __user * filename,
+asmlinkage long compat_sys_newlstat(const char __user * filename,
		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
@@ -193,7 +194,8 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
 }
 
 #ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_newfstatat(unsigned int dfd,
		const char __user *filename,
		struct compat_stat __user *statbuf, int flag)
 {
 	struct kstat stat;
@@ -266,7 +268,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(path.dentry, &tmp);
+		error = vfs_statfs(&path, &tmp);
 		if (!error)
 			error = put_compat_statfs(buf, &tmp);
 		path_put(&path);
@@ -284,7 +286,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_path.dentry, &tmp);
+	error = vfs_statfs(&file->f_path, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
 	fput(file);
@@ -334,7 +336,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(path.dentry, &tmp);
+		error = vfs_statfs(&path, &tmp);
 		if (!error)
 			error = put_compat_statfs64(buf, &tmp);
 		path_put(&path);
@@ -355,7 +357,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_path.dentry, &tmp);
+	error = vfs_statfs(&file->f_path, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
 	fput(file);
@@ -378,7 +380,7 @@ asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
 	sb = user_get_super(new_decode_dev(dev));
 	if (!sb)
 		return -EINVAL;
-	err = vfs_statfs(sb->s_root, &sbuf);
+	err = statfs_by_dentry(sb->s_root, &sbuf);
 	drop_super(sb);
 	if (err)
 		return err;
@@ -605,14 +607,14 @@ ssize_t compat_rw_copy_check_uvector(int type,
 	/*
 	 * Single unix specification:
 	 *  We should -EINVAL if an element length is not >= 0 and fitting an
-	 *  ssize_t. The total length is fitting an ssize_t
+	 *  ssize_t.
 	 *
-	 * Be careful here because iov_len is a size_t not an ssize_t
+	 *  In Linux, the total length is limited to MAX_RW_COUNT, there is
+	 *  no overflow possibility.
 	 */
 	tot_len = 0;
 	ret = -EINVAL;
 	for (seg = 0; seg < nr_segs; seg++) {
-		compat_ssize_t tmp = tot_len;
 		compat_uptr_t buf;
 		compat_ssize_t len;
 
@@ -623,13 +625,13 @@ ssize_t compat_rw_copy_check_uvector(int type,
 		}
 		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
 			goto out;
-		tot_len += len;
-		if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
-			goto out;
 		if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
 			ret = -EFAULT;
 			goto out;
 		}
+		if (len > MAX_RW_COUNT - tot_len)
+			len = MAX_RW_COUNT - tot_len;
+		tot_len += len;
 		iov->iov_base = compat_ptr(buf);
 		iov->iov_len = (compat_size_t) len;
 		uvector++;
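The new bound works because each segment length is clamped before it is accumulated: tot_len can grow to at most MAX_RW_COUNT, so the signed total can never wrap, whereas the old code added first and then tried to detect wraparound after the fact. A standalone sketch of the clamp; the constant mirrors the kernel's MAX_RW_COUNT (INT_MAX & PAGE_MASK, i.e. 0x7ffff000 with 4 KiB pages):

#include <stdio.h>

#define MAX_RW_COUNT 0x7ffff000L	/* stand-in for the kernel constant */

int main(void)
{
	long lengths[] = { 0x7fff0000L, 0x7fff0000L };	/* would wrap if summed naively */
	long tot_len = 0;

	for (int i = 0; i < 2; i++) {
		long len = lengths[i];

		if (len > MAX_RW_COUNT - tot_len)
			len = MAX_RW_COUNT - tot_len;	/* truncate, never overflow */
		tot_len += len;
	}
	printf("tot_len = %#lx\n", tot_len);	/* == MAX_RW_COUNT */
	return 0;
}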
@@ -742,30 +744,6 @@ static void *do_ncp_super_data_conv(void *raw_data)
 	return raw_data;
 }
 
-struct compat_smb_mount_data {
-	compat_int_t version;
-	__compat_uid_t mounted_uid;
-	__compat_uid_t uid;
-	__compat_gid_t gid;
-	compat_mode_t file_mode;
-	compat_mode_t dir_mode;
-};
-
-static void *do_smb_super_data_conv(void *raw_data)
-{
-	struct smb_mount_data *s = raw_data;
-	struct compat_smb_mount_data *c_s = raw_data;
-
-	if (c_s->version != SMB_MOUNT_OLDVERSION)
-		goto out;
-	s->dir_mode = c_s->dir_mode;
-	s->file_mode = c_s->file_mode;
-	s->gid = c_s->gid;
-	s->uid = c_s->uid;
-	s->mounted_uid = c_s->mounted_uid;
- out:
-	return raw_data;
-}
 
 struct compat_nfs_string {
 	compat_uint_t len;
@@ -832,13 +810,13 @@ static int do_nfs4_super_data_conv(void *raw_data)
 	return 0;
 }
 
-#define SMBFS_NAME	"smbfs"
 #define NCPFS_NAME	"ncpfs"
 #define NFS4_NAME	"nfs4"
 
-asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
-				 char __user * type, unsigned long flags,
-				 void __user * data)
+asmlinkage long compat_sys_mount(const char __user * dev_name,
				 const char __user * dir_name,
				 const char __user * type, unsigned long flags,
				 const void __user * data)
 {
 	char *kernel_type;
 	unsigned long data_page;
@@ -866,9 +844,7 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 	retval = -EINVAL;
 
 	if (kernel_type && data_page) {
-		if (!strcmp(kernel_type, SMBFS_NAME)) {
-			do_smb_super_data_conv((void *)data_page);
-		} else if (!strcmp(kernel_type, NCPFS_NAME)) {
+		if (!strcmp(kernel_type, NCPFS_NAME)) {
 			do_ncp_super_data_conv((void *)data_page);
 		} else if (!strcmp(kernel_type, NFS4_NAME)) {
 			if (do_nfs4_super_data_conv((void *) data_page))
@@ -891,8 +867,6 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 	return retval;
 }
 
-#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
-
 struct compat_old_linux_dirent {
	compat_ulong_t	d_ino;
	compat_ulong_t	d_offset;
@@ -981,7 +955,8 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
 	struct compat_linux_dirent __user * dirent;
 	struct compat_getdents_callback *buf = __buf;
 	compat_ulong_t d_ino;
-	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t));
+	int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
+		namlen + 2, sizeof(compat_long_t));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
@@ -1068,8 +1043,8 @@ static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t
 {
 	struct linux_dirent64 __user *dirent;
 	struct compat_getdents_callback64 *buf = __buf;
-	int jj = NAME_OFFSET(dirent);
-	int reclen = ALIGN(jj + namlen + 1, sizeof(u64));
+	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
+		sizeof(u64));
 	u64 off;
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
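Both filldir conversions compute the record header size with offsetof() on the struct type itself, replacing the old NAME_OFFSET() macro that did pointer arithmetic on a not-yet-valid dirent pointer (which is why linux/stddef.h is now included above). A standalone sketch with a toy record layout; ALIGN here is a simplified version of the kernel macro:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

struct dirent64_like {
	uint64_t d_ino;
	int64_t  d_off;
	unsigned short d_reclen;
	unsigned char  d_type;
	char d_name[];		/* name bytes are copied in after the header */
};

int main(void)
{
	int namlen = 11;	/* e.g. "hello_world" */
	size_t reclen = ALIGN(offsetof(struct dirent64_like, d_name) + namlen + 1,
			      sizeof(uint64_t));

	printf("reclen = %zu\n", reclen);	/* header + name + NUL, 8-byte aligned */
	return 0;
}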
@@ -1150,7 +1125,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 {
 	compat_ssize_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov;
+	struct iovec *iov = iovstack;
 	ssize_t ret;
 	io_fn_t fn;
 	iov_fn_t fnv;
@@ -1193,11 +1168,10 @@ out:
 	if (iov != iovstack)
 		kfree(iov);
 	if ((ret + (type == READ)) > 0) {
-		struct dentry *dentry = file->f_path.dentry;
 		if (type == READ)
-			fsnotify_access(dentry);
+			fsnotify_access(file);
 		else
-			fsnotify_modify(dentry);
+			fsnotify_modify(file);
 	}
 	return ret;
 }
@@ -1961,7 +1935,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
-#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
+#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
 /* Stuff for NFS server syscalls... */
 struct compat_nfsctl_svc {
 	u16 svc32_port;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 641640dc7ae..410ed188faa 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -4,7 +4,7 @@
 * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
 * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
 * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs
- * Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
+ * Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
 *
 * These routines maintain argument size conversion between 32bit and 64bit
 * ioctls.
@@ -46,7 +46,6 @@
 #include <linux/videodev.h>
 #include <linux/netdevice.h>
 #include <linux/raw.h>
-#include <linux/smb_fs.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/rtc.h>
@@ -131,23 +130,6 @@ static int w_long(unsigned int fd, unsigned int cmd,
 	return err;
 }
 
-static int rw_long(unsigned int fd, unsigned int cmd,
-		compat_ulong_t __user *argp)
-{
-	mm_segment_t old_fs = get_fs();
-	int err;
-	unsigned long val;
-
-	if(get_user(val, argp))
-		return -EFAULT;
-	set_fs (KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&val);
-	set_fs (old_fs);
-	if (!err && put_user(val, argp))
-		return -EFAULT;
-	return err;
-}
-
 struct compat_video_event {
	int32_t		type;
	compat_time_t	timestamp;
@@ -575,34 +557,12 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
 
 #endif /* CONFIG_BLOCK */
 
-static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
-			compat_uid_t __user *argp)
-{
-	mm_segment_t old_fs = get_fs();
-	__kernel_uid_t kuid;
-	int err;
-
-	cmd = SMB_IOC_GETMOUNTUID;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&kuid);
-	set_fs(old_fs);
-
-	if (err >= 0)
-		err = put_user(kuid, argp);
-
-	return err;
-}
-
-static int ioc_settimeout(unsigned int fd, unsigned int cmd,
-		compat_ulong_t __user *argp)
-{
-	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp);
-}
-
 /* Bluetooth ioctls */
 #define HCIUARTSETPROTO		_IOW('U', 200, int)
 #define HCIUARTGETPROTO		_IOR('U', 201, int)
+#define HCIUARTGETDEVICE	_IOR('U', 202, int)
+#define HCIUARTSETFLAGS		_IOW('U', 203, int)
+#define HCIUARTGETFLAGS		_IOR('U', 204, int)
 
 #define BNEPCONNADD	_IOW('B', 200, int)
 #define BNEPCONNDEL	_IOW('B', 201, int)
@@ -619,69 +579,6 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd,
 #define HIDPGETCONNLIST	_IOR('H', 210, int)
 #define HIDPGETCONNINFO	_IOR('H', 211, int)
 
-#ifdef CONFIG_BLOCK
-struct raw32_config_request
-{
-	compat_int_t	raw_minor;
-	__u64	block_major;
-	__u64	block_minor;
-} __attribute__((packed));
-
-static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
-	int ret;
-
-	if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
-		return -EFAULT;
-
-	ret = __get_user(req->raw_minor, &user_req->raw_minor);
-	ret |= __get_user(req->block_major, &user_req->block_major);
-	ret |= __get_user(req->block_minor, &user_req->block_minor);
-
-	return ret ? -EFAULT : 0;
-}
-
-static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
-	int ret;
-
-	if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
-		return -EFAULT;
-
-	ret = __put_user(req->raw_minor, &user_req->raw_minor);
-	ret |= __put_user(req->block_major, &user_req->block_major);
-	ret |= __put_user(req->block_minor, &user_req->block_minor);
-
-	return ret ? -EFAULT : 0;
-}
-
-static int raw_ioctl(unsigned fd, unsigned cmd,
-		struct raw32_config_request __user *user_req)
-{
-	int ret;
-
-	switch (cmd) {
-	case RAW_SETBIND:
-	default: {	/* RAW_GETBIND */
-		struct raw_config_request req;
-		mm_segment_t oldfs = get_fs();
-
-		if ((ret = get_raw32_request(&req, user_req)))
-			return ret;
-
-		set_fs(KERNEL_DS);
-		ret = sys_ioctl(fd,cmd,(unsigned long)&req);
-		set_fs(oldfs);
-
-		if ((!ret) && (cmd == RAW_GETBIND)) {
-			ret = set_raw32_request(&req, user_req);
-		}
-		break;
-	}
-	}
-	return ret;
-}
-#endif /* CONFIG_BLOCK */
 
 struct serial_struct32 {
	compat_int_t	type;
@@ -966,6 +863,7 @@ COMPATIBLE_IOCTL(TIOCGPGRP)
 COMPATIBLE_IOCTL(TIOCGPTN)
 COMPATIBLE_IOCTL(TIOCSPTLCK)
 COMPATIBLE_IOCTL(TIOCSERGETLSR)
+COMPATIBLE_IOCTL(TIOCSIG)
 #ifdef TCGETS2
 COMPATIBLE_IOCTL(TCGETS2)
 COMPATIBLE_IOCTL(TCSETS2)
@@ -1281,18 +1179,9 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
 COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
 COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
 COMPATIBLE_IOCTL(OSS_GETVERSION)
-/* AUTOFS */
-COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC)
-COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
-COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
-COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI)
-COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER)
-COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT)
 /* Raw devices */
 COMPATIBLE_IOCTL(RAW_SETBIND)
 COMPATIBLE_IOCTL(RAW_GETBIND)
-/* SMB ioctls which do not need any translations */
-COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
 /* Watchdog */
 COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
 COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -1328,6 +1217,8 @@ COMPATIBLE_IOCTL(HCISETLINKPOL)
 COMPATIBLE_IOCTL(HCISETLINKMODE)
 COMPATIBLE_IOCTL(HCISETACLMTU)
 COMPATIBLE_IOCTL(HCISETSCOMTU)
+COMPATIBLE_IOCTL(HCIBLOCKADDR)
+COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
 COMPATIBLE_IOCTL(HCIINQUIRY)
 COMPATIBLE_IOCTL(HCIUARTSETPROTO)
 COMPATIBLE_IOCTL(HCIUARTGETPROTO)
@@ -1547,18 +1438,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
 	case MTIOCGET32:
 	case MTIOCPOS32:
 		return mt_ioctl_trans(fd, cmd, argp);
-	/* Raw devices */
-	case RAW_SETBIND:
-	case RAW_GETBIND:
-		return raw_ioctl(fd, cmd, argp);
 #endif
-#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
-	case AUTOFS_IOC_SETTIMEOUT32:
-		return ioc_settimeout(fd, cmd, argp);
-	/* One SMB ioctl needs translations. */
-#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
-	case SMB_IOC_GETMOUNTUID_32:
-		return do_smb_getmountuid(fd, cmd, argp);
 	/* Serial */
 	case TIOCGSERIAL:
 	case TIOCSSERIAL:
@@ -1609,9 +1489,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
 	case KDSKBMETA:
 	case KDSKBLED:
 	case KDSETLED:
-	/* AUTOFS */
-	case AUTOFS_IOC_READY:
-	case AUTOFS_IOC_FAIL:
 	/* NBD */
 	case NBD_SET_SOCK:
 	case NBD_SET_BLKSIZE:
@@ -1729,8 +1606,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 		goto out_fput;
 	}
 
-	if (!filp->f_op ||
-	    (!filp->f_op->ioctl && !filp->f_op->unlocked_ioctl))
+	if (!filp->f_op || !filp->f_op->unlocked_ioctl)
 		goto do_ioctl;
 	break;
 	}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6..253476d78ed 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
 	struct inode * inode = new_inode(configfs_sb);
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mapping->a_ops = &configfs_aops;
 		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
 		inode->i_op = &configfs_inode_operations;
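This one-liner follows from new_inode() no longer handing out a default inode number in this release: pseudo filesystems without on-disk inode numbers now fetch one from the shared get_next_ino() counter themselves. A kernel-style sketch of the pattern, not standalone; the wrapper name is hypothetical:

struct inode *example_new_inode(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (inode)
		inode->i_ino = get_next_ino();	/* previously implicit */
	return inode;
}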
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8c8d64230c2..7d3607febe1 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -104,16 +104,16 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int configfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
+	return mount_single(fs_type, flags, data, configfs_fill_super);
 }
 
 static struct file_system_type configfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "configfs",
-	.get_sb		= configfs_get_sb,
+	.mount		= configfs_do_mount,
	.kill_sb	= kill_litter_super,
 };
 
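The configfs hunk is one instance of the tree-wide .get_sb to .mount conversion running through this diff (coda and cramfs get the same treatment): the callback now returns the root dentry instead of filling in a caller-supplied vfsmount, and the get_sb_single/get_sb_nodev/get_sb_bdev helpers become mount_single/mount_nodev/mount_bdev with the mnt argument dropped. A kernel-style sketch for a hypothetical filesystem, not standalone:

static struct dentry *example_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	/* example_fill_super is an assumed fill_super callback */
	return mount_nodev(fs_type, flags, data, example_fill_super);
}

static struct file_system_type example_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "example",
	.mount		= example_mount,
	.kill_sb	= kill_anon_super,
};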
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index dd3634e4c96..32fd5fe9ca0 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -39,66 +39,55 @@ static DEFINE_MUTEX(read_mutex);
 #define CRAMINO(x)	(((x)->offset && (x)->size)?(x)->offset<<2:1)
 #define OFFSET(x)	((x)->i_ino)
 
-
-static int cramfs_iget5_test(struct inode *inode, void *opaque)
-{
-	struct cramfs_inode *cramfs_inode = opaque;
-	return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;
-}
-
-static int cramfs_iget5_set(struct inode *inode, void *opaque)
+static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode)
 {
-	struct cramfs_inode *cramfs_inode = opaque;
-	inode->i_ino = CRAMINO(cramfs_inode);
-	return 0;
+	static struct timespec zerotime;
+	inode->i_mode = cramfs_inode->mode;
+	inode->i_uid = cramfs_inode->uid;
+	inode->i_size = cramfs_inode->size;
+	inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+	inode->i_gid = cramfs_inode->gid;
+	/* Struct copy intentional */
+	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+	/* inode->i_nlink is left 1 - arguably wrong for directories,
+	   but it's the best we can do without reading the directory
+	   contents.  1 yields the right result in GNU find, even
+	   without -noleaf option. */
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_fop = &generic_ro_fops;
+		inode->i_data.a_ops = &cramfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &cramfs_dir_inode_operations;
+		inode->i_fop = &cramfs_directory_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &page_symlink_inode_operations;
+		inode->i_data.a_ops = &cramfs_aops;
+	} else {
+		init_special_inode(inode, inode->i_mode,
+			old_decode_dev(cramfs_inode->size));
+	}
 }
 
 static struct inode *get_cramfs_inode(struct super_block *sb,
				struct cramfs_inode * cramfs_inode)
 {
-	struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
-					    cramfs_iget5_test, cramfs_iget5_set,
-					    cramfs_inode);
-	static struct timespec zerotime;
-
-	if (inode && (inode->i_state & I_NEW)) {
-		inode->i_mode = cramfs_inode->mode;
-		inode->i_uid = cramfs_inode->uid;
-		inode->i_size = cramfs_inode->size;
-		inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
-		inode->i_gid = cramfs_inode->gid;
-		/* Struct copy intentional */
-		inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
-		/* inode->i_nlink is left 1 - arguably wrong for directories,
-		   but it's the best we can do without reading the directory
-		   contents.  1 yields the right result in GNU find, even
-		   without -noleaf option. */
-		if (S_ISREG(inode->i_mode)) {
-			inode->i_fop = &generic_ro_fops;
-			inode->i_data.a_ops = &cramfs_aops;
-		} else if (S_ISDIR(inode->i_mode)) {
-			inode->i_op = &cramfs_dir_inode_operations;
-			inode->i_fop = &cramfs_directory_operations;
-		} else if (S_ISLNK(inode->i_mode)) {
-			inode->i_op = &page_symlink_inode_operations;
-			inode->i_data.a_ops = &cramfs_aops;
-		} else {
-			init_special_inode(inode, inode->i_mode,
-				old_decode_dev(cramfs_inode->size));
+	struct inode *inode;
+	if (CRAMINO(cramfs_inode) == 1) {
+		inode = new_inode(sb);
+		if (inode) {
+			inode->i_ino = 1;
+			setup_inode(inode, cramfs_inode);
+		}
+	} else {
+		inode = iget_locked(sb, CRAMINO(cramfs_inode));
+		if (inode && (inode->i_state & I_NEW)) {
+			setup_inode(inode, cramfs_inode);
+			unlock_new_inode(inode);
 		}
-		unlock_new_inode(inode);
 	}
 	return inode;
 }
 
-static void cramfs_drop_inode(struct inode *inode)
-{
-	if (inode->i_ino == 1)
-		generic_delete_inode(inode);
-	else
-		generic_drop_inode(inode);
-}
-
 /*
 * We have our own block cache: don't fill up the buffer cache
 * with the rom-image, because the way the filesystem is set
@@ -542,20 +531,18 @@ static const struct super_operations cramfs_ops = {
	.put_super	= cramfs_put_super,
	.remount_fs	= cramfs_remount,
	.statfs		= cramfs_statfs,
-	.drop_inode	= cramfs_drop_inode,
 };
 
-static int cramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *cramfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
 }
 
 static struct file_system_type cramfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "cramfs",
-	.get_sb		= cramfs_get_sb,
+	.mount		= cramfs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
 };
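get_cramfs_inode() now uses plain iget_locked() keyed on the inode number, while entries whose CRAMINO() collapses to 1 get a fresh, never-cached inode from new_inode(), so distinct on-disk objects sharing that fallback number can no longer alias each other in the inode cache (which is also why the drop_inode hack could go). The iget_locked() idiom itself, as a kernel-style sketch (not standalone, hypothetical function name):

struct inode *example_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (inode && (inode->i_state & I_NEW)) {
		/* only the allocator of an I_NEW inode may fill it in */
		/* ... read on-disk data and initialize fields here ... */
		unlock_new_inode(inode);	/* publish to other lookups */
	}
	return inode;
}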
diff --git a/fs/dcache.c b/fs/dcache.c
index 86d4db15473..23702a9d4e6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,33 +67,43 @@ struct dentry_stat_t dentry_stat = {
67 .age_limit = 45, 67 .age_limit = 45,
68}; 68};
69 69
70static void __d_free(struct dentry *dentry) 70static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
71static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
72
73#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
74int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
75 size_t *lenp, loff_t *ppos)
76{
77 dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
78 dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
79 return proc_dointvec(table, write, buffer, lenp, ppos);
80}
81#endif
82
83static void __d_free(struct rcu_head *head)
71{ 84{
85 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
86
72 WARN_ON(!list_empty(&dentry->d_alias)); 87 WARN_ON(!list_empty(&dentry->d_alias));
73 if (dname_external(dentry)) 88 if (dname_external(dentry))
74 kfree(dentry->d_name.name); 89 kfree(dentry->d_name.name);
75 kmem_cache_free(dentry_cache, dentry); 90 kmem_cache_free(dentry_cache, dentry);
76} 91}
77 92
78static void d_callback(struct rcu_head *head)
79{
80 struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
81 __d_free(dentry);
82}
83
84/* 93/*
85 * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry 94 * no dcache_lock, please.
86 * inside dcache_lock.
87 */ 95 */
88static void d_free(struct dentry *dentry) 96static void d_free(struct dentry *dentry)
89{ 97{
98 percpu_counter_dec(&nr_dentry);
90 if (dentry->d_op && dentry->d_op->d_release) 99 if (dentry->d_op && dentry->d_op->d_release)
91 dentry->d_op->d_release(dentry); 100 dentry->d_op->d_release(dentry);
101
92 /* if dentry was never inserted into hash, immediate free is OK */ 102 /* if dentry was never inserted into hash, immediate free is OK */
93 if (hlist_unhashed(&dentry->d_hash)) 103 if (hlist_unhashed(&dentry->d_hash))
94 __d_free(dentry); 104 __d_free(&dentry->d_u.d_rcu);
95 else 105 else
96 call_rcu(&dentry->d_u.d_rcu, d_callback); 106 call_rcu(&dentry->d_u.d_rcu, __d_free);
97} 107}
98 108
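
Folding d_callback() into __d_free() follows the standard call_rcu() shape: the callback takes the rcu_head and recovers its enclosing object with container_of(). The generic pattern, sketched with a hypothetical struct obj:

    struct obj {
            int data;
            struct rcu_head rcu;    /* embedded for deferred freeing */
    };

    static void obj_free_rcu(struct rcu_head *head)
    {
            /* recover the enclosing object from its rcu_head */
            struct obj *obj = container_of(head, struct obj, rcu);

            kfree(obj);
    }

    static void obj_free(struct obj *obj)
    {
            /* free only after all current RCU readers are done */
            call_rcu(&obj->rcu, obj_free_rcu);
    }
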
99/* 109/*
@@ -123,37 +133,34 @@ static void dentry_iput(struct dentry * dentry)
123} 133}
124 134
125/* 135/*
126 * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. 136 * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
127 */ 137 */
128static void dentry_lru_add(struct dentry *dentry) 138static void dentry_lru_add(struct dentry *dentry)
129{ 139{
130 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 140 if (list_empty(&dentry->d_lru)) {
131 dentry->d_sb->s_nr_dentry_unused++; 141 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
132 dentry_stat.nr_unused++; 142 dentry->d_sb->s_nr_dentry_unused++;
133} 143 percpu_counter_inc(&nr_dentry_unused);
134 144 }
135static void dentry_lru_add_tail(struct dentry *dentry)
136{
137 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
138 dentry->d_sb->s_nr_dentry_unused++;
139 dentry_stat.nr_unused++;
140} 145}
141 146
142static void dentry_lru_del(struct dentry *dentry) 147static void dentry_lru_del(struct dentry *dentry)
143{ 148{
144 if (!list_empty(&dentry->d_lru)) { 149 if (!list_empty(&dentry->d_lru)) {
145 list_del(&dentry->d_lru); 150 list_del_init(&dentry->d_lru);
146 dentry->d_sb->s_nr_dentry_unused--; 151 dentry->d_sb->s_nr_dentry_unused--;
147 dentry_stat.nr_unused--; 152 percpu_counter_dec(&nr_dentry_unused);
148 } 153 }
149} 154}
150 155
151static void dentry_lru_del_init(struct dentry *dentry) 156static void dentry_lru_move_tail(struct dentry *dentry)
152{ 157{
153 if (likely(!list_empty(&dentry->d_lru))) { 158 if (list_empty(&dentry->d_lru)) {
154 list_del_init(&dentry->d_lru); 159 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
155 dentry->d_sb->s_nr_dentry_unused--; 160 dentry->d_sb->s_nr_dentry_unused++;
156 dentry_stat.nr_unused--; 161 percpu_counter_inc(&nr_dentry_unused);
162 } else {
163 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
157 } 164 }
158} 165}
159 166
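
These helpers depend on list_del_init() rather than list_del(): the former leaves the node self-linked, so list_empty(&dentry->d_lru) stays a valid "is it on the LRU?" test, while the latter poisons the pointers. The idiom in miniature:

    /* take the entry off the LRU but keep it self-linked */
    list_del_init(&dentry->d_lru);

    /* ...later, list_empty() is a cheap membership test: */
    if (list_empty(&dentry->d_lru))
            list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
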
@@ -172,7 +179,6 @@ static struct dentry *d_kill(struct dentry *dentry)
172 struct dentry *parent; 179 struct dentry *parent;
173 180
174 list_del(&dentry->d_u.d_child); 181 list_del(&dentry->d_u.d_child);
175 dentry_stat.nr_dentry--; /* For d_free, below */
176 /*drops the locks, at that point nobody can reach this dentry */ 182 /*drops the locks, at that point nobody can reach this dentry */
177 dentry_iput(dentry); 183 dentry_iput(dentry);
178 if (IS_ROOT(dentry)) 184 if (IS_ROOT(dentry))
@@ -237,13 +243,15 @@ repeat:
237 if (dentry->d_op->d_delete(dentry)) 243 if (dentry->d_op->d_delete(dentry))
238 goto unhash_it; 244 goto unhash_it;
239 } 245 }
246
240 /* Unreachable? Get rid of it */ 247 /* Unreachable? Get rid of it */
241 if (d_unhashed(dentry)) 248 if (d_unhashed(dentry))
242 goto kill_it; 249 goto kill_it;
243 if (list_empty(&dentry->d_lru)) { 250
244 dentry->d_flags |= DCACHE_REFERENCED; 251 /* Otherwise leave it cached and ensure it's on the LRU */
245 dentry_lru_add(dentry); 252 dentry->d_flags |= DCACHE_REFERENCED;
246 } 253 dentry_lru_add(dentry);
254
247 spin_unlock(&dentry->d_lock); 255 spin_unlock(&dentry->d_lock);
248 spin_unlock(&dcache_lock); 256 spin_unlock(&dcache_lock);
249 return; 257 return;
@@ -318,11 +326,10 @@ int d_invalidate(struct dentry * dentry)
318EXPORT_SYMBOL(d_invalidate); 326EXPORT_SYMBOL(d_invalidate);
319 327
320/* This should be called _only_ with dcache_lock held */ 328/* This should be called _only_ with dcache_lock held */
321
322static inline struct dentry * __dget_locked(struct dentry *dentry) 329static inline struct dentry * __dget_locked(struct dentry *dentry)
323{ 330{
324 atomic_inc(&dentry->d_count); 331 atomic_inc(&dentry->d_count);
325 dentry_lru_del_init(dentry); 332 dentry_lru_del(dentry);
326 return dentry; 333 return dentry;
327} 334}
328 335
@@ -441,73 +448,27 @@ static void prune_one_dentry(struct dentry * dentry)
441 448
442 if (dentry->d_op && dentry->d_op->d_delete) 449 if (dentry->d_op && dentry->d_op->d_delete)
443 dentry->d_op->d_delete(dentry); 450 dentry->d_op->d_delete(dentry);
444 dentry_lru_del_init(dentry); 451 dentry_lru_del(dentry);
445 __d_drop(dentry); 452 __d_drop(dentry);
446 dentry = d_kill(dentry); 453 dentry = d_kill(dentry);
447 spin_lock(&dcache_lock); 454 spin_lock(&dcache_lock);
448 } 455 }
449} 456}
450 457
451/* 458static void shrink_dentry_list(struct list_head *list)
452 * Shrink the dentry LRU on a given superblock.
453 * @sb : superblock to shrink dentry LRU.
454 * @count: If count is NULL, we prune all dentries on superblock.
455 * @flags: If flags is non-zero, we need to do special processing based on
456 * which flags are set. This means we don't need to maintain multiple
457 * similar copies of this loop.
458 */
459static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
460{ 459{
461 LIST_HEAD(referenced);
462 LIST_HEAD(tmp);
463 struct dentry *dentry; 460 struct dentry *dentry;
464 int cnt = 0;
465 461
466 BUG_ON(!sb); 462 while (!list_empty(list)) {
467 BUG_ON((flags & DCACHE_REFERENCED) && count == NULL); 463 dentry = list_entry(list->prev, struct dentry, d_lru);
468 spin_lock(&dcache_lock); 464 dentry_lru_del(dentry);
469 if (count != NULL)
470 /* called from prune_dcache() and shrink_dcache_parent() */
471 cnt = *count;
472restart:
473 if (count == NULL)
474 list_splice_init(&sb->s_dentry_lru, &tmp);
475 else {
476 while (!list_empty(&sb->s_dentry_lru)) {
477 dentry = list_entry(sb->s_dentry_lru.prev,
478 struct dentry, d_lru);
479 BUG_ON(dentry->d_sb != sb);
480 465
481 spin_lock(&dentry->d_lock);
482 /*
483 * If we are honouring the DCACHE_REFERENCED flag and
484 * the dentry has this flag set, don't free it. Clear
485 * the flag and put it back on the LRU.
486 */
487 if ((flags & DCACHE_REFERENCED)
488 && (dentry->d_flags & DCACHE_REFERENCED)) {
489 dentry->d_flags &= ~DCACHE_REFERENCED;
490 list_move(&dentry->d_lru, &referenced);
491 spin_unlock(&dentry->d_lock);
492 } else {
493 list_move_tail(&dentry->d_lru, &tmp);
494 spin_unlock(&dentry->d_lock);
495 cnt--;
496 if (!cnt)
497 break;
498 }
499 cond_resched_lock(&dcache_lock);
500 }
501 }
502 while (!list_empty(&tmp)) {
503 dentry = list_entry(tmp.prev, struct dentry, d_lru);
504 dentry_lru_del_init(dentry);
505 spin_lock(&dentry->d_lock);
506 /* 466 /*
507 * We found an inuse dentry which was not removed from 467 * We found an inuse dentry which was not removed from
508 * the LRU because of laziness during lookup. Do not free 468 * the LRU because of laziness during lookup. Do not free
509 * it - just keep it off the LRU list. 469 * it - just keep it off the LRU list.
510 */ 470 */
471 spin_lock(&dentry->d_lock);
511 if (atomic_read(&dentry->d_count)) { 472 if (atomic_read(&dentry->d_count)) {
512 spin_unlock(&dentry->d_lock); 473 spin_unlock(&dentry->d_lock);
513 continue; 474 continue;
@@ -516,13 +477,60 @@ restart:
516 /* dentry->d_lock was dropped in prune_one_dentry() */ 477 /* dentry->d_lock was dropped in prune_one_dentry() */
517 cond_resched_lock(&dcache_lock); 478 cond_resched_lock(&dcache_lock);
518 } 479 }
519 if (count == NULL && !list_empty(&sb->s_dentry_lru)) 480}
520 goto restart; 481
521 if (count != NULL) 482/**
522 *count = cnt; 483 * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
484 * @sb: superblock to shrink dentry LRU.
485 * @count: number of entries to prune
486 * @flags: flags to control the dentry processing
487 *
488 * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
489 */
490static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
491{
492 /* called from prune_dcache() and shrink_dcache_parent() */
493 struct dentry *dentry;
494 LIST_HEAD(referenced);
495 LIST_HEAD(tmp);
496 int cnt = *count;
497
498 spin_lock(&dcache_lock);
499 while (!list_empty(&sb->s_dentry_lru)) {
500 dentry = list_entry(sb->s_dentry_lru.prev,
501 struct dentry, d_lru);
502 BUG_ON(dentry->d_sb != sb);
503
504 /*
505 * If we are honouring the DCACHE_REFERENCED flag and the
506 * dentry has this flag set, don't free it. Clear the flag
507 * and put it back on the LRU.
508 */
509 if (flags & DCACHE_REFERENCED) {
510 spin_lock(&dentry->d_lock);
511 if (dentry->d_flags & DCACHE_REFERENCED) {
512 dentry->d_flags &= ~DCACHE_REFERENCED;
513 list_move(&dentry->d_lru, &referenced);
514 spin_unlock(&dentry->d_lock);
515 cond_resched_lock(&dcache_lock);
516 continue;
517 }
518 spin_unlock(&dentry->d_lock);
519 }
520
521 list_move_tail(&dentry->d_lru, &tmp);
522 if (!--cnt)
523 break;
524 cond_resched_lock(&dcache_lock);
525 }
526
527 *count = cnt;
528 shrink_dentry_list(&tmp);
529
523 if (!list_empty(&referenced)) 530 if (!list_empty(&referenced))
524 list_splice(&referenced, &sb->s_dentry_lru); 531 list_splice(&referenced, &sb->s_dentry_lru);
525 spin_unlock(&dcache_lock); 532 spin_unlock(&dcache_lock);
533
526} 534}
527 535
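
shrink_dentry_list() separates victim selection from disposal: candidates are moved to a private list under the lock, then freed one at a time. The same batching idiom in generic form; struct item and item_free() are hypothetical:

    struct item {
            struct list_head lru;
            /* ... payload ... */
    };

    static void shrink_batch(struct list_head *lru, spinlock_t *lock, int nr)
    {
            LIST_HEAD(tmp);
            struct item *it;

            spin_lock(lock);
            while (nr-- > 0 && !list_empty(lru))
                    list_move_tail(lru->prev, &tmp);  /* oldest entries first */

            /* tmp is private now; drop the lock around each disposal */
            while (!list_empty(&tmp)) {
                    it = list_entry(tmp.prev, struct item, lru);
                    list_del_init(&it->lru);
                    spin_unlock(lock);
                    item_free(it);                    /* may sleep */
                    spin_lock(lock);
            }
            spin_unlock(lock);
    }
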
528/** 536/**
@@ -536,9 +544,9 @@ restart:
536 */ 544 */
537static void prune_dcache(int count) 545static void prune_dcache(int count)
538{ 546{
539 struct super_block *sb, *n; 547 struct super_block *sb, *p = NULL;
540 int w_count; 548 int w_count;
541 int unused = dentry_stat.nr_unused; 549 int unused = percpu_counter_sum_positive(&nr_dentry_unused);
542 int prune_ratio; 550 int prune_ratio;
543 int pruned; 551 int pruned;
544 552
@@ -550,7 +558,7 @@ static void prune_dcache(int count)
550 else 558 else
551 prune_ratio = unused / count; 559 prune_ratio = unused / count;
552 spin_lock(&sb_lock); 560 spin_lock(&sb_lock);
553 list_for_each_entry_safe(sb, n, &super_blocks, s_list) { 561 list_for_each_entry(sb, &super_blocks, s_list) {
554 if (list_empty(&sb->s_instances)) 562 if (list_empty(&sb->s_instances))
555 continue; 563 continue;
556 if (sb->s_nr_dentry_unused == 0) 564 if (sb->s_nr_dentry_unused == 0)
@@ -590,14 +598,16 @@ static void prune_dcache(int count)
590 up_read(&sb->s_umount); 598 up_read(&sb->s_umount);
591 } 599 }
592 spin_lock(&sb_lock); 600 spin_lock(&sb_lock);
593 /* lock was dropped, must reset next */ 601 if (p)
594 list_safe_reset_next(sb, n, s_list); 602 __put_super(p);
595 count -= pruned; 603 count -= pruned;
596 __put_super(sb); 604 p = sb;
597 /* more work left to do? */ 605 /* more work left to do? */
598 if (count <= 0) 606 if (count <= 0)
599 break; 607 break;
600 } 608 }
609 if (p)
610 __put_super(p);
601 spin_unlock(&sb_lock); 611 spin_unlock(&sb_lock);
602 spin_unlock(&dcache_lock); 612 spin_unlock(&dcache_lock);
603} 613}
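
Dropping list_for_each_entry_safe() here is deliberate: once sb_lock is released inside the loop, the saved next pointer can go stale, so the code instead keeps each superblock pinned until the following iteration and releases it with __put_super() under the lock. The idiom in isolation, as a sketch:

    struct super_block *sb, *p = NULL;

    spin_lock(&sb_lock);
    list_for_each_entry(sb, &super_blocks, s_list) {
            sb->s_count++;                  /* pin before dropping the lock */
            spin_unlock(&sb_lock);

            /* ...work that may sleep... */

            spin_lock(&sb_lock);
            if (p)
                    __put_super(p);         /* drop the *previous* pin */
            p = sb;                         /* current stays pinned for now */
    }
    if (p)
            __put_super(p);
    spin_unlock(&sb_lock);
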
@@ -606,13 +616,19 @@ static void prune_dcache(int count)
606 * shrink_dcache_sb - shrink dcache for a superblock 616 * shrink_dcache_sb - shrink dcache for a superblock
607 * @sb: superblock 617 * @sb: superblock
608 * 618 *
609 * Shrink the dcache for the specified super block. This 619 * Shrink the dcache for the specified super block. This is used to free
610 * is used to free the dcache before unmounting a file 620 * the dcache before unmounting a file system.
611 * system
612 */ 621 */
613void shrink_dcache_sb(struct super_block * sb) 622void shrink_dcache_sb(struct super_block *sb)
614{ 623{
615 __shrink_dcache_sb(sb, NULL, 0); 624 LIST_HEAD(tmp);
625
626 spin_lock(&dcache_lock);
627 while (!list_empty(&sb->s_dentry_lru)) {
628 list_splice_init(&sb->s_dentry_lru, &tmp);
629 shrink_dentry_list(&tmp);
630 }
631 spin_unlock(&dcache_lock);
616} 632}
617EXPORT_SYMBOL(shrink_dcache_sb); 633EXPORT_SYMBOL(shrink_dcache_sb);
618 634
@@ -630,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
630 646
631 /* detach this root from the system */ 647 /* detach this root from the system */
632 spin_lock(&dcache_lock); 648 spin_lock(&dcache_lock);
633 dentry_lru_del_init(dentry); 649 dentry_lru_del(dentry);
634 __d_drop(dentry); 650 __d_drop(dentry);
635 spin_unlock(&dcache_lock); 651 spin_unlock(&dcache_lock);
636 652
@@ -644,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
644 spin_lock(&dcache_lock); 660 spin_lock(&dcache_lock);
645 list_for_each_entry(loop, &dentry->d_subdirs, 661 list_for_each_entry(loop, &dentry->d_subdirs,
646 d_u.d_child) { 662 d_u.d_child) {
647 dentry_lru_del_init(loop); 663 dentry_lru_del(loop);
648 __d_drop(loop); 664 __d_drop(loop);
649 cond_resched_lock(&dcache_lock); 665 cond_resched_lock(&dcache_lock);
650 } 666 }
@@ -701,20 +717,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
701 * otherwise we ascend to the parent and move to the 717 * otherwise we ascend to the parent and move to the
702 * next sibling if there is one */ 718 * next sibling if there is one */
703 if (!parent) 719 if (!parent)
704 goto out; 720 return;
705
706 dentry = parent; 721 dentry = parent;
707
708 } while (list_empty(&dentry->d_subdirs)); 722 } while (list_empty(&dentry->d_subdirs));
709 723
710 dentry = list_entry(dentry->d_subdirs.next, 724 dentry = list_entry(dentry->d_subdirs.next,
711 struct dentry, d_u.d_child); 725 struct dentry, d_u.d_child);
712 } 726 }
713out:
714 /* several dentries were freed, need to correct nr_dentry */
715 spin_lock(&dcache_lock);
716 dentry_stat.nr_dentry -= detached;
717 spin_unlock(&dcache_lock);
718} 727}
719 728
720/* 729/*
@@ -828,14 +837,15 @@ resume:
828 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 837 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
829 next = tmp->next; 838 next = tmp->next;
830 839
831 dentry_lru_del_init(dentry);
832 /* 840 /*
833 * move only zero ref count dentries to the end 841 * move only zero ref count dentries to the end
834 * of the unused list for prune_dcache 842 * of the unused list for prune_dcache
835 */ 843 */
836 if (!atomic_read(&dentry->d_count)) { 844 if (!atomic_read(&dentry->d_count)) {
837 dentry_lru_add_tail(dentry); 845 dentry_lru_move_tail(dentry);
838 found++; 846 found++;
847 } else {
848 dentry_lru_del(dentry);
839 } 849 }
840 850
841 /* 851 /*
@@ -898,12 +908,16 @@ EXPORT_SYMBOL(shrink_dcache_parent);
898 */ 908 */
899static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 909static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
900{ 910{
911 int nr_unused;
912
901 if (nr) { 913 if (nr) {
902 if (!(gfp_mask & __GFP_FS)) 914 if (!(gfp_mask & __GFP_FS))
903 return -1; 915 return -1;
904 prune_dcache(nr); 916 prune_dcache(nr);
905 } 917 }
906 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 918
919 nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
920 return (nr_unused / 100) * sysctl_vfs_cache_pressure;
907} 921}
908 922
909static struct shrinker dcache_shrinker = { 923static struct shrinker dcache_shrinker = {
@@ -970,9 +984,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
970 spin_lock(&dcache_lock); 984 spin_lock(&dcache_lock);
971 if (parent) 985 if (parent)
972 list_add(&dentry->d_u.d_child, &parent->d_subdirs); 986 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
973 dentry_stat.nr_dentry++;
974 spin_unlock(&dcache_lock); 987 spin_unlock(&dcache_lock);
975 988
989 percpu_counter_inc(&nr_dentry);
990
976 return dentry; 991 return dentry;
977} 992}
978EXPORT_SYMBOL(d_alloc); 993EXPORT_SYMBOL(d_alloc);
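
The global dentry counts move from dentry_stat fields guarded by dcache_lock to percpu_counters, making increments and decrements lock-free and pushing the cost onto the rare readers. A minimal usage sketch:

    static struct percpu_counter nr_objs;

    static int __init counters_init(void)
    {
            return percpu_counter_init(&nr_objs, 0);
    }

    static void obj_account(void)
    {
            percpu_counter_inc(&nr_objs);   /* fast path: per-cpu slot only */
    }

    static void obj_unaccount(void)
    {
            percpu_counter_dec(&nr_objs);
    }

    static s64 obj_count(void)
    {
            /* slow path: sums every cpu's slot, clamped at zero */
            return percpu_counter_sum_positive(&nr_objs);
    }
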
@@ -1330,31 +1345,13 @@ EXPORT_SYMBOL(d_add_ci);
1330 * d_lookup - search for a dentry 1345 * d_lookup - search for a dentry
1331 * @parent: parent dentry 1346 * @parent: parent dentry
1332 * @name: qstr of name we wish to find 1347 * @name: qstr of name we wish to find
1348 * Returns: dentry, or NULL
1333 * 1349 *
1334 * Searches the children of the parent dentry for the name in question. If 1350 * d_lookup searches the children of the parent dentry for the name in
1335 * the dentry is found its reference count is incremented and the dentry 1351 * question. If the dentry is found its reference count is incremented and the
1336 * is returned. The caller must use dput to free the entry when it has 1352 * dentry is returned. The caller must use dput to free the entry when it has
1337 * finished using it. %NULL is returned on failure. 1353 * finished using it. %NULL is returned if the dentry does not exist.
1338 *
1339 * __d_lookup is dcache_lock free. The hash list is protected using RCU.
1340 * Memory barriers are used while updating and doing lockless traversal.
1341 * To avoid races with d_move while rename is happening, d_lock is used.
1342 *
1343 * Overflows in memcmp(), while d_move, are avoided by keeping the length
1344 * and name pointer in one structure pointed by d_qstr.
1345 *
1346 * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
1347 * lookup is going on.
1348 *
1349 * The dentry unused LRU is not updated even if lookup finds the required dentry
1350 * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
1351 * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
1352 * acquisition.
1353 *
1354 * d_lookup() is protected against the concurrent renames in some unrelated
1355 * directory using the seqlockt_t rename_lock.
1356 */ 1354 */
1357
1358struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1355struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1359{ 1356{
1360 struct dentry * dentry = NULL; 1357 struct dentry * dentry = NULL;
@@ -1370,6 +1367,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1370} 1367}
1371EXPORT_SYMBOL(d_lookup); 1368EXPORT_SYMBOL(d_lookup);
1372 1369
1370/*
1371 * __d_lookup - search for a dentry (racy)
1372 * @parent: parent dentry
1373 * @name: qstr of name we wish to find
1374 * Returns: dentry, or NULL
1375 *
1376 * __d_lookup is like d_lookup, however it may (rarely) return a
1377 * false-negative result due to unrelated rename activity.
1378 *
1379 * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
1380 * however it must be used carefully, eg. with a following d_lookup in
1381 * the case of failure.
1382 *
1383 * __d_lookup callers must be commented.
1384 */
1373struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1385struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1374{ 1386{
1375 unsigned int len = name->len; 1387 unsigned int len = name->len;
@@ -1380,6 +1392,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1380 struct hlist_node *node; 1392 struct hlist_node *node;
1381 struct dentry *dentry; 1393 struct dentry *dentry;
1382 1394
1395 /*
1396 * The hash list is protected using RCU.
1397 *
1398 * Take d_lock when comparing a candidate dentry, to avoid races
1399 * with d_move().
1400 *
1401 * It is possible that concurrent renames can mess up our list
1402 * walk here and result in missing our dentry, resulting in the
1403 * false-negative result. d_lookup() protects against concurrent
1404 * renames using rename_lock seqlock.
1405 *
1406 * See Documentation/vfs/dcache-locking.txt for more details.
1407 */
1383 rcu_read_lock(); 1408 rcu_read_lock();
1384 1409
1385 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1410 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
@@ -1394,8 +1419,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1394 1419
1395 /* 1420 /*
1396 * Recheck the dentry after taking the lock - d_move may have 1421 * Recheck the dentry after taking the lock - d_move may have
1397 * changed things. Don't bother checking the hash because we're 1422 * changed things. Don't bother checking the hash because
1398 * about to compare the whole name anyway. 1423 * we're about to compare the whole name anyway.
1399 */ 1424 */
1400 if (dentry->d_parent != parent) 1425 if (dentry->d_parent != parent)
1401 goto next; 1426 goto next;
@@ -1466,33 +1491,26 @@ out:
1466 * This is used by ncpfs in its readdir implementation. 1491 * This is used by ncpfs in its readdir implementation.
1467 * Zero is returned if the dentry is invalid. 1492 * Zero is returned if the dentry is invalid.
1468 */ 1493 */
1469 1494int d_validate(struct dentry *dentry, struct dentry *parent)
1470int d_validate(struct dentry *dentry, struct dentry *dparent)
1471{ 1495{
1472 struct hlist_head *base; 1496 struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
1473 struct hlist_node *lhp; 1497 struct hlist_node *node;
1498 struct dentry *d;
1474 1499
1475 /* Check whether the ptr might be valid at all.. */ 1500 /* Check whether the ptr might be valid at all.. */
1476 if (!kmem_ptr_validate(dentry_cache, dentry)) 1501 if (!kmem_ptr_validate(dentry_cache, dentry))
1477 goto out; 1502 return 0;
1478 1503 if (dentry->d_parent != parent)
1479 if (dentry->d_parent != dparent) 1504 return 0;
1480 goto out;
1481 1505
1482 spin_lock(&dcache_lock); 1506 rcu_read_lock();
1483 base = d_hash(dparent, dentry->d_name.hash); 1507 hlist_for_each_entry_rcu(d, node, head, d_hash) {
1484 hlist_for_each(lhp,base) { 1508 if (d == dentry) {
1485 /* hlist_for_each_entry_rcu() not required for d_hash list 1509 dget(dentry);
1486 * as it is parsed under dcache_lock
1487 */
1488 if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
1489 __dget_locked(dentry);
1490 spin_unlock(&dcache_lock);
1491 return 1; 1510 return 1;
1492 } 1511 }
1493 } 1512 }
1494 spin_unlock(&dcache_lock); 1513 rcu_read_unlock();
1495out:
1496 return 0; 1514 return 0;
1497} 1515}
1498EXPORT_SYMBOL(d_validate); 1516EXPORT_SYMBOL(d_validate);
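
d_validate() now checks hash membership under rcu_read_lock() instead of dcache_lock. Note that the success path above returns while the RCU read section is still open; a conventionally balanced version of the same walk, sketched with the variables used in d_validate(), drops the lock on every exit:

    struct dentry *d;
    struct hlist_node *node;

    rcu_read_lock();
    hlist_for_each_entry_rcu(d, node, head, d_hash) {
            if (d == dentry) {
                    dget(dentry);           /* pin it while still protected */
                    rcu_read_unlock();
                    return 1;
            }
    }
    rcu_read_unlock();
    return 0;
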
@@ -1903,48 +1921,30 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1903} 1921}
1904 1922
1905/** 1923/**
1906 * __d_path - return the path of a dentry 1924 * Prepend path string to a buffer
1925 *
1907 * @path: the dentry/vfsmount to report 1926 * @path: the dentry/vfsmount to report
1908 * @root: root vfsmnt/dentry (may be modified by this function) 1927 * @root: root vfsmnt/dentry (may be modified by this function)
1909 * @buffer: buffer to return value in 1928 * @buffer: pointer to the end of the buffer
1910 * @buflen: buffer length 1929 * @buflen: pointer to buffer length
1911 * 1930 *
1912 * Convert a dentry into an ASCII path name. If the entry has been deleted 1931 * Caller holds the dcache_lock.
1913 * the string " (deleted)" is appended. Note that this is ambiguous.
1914 *
1915 * Returns a pointer into the buffer or an error code if the
1916 * path was too long.
1917 *
1918 * "buflen" should be positive. Caller holds the dcache_lock.
1919 * 1932 *
1920 * If path is not reachable from the supplied root, then the value of 1933 * If path is not reachable from the supplied root, then the value of
1921 * root is changed (without modifying refcounts). 1934 * root is changed (without modifying refcounts).
1922 */ 1935 */
1923char *__d_path(const struct path *path, struct path *root, 1936static int prepend_path(const struct path *path, struct path *root,
1924 char *buffer, int buflen) 1937 char **buffer, int *buflen)
1925{ 1938{
1926 struct dentry *dentry = path->dentry; 1939 struct dentry *dentry = path->dentry;
1927 struct vfsmount *vfsmnt = path->mnt; 1940 struct vfsmount *vfsmnt = path->mnt;
1928 char *end = buffer + buflen; 1941 bool slash = false;
1929 char *retval; 1942 int error = 0;
1930
1931 spin_lock(&vfsmount_lock);
1932 prepend(&end, &buflen, "\0", 1);
1933 if (d_unlinked(dentry) &&
1934 (prepend(&end, &buflen, " (deleted)", 10) != 0))
1935 goto Elong;
1936
1937 if (buflen < 1)
1938 goto Elong;
1939 /* Get '/' right */
1940 retval = end-1;
1941 *retval = '/';
1942 1943
1943 for (;;) { 1944 br_read_lock(vfsmount_lock);
1945 while (dentry != root->dentry || vfsmnt != root->mnt) {
1944 struct dentry * parent; 1946 struct dentry * parent;
1945 1947
1946 if (dentry == root->dentry && vfsmnt == root->mnt)
1947 break;
1948 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { 1948 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
1949 /* Global root? */ 1949 /* Global root? */
1950 if (vfsmnt->mnt_parent == vfsmnt) { 1950 if (vfsmnt->mnt_parent == vfsmnt) {
@@ -1956,28 +1956,90 @@ char *__d_path(const struct path *path, struct path *root,
1956 } 1956 }
1957 parent = dentry->d_parent; 1957 parent = dentry->d_parent;
1958 prefetch(parent); 1958 prefetch(parent);
1959 if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || 1959 error = prepend_name(buffer, buflen, &dentry->d_name);
1960 (prepend(&end, &buflen, "/", 1) != 0)) 1960 if (!error)
1961 goto Elong; 1961 error = prepend(buffer, buflen, "/", 1);
1962 retval = end; 1962 if (error)
1963 break;
1964
1965 slash = true;
1963 dentry = parent; 1966 dentry = parent;
1964 } 1967 }
1965 1968
1966out: 1969out:
1967 spin_unlock(&vfsmount_lock); 1970 if (!error && !slash)
1968 return retval; 1971 error = prepend(buffer, buflen, "/", 1);
1972
1973 br_read_unlock(vfsmount_lock);
1974 return error;
1969 1975
1970global_root: 1976global_root:
1971 retval += 1; /* hit the slash */ 1977 /*
1972 if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) 1978 * Filesystems needing to implement special "root names"
1973 goto Elong; 1979 * should do so with ->d_dname()
1980 */
1981 if (IS_ROOT(dentry) &&
1982 (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
1983 WARN(1, "Root dentry has weird name <%.*s>\n",
1984 (int) dentry->d_name.len, dentry->d_name.name);
1985 }
1974 root->mnt = vfsmnt; 1986 root->mnt = vfsmnt;
1975 root->dentry = dentry; 1987 root->dentry = dentry;
1976 goto out; 1988 goto out;
1989}
1977 1990
1978Elong: 1991/**
1979 retval = ERR_PTR(-ENAMETOOLONG); 1992 * __d_path - return the path of a dentry
1980 goto out; 1993 * @path: the dentry/vfsmount to report
1994 * @root: root vfsmnt/dentry (may be modified by this function)
1995 * @buf: buffer to return value in
1996 * @buflen: buffer length
1997 *
1998 * Convert a dentry into an ASCII path name.
1999 *
2000 * Returns a pointer into the buffer or an error code if the
2001 * path was too long.
2002 *
2003 * "buflen" should be positive.
2004 *
2005 * If path is not reachable from the supplied root, then the value of
2006 * root is changed (without modifying refcounts).
2007 */
2008char *__d_path(const struct path *path, struct path *root,
2009 char *buf, int buflen)
2010{
2011 char *res = buf + buflen;
2012 int error;
2013
2014 prepend(&res, &buflen, "\0", 1);
2015 spin_lock(&dcache_lock);
2016 error = prepend_path(path, root, &res, &buflen);
2017 spin_unlock(&dcache_lock);
2018
2019 if (error)
2020 return ERR_PTR(error);
2021 return res;
2022}
2023
2024/*
2025 * same as __d_path but appends "(deleted)" for unlinked files.
2026 */
2027static int path_with_deleted(const struct path *path, struct path *root,
2028 char **buf, int *buflen)
2029{
2030 prepend(buf, buflen, "\0", 1);
2031 if (d_unlinked(path->dentry)) {
2032 int error = prepend(buf, buflen, " (deleted)", 10);
2033 if (error)
2034 return error;
2035 }
2036
2037 return prepend_path(path, root, buf, buflen);
2038}
2039
2040static int prepend_unreachable(char **buffer, int *buflen)
2041{
2042 return prepend(buffer, buflen, "(unreachable)", 13);
1981} 2043}
1982 2044
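
All of the rewritten path builders share one convention: the buffer is filled right to left, starting at buf + buflen with the terminating NUL. The prepend() helper they rely on looks roughly like this, with a usage fragment below:

    static int prepend(char **buffer, int *buflen, const char *str, int namelen)
    {
            *buflen -= namelen;
            if (*buflen < 0)
                    return -ENAMETOOLONG;
            *buffer -= namelen;
            memcpy(*buffer, str, namelen);
            return 0;
    }

    /* usage: start one past the end and work backwards */
    char *res = buf + buflen;

    prepend(&res, &buflen, "\0", 1);        /* terminator first */
    prepend(&res, &buflen, "name", 4);
    prepend(&res, &buflen, "/", 1);         /* res now points at "/name" */
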
1983/** 2045/**
@@ -1998,9 +2060,10 @@ Elong:
1998 */ 2060 */
1999char *d_path(const struct path *path, char *buf, int buflen) 2061char *d_path(const struct path *path, char *buf, int buflen)
2000{ 2062{
2001 char *res; 2063 char *res = buf + buflen;
2002 struct path root; 2064 struct path root;
2003 struct path tmp; 2065 struct path tmp;
2066 int error;
2004 2067
2005 /* 2068 /*
2006 * We have various synthetic filesystems that never get mounted. On 2069 * We have various synthetic filesystems that never get mounted. On
@@ -2012,19 +2075,51 @@ char *d_path(const struct path *path, char *buf, int buflen)
2012 if (path->dentry->d_op && path->dentry->d_op->d_dname) 2075 if (path->dentry->d_op && path->dentry->d_op->d_dname)
2013 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2076 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2014 2077
2015 read_lock(&current->fs->lock); 2078 get_fs_root(current->fs, &root);
2016 root = current->fs->root;
2017 path_get(&root);
2018 read_unlock(&current->fs->lock);
2019 spin_lock(&dcache_lock); 2079 spin_lock(&dcache_lock);
2020 tmp = root; 2080 tmp = root;
2021 res = __d_path(path, &tmp, buf, buflen); 2081 error = path_with_deleted(path, &tmp, &res, &buflen);
2082 if (error)
2083 res = ERR_PTR(error);
2022 spin_unlock(&dcache_lock); 2084 spin_unlock(&dcache_lock);
2023 path_put(&root); 2085 path_put(&root);
2024 return res; 2086 return res;
2025} 2087}
2026EXPORT_SYMBOL(d_path); 2088EXPORT_SYMBOL(d_path);
2027 2089
2090/**
2091 * d_path_with_unreachable - return the path of a dentry
2092 * @path: path to report
2093 * @buf: buffer to return value in
2094 * @buflen: buffer length
2095 *
2096 * The difference from d_path() is that this prepends "(unreachable)"
2097 * to paths which are unreachable from the current process' root.
2098 */
2099char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2100{
2101 char *res = buf + buflen;
2102 struct path root;
2103 struct path tmp;
2104 int error;
2105
2106 if (path->dentry->d_op && path->dentry->d_op->d_dname)
2107 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2108
2109 get_fs_root(current->fs, &root);
2110 spin_lock(&dcache_lock);
2111 tmp = root;
2112 error = path_with_deleted(path, &tmp, &res, &buflen);
2113 if (!error && !path_equal(&tmp, &root))
2114 error = prepend_unreachable(&res, &buflen);
2115 spin_unlock(&dcache_lock);
2116 path_put(&root);
2117 if (error)
2118 res = ERR_PTR(error);
2119
2120 return res;
2121}
2122
2028/* 2123/*
2029 * Helper function for dentry_operations.d_dname() members 2124 * Helper function for dentry_operations.d_dname() members
2030 */ 2125 */
@@ -2049,16 +2144,12 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
2049/* 2144/*
2050 * Write full pathname from the root of the filesystem into the buffer. 2145 * Write full pathname from the root of the filesystem into the buffer.
2051 */ 2146 */
2052char *dentry_path(struct dentry *dentry, char *buf, int buflen) 2147char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2053{ 2148{
2054 char *end = buf + buflen; 2149 char *end = buf + buflen;
2055 char *retval; 2150 char *retval;
2056 2151
2057 spin_lock(&dcache_lock);
2058 prepend(&end, &buflen, "\0", 1); 2152 prepend(&end, &buflen, "\0", 1);
2059 if (d_unlinked(dentry) &&
2060 (prepend(&end, &buflen, "//deleted", 9) != 0))
2061 goto Elong;
2062 if (buflen < 1) 2153 if (buflen < 1)
2063 goto Elong; 2154 goto Elong;
2064 /* Get '/' right */ 2155 /* Get '/' right */
@@ -2076,7 +2167,28 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2076 retval = end; 2167 retval = end;
2077 dentry = parent; 2168 dentry = parent;
2078 } 2169 }
2170 return retval;
2171Elong:
2172 return ERR_PTR(-ENAMETOOLONG);
2173}
2174EXPORT_SYMBOL(__dentry_path);
2175
2176char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2177{
2178 char *p = NULL;
2179 char *retval;
2180
2181 spin_lock(&dcache_lock);
2182 if (d_unlinked(dentry)) {
2183 p = buf + buflen;
2184 if (prepend(&p, &buflen, "//deleted", 10) != 0)
2185 goto Elong;
2186 buflen++;
2187 }
2188 retval = __dentry_path(dentry, buf, buflen);
2079 spin_unlock(&dcache_lock); 2189 spin_unlock(&dcache_lock);
2190 if (!IS_ERR(retval) && p)
2191 *p = '/'; /* restore '/' overridden with '\0' */
2080 return retval; 2192 return retval;
2081Elong: 2193Elong:
2082 spin_unlock(&dcache_lock); 2194 spin_unlock(&dcache_lock);
@@ -2110,27 +2222,30 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2110 if (!page) 2222 if (!page)
2111 return -ENOMEM; 2223 return -ENOMEM;
2112 2224
2113 read_lock(&current->fs->lock); 2225 get_fs_root_and_pwd(current->fs, &root, &pwd);
2114 pwd = current->fs->pwd;
2115 path_get(&pwd);
2116 root = current->fs->root;
2117 path_get(&root);
2118 read_unlock(&current->fs->lock);
2119 2226
2120 error = -ENOENT; 2227 error = -ENOENT;
2121 spin_lock(&dcache_lock); 2228 spin_lock(&dcache_lock);
2122 if (!d_unlinked(pwd.dentry)) { 2229 if (!d_unlinked(pwd.dentry)) {
2123 unsigned long len; 2230 unsigned long len;
2124 struct path tmp = root; 2231 struct path tmp = root;
2125 char * cwd; 2232 char *cwd = page + PAGE_SIZE;
2233 int buflen = PAGE_SIZE;
2126 2234
2127 cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE); 2235 prepend(&cwd, &buflen, "\0", 1);
2236 error = prepend_path(&pwd, &tmp, &cwd, &buflen);
2128 spin_unlock(&dcache_lock); 2237 spin_unlock(&dcache_lock);
2129 2238
2130 error = PTR_ERR(cwd); 2239 if (error)
2131 if (IS_ERR(cwd))
2132 goto out; 2240 goto out;
2133 2241
2242 /* Unreachable from current root */
2243 if (!path_equal(&tmp, &root)) {
2244 error = prepend_unreachable(&cwd, &buflen);
2245 if (error)
2246 goto out;
2247 }
2248
2134 error = -ERANGE; 2249 error = -ERANGE;
2135 len = PAGE_SIZE + page - cwd; 2250 len = PAGE_SIZE + page - cwd;
2136 if (len <= size) { 2251 if (len <= size) {
@@ -2195,11 +2310,12 @@ int path_is_under(struct path *path1, struct path *path2)
2195 struct vfsmount *mnt = path1->mnt; 2310 struct vfsmount *mnt = path1->mnt;
2196 struct dentry *dentry = path1->dentry; 2311 struct dentry *dentry = path1->dentry;
2197 int res; 2312 int res;
2198 spin_lock(&vfsmount_lock); 2313
2314 br_read_lock(vfsmount_lock);
2199 if (mnt != path2->mnt) { 2315 if (mnt != path2->mnt) {
2200 for (;;) { 2316 for (;;) {
2201 if (mnt->mnt_parent == mnt) { 2317 if (mnt->mnt_parent == mnt) {
2202 spin_unlock(&vfsmount_lock); 2318 br_read_unlock(vfsmount_lock);
2203 return 0; 2319 return 0;
2204 } 2320 }
2205 if (mnt->mnt_parent == path2->mnt) 2321 if (mnt->mnt_parent == path2->mnt)
@@ -2209,7 +2325,7 @@ int path_is_under(struct path *path1, struct path *path2)
2209 dentry = mnt->mnt_mountpoint; 2325 dentry = mnt->mnt_mountpoint;
2210 } 2326 }
2211 res = is_subdir(dentry, path2->dentry); 2327 res = is_subdir(dentry, path2->dentry);
2212 spin_unlock(&vfsmount_lock); 2328 br_read_unlock(vfsmount_lock);
2213 return res; 2329 return res;
2214} 2330}
2215EXPORT_SYMBOL(path_is_under); 2331EXPORT_SYMBOL(path_is_under);
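
vfsmount_lock is now a per-cpu "big reader" lock rather than a spinlock: br_read_lock() takes only the local cpu's lock, while writers must take every cpu's. Sketch with a hypothetical lock:

    DEFINE_BRLOCK(example_lock);

    static void reader(void)
    {
            br_read_lock(example_lock);     /* cheap: this cpu's lock only */
            /* ...e.g. walking the mnt->mnt_parent chain... */
            br_read_unlock(example_lock);
    }

    static void writer(void)
    {
            br_write_lock(example_lock);    /* expensive: every cpu's lock */
            /* ...mount topology changes... */
            br_write_unlock(example_lock);
    }
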
@@ -2311,6 +2427,9 @@ static void __init dcache_init(void)
2311{ 2427{
2312 int loop; 2428 int loop;
2313 2429
2430 percpu_counter_init(&nr_dentry, 0);
2431 percpu_counter_init(&nr_dentry_unused, 0);
2432
2314 /* 2433 /*
2315 * A constructor could be added for stable state like the lists, 2434 * A constructor could be added for stable state like the lists,
2316 * but it is probably not worth it because of the cache nature 2435 * but it is probably not worth it because of the cache nature
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 0210898458b..89d394d8fe2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -43,6 +43,7 @@ const struct file_operations debugfs_file_operations = {
43 .read = default_read_file, 43 .read = default_read_file,
44 .write = default_write_file, 44 .write = default_write_file,
45 .open = default_open, 45 .open = default_open,
46 .llseek = noop_llseek,
46}; 47};
47 48
48static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) 49static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -454,6 +455,7 @@ static const struct file_operations fops_bool = {
454 .read = read_file_bool, 455 .read = read_file_bool,
455 .write = write_file_bool, 456 .write = write_file_bool,
456 .open = default_open, 457 .open = default_open,
458 .llseek = default_llseek,
457}; 459};
458 460
459/** 461/**
@@ -498,6 +500,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
498static const struct file_operations fops_blob = { 500static const struct file_operations fops_blob = {
499 .read = read_file_blob, 501 .read = read_file_blob,
500 .open = default_open, 502 .open = default_open,
503 .llseek = default_llseek,
501}; 504};
502 505
503/** 506/**
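
These .llseek additions come from the push to make every file_operations choose its seek behaviour explicitly rather than inheriting a default. The stock choices, sketched in a hypothetical fops (example_read is assumed):

    static const struct file_operations example_fops = {
            .owner  = THIS_MODULE,
            .read   = example_read,
            /*
             * .llseek is now chosen explicitly:
             *   default_llseek - classic SEEK_SET/SEEK_CUR/SEEK_END on f_pos
             *   noop_llseek    - returns the current position, never fails
             *   no_llseek      - returns -ESPIPE (seeking is an error)
             */
            .llseek = noop_llseek,
    };
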
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbca..37a8ca7c122 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
40 struct inode *inode = new_inode(sb); 40 struct inode *inode = new_inode(sb);
41 41
42 if (inode) { 42 if (inode) {
43 inode->i_ino = get_next_ino();
43 inode->i_mode = mode; 44 inode->i_mode = mode;
44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
45 switch (mode & S_IFMT) { 46 switch (mode & S_IFMT) {
@@ -134,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
134 return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); 135 return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
135} 136}
136 137
137static int debug_get_sb(struct file_system_type *fs_type, 138static struct dentry *debug_mount(struct file_system_type *fs_type,
138 int flags, const char *dev_name, 139 int flags, const char *dev_name,
139 void *data, struct vfsmount *mnt) 140 void *data)
140{ 141{
141 return get_sb_single(fs_type, flags, data, debug_fill_super, mnt); 142 return mount_single(fs_type, flags, data, debug_fill_super);
142} 143}
143 144
144static struct file_system_type debug_fs_type = { 145static struct file_system_type debug_fs_type = {
145 .owner = THIS_MODULE, 146 .owner = THIS_MODULE,
146 .name = "debugfs", 147 .name = "debugfs",
147 .get_sb = debug_get_sb, 148 .mount = debug_mount,
148 .kill_sb = kill_litter_super, 149 .kill_sb = kill_litter_super,
149}; 150};
150 151
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8b3ffd5b523..1bb547c9cad 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
331} 331}
332 332
333/* 333/*
334 * devpts_get_sb() 334 * devpts_mount()
335 * 335 *
336 * If the '-o newinstance' mount option was specified, mount a new 336 * If the '-o newinstance' mount option was specified, mount a new
337 * (private) instance of devpts. PTYs created in this instance are 337 * (private) instance of devpts. PTYs created in this instance are
@@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
345 * semantics in devpts while preserving backward compatibility of the 345 * semantics in devpts while preserving backward compatibility of the
346 * current 'single-namespace' semantics. i.e all mounts of devpts 346 * current 'single-namespace' semantics. i.e all mounts of devpts
347 * without the 'newinstance' mount option should bind to the initial 347 * without the 'newinstance' mount option should bind to the initial
348 * kernel mount, like get_sb_single(). 348 * kernel mount, like mount_single().
349 * 349 *
350 * Mounts with 'newinstance' option create a new, private namespace. 350 * Mounts with 'newinstance' option create a new, private namespace.
351 * 351 *
352 * NOTE: 352 * NOTE:
353 * 353 *
354 * For single-mount semantics, devpts cannot use get_sb_single(), 354 * For single-mount semantics, devpts cannot use mount_single(),
355 * because get_sb_single()/sget() find and use the super-block from 355 * because mount_single()/sget() find and use the super-block from
356 * the most recent mount of devpts. But that recent mount may be a 356 * the most recent mount of devpts. But that recent mount may be a
357 * 'newinstance' mount and get_sb_single() would pick the newinstance 357 * 'newinstance' mount and mount_single() would pick the newinstance
358 * super-block instead of the initial super-block. 358 * super-block instead of the initial super-block.
359 */ 359 */
360static int devpts_get_sb(struct file_system_type *fs_type, 360static struct dentry *devpts_mount(struct file_system_type *fs_type,
361 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 361 int flags, const char *dev_name, void *data)
362{ 362{
363 int error; 363 int error;
364 struct pts_mount_opts opts; 364 struct pts_mount_opts opts;
@@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
366 366
367 error = parse_mount_options(data, PARSE_MOUNT, &opts); 367 error = parse_mount_options(data, PARSE_MOUNT, &opts);
368 if (error) 368 if (error)
369 return error; 369 return ERR_PTR(error);
370 370
371 if (opts.newinstance) 371 if (opts.newinstance)
372 s = sget(fs_type, NULL, set_anon_super, NULL); 372 s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
374 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); 374 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
375 375
376 if (IS_ERR(s)) 376 if (IS_ERR(s))
377 return PTR_ERR(s); 377 return ERR_CAST(s);
378 378
379 if (!s->s_root) { 379 if (!s->s_root) {
380 s->s_flags = flags; 380 s->s_flags = flags;
@@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type,
390 if (error) 390 if (error)
391 goto out_undo_sget; 391 goto out_undo_sget;
392 392
393 simple_set_mnt(mnt, s); 393 return dget(s->s_root);
394
395 return 0;
396 394
397out_undo_sget: 395out_undo_sget:
398 deactivate_locked_super(s); 396 deactivate_locked_super(s);
399 return error; 397 return ERR_PTR(error);
400} 398}
401 399
402#else 400#else
@@ -404,10 +402,10 @@ out_undo_sget:
404 * This supports only the legacy single-instance semantics (no 402 * This supports only the legacy single-instance semantics (no
405 * multiple-instance semantics) 403 * multiple-instance semantics)
406 */ 404 */
407static int devpts_get_sb(struct file_system_type *fs_type, int flags, 405static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
408 const char *dev_name, void *data, struct vfsmount *mnt) 406 const char *dev_name, void *data)
409{ 407{
410 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); 408 return mount_single(fs_type, flags, data, devpts_fill_super);
411} 409}
412#endif 410#endif
413 411
@@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb)
421 419
422static struct file_system_type devpts_fs_type = { 420static struct file_system_type devpts_fs_type = {
423 .name = "devpts", 421 .name = "devpts",
424 .get_sb = devpts_get_sb, 422 .mount = devpts_mount,
425 .kill_sb = devpts_kill_sb, 423 .kill_sb = devpts_kill_sb,
426}; 424};
427 425
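
devpts_mount() is essentially mount_single() spelled out by hand, which is what lets it choose between a shared and a private superblock. The skeleton of that open-coded pattern; the singlefs names are hypothetical:

    static struct dentry *singlefs_mount(struct file_system_type *fs_type,
            int flags, const char *dev_name, void *data)
    {
            struct super_block *s;
            int error;

            /* find an existing superblock, or get a new locked one */
            s = sget(fs_type, NULL, set_anon_super, NULL);
            if (IS_ERR(s))
                    return ERR_CAST(s);

            if (!s->s_root) {
                    s->s_flags = flags;
                    error = singlefs_fill_super(s, data,
                                                flags & MS_SILENT ? 1 : 0);
                    if (error) {
                            deactivate_locked_super(s);
                            return ERR_PTR(error);
                    }
                    s->s_flags |= MS_ACTIVE;
            }
            /* .mount hands back a referenced root dentry */
            return dget(s->s_root);
    }
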
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7600aacf531..85882f6ba5f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
218 * filesystems can use it to hold additional state between get_block calls and 218 * filesystems can use it to hold additional state between get_block calls and
219 * dio_complete. 219 * dio_complete.
220 */ 220 */
221static int dio_complete(struct dio *dio, loff_t offset, int ret) 221static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
222{ 222{
223 ssize_t transferred = 0; 223 ssize_t transferred = 0;
224 224
@@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
239 transferred = dio->i_size - offset; 239 transferred = dio->i_size - offset;
240 } 240 }
241 241
242 if (dio->end_io && dio->result)
243 dio->end_io(dio->iocb, offset, transferred,
244 dio->map_bh.b_private);
245
246 if (dio->flags & DIO_LOCKING)
247 /* lockdep: non-owner release */
248 up_read_non_owner(&dio->inode->i_alloc_sem);
249
250 if (ret == 0) 242 if (ret == 0)
251 ret = dio->page_errors; 243 ret = dio->page_errors;
252 if (ret == 0) 244 if (ret == 0)
@@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
254 if (ret == 0) 246 if (ret == 0)
255 ret = transferred; 247 ret = transferred;
256 248
249 if (dio->end_io && dio->result) {
250 dio->end_io(dio->iocb, offset, transferred,
251 dio->map_bh.b_private, ret, is_async);
252 } else if (is_async) {
253 aio_complete(dio->iocb, ret, 0);
254 }
255
256 if (dio->flags & DIO_LOCKING)
257 /* lockdep: non-owner release */
258 up_read_non_owner(&dio->inode->i_alloc_sem);
259
257 return ret; 260 return ret;
258} 261}
259 262
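
The reordering above also changes the ->end_io contract: the hook now receives the return code plus an is_async flag, and completing the aio becomes the filesystem's job whenever a hook is present. A sketch of an end_io implementation under the new signature; examplefs_end_io is hypothetical:

    static void examplefs_end_io(struct kiocb *iocb, loff_t offset,
                    ssize_t bytes, void *private, int ret, bool is_async)
    {
            /* ...convert unwritten extents, update i_size, etc... */

            /*
             * The generic code no longer calls aio_complete() when an
             * end_io hook is installed; the filesystem does it itself
             * for asynchronous completions.
             */
            if (is_async)
                    aio_complete(iocb, ret, 0);
    }
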
@@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
277 spin_unlock_irqrestore(&dio->bio_lock, flags); 280 spin_unlock_irqrestore(&dio->bio_lock, flags);
278 281
279 if (remaining == 0) { 282 if (remaining == 0) {
280 int ret = dio_complete(dio, dio->iocb->ki_pos, 0); 283 dio_complete(dio, dio->iocb->ki_pos, 0, true);
281 aio_complete(dio->iocb, ret, 0);
282 kfree(dio); 284 kfree(dio);
283 } 285 }
284} 286}
@@ -632,7 +634,7 @@ static int dio_send_cur_page(struct dio *dio)
632 int ret = 0; 634 int ret = 0;
633 635
634 if (dio->bio) { 636 if (dio->bio) {
635 loff_t cur_offset = dio->block_in_file << dio->blkbits; 637 loff_t cur_offset = dio->cur_page_fs_offset;
636 loff_t bio_next_offset = dio->logical_offset_in_bio + 638 loff_t bio_next_offset = dio->logical_offset_in_bio +
637 dio->bio->bi_size; 639 dio->bio->bi_size;
638 640
@@ -657,7 +659,7 @@ static int dio_send_cur_page(struct dio *dio)
657 * Submit now if the underlying fs is about to perform a 659 * Submit now if the underlying fs is about to perform a
658 * metadata read 660 * metadata read
659 */ 661 */
660 if (dio->boundary) 662 else if (dio->boundary)
661 dio_bio_submit(dio); 663 dio_bio_submit(dio);
662 } 664 }
663 665
@@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1126 spin_unlock_irqrestore(&dio->bio_lock, flags); 1128 spin_unlock_irqrestore(&dio->bio_lock, flags);
1127 1129
1128 if (ret2 == 0) { 1130 if (ret2 == 0) {
1129 ret = dio_complete(dio, offset, ret); 1131 ret = dio_complete(dio, offset, ret, false);
1130 kfree(dio); 1132 kfree(dio);
1131 } else 1133 } else
1132 BUG_ON(ret != -EIOCBQUEUED); 1134 BUG_ON(ret != -EIOCBQUEUED);
@@ -1134,8 +1136,27 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1134 return ret; 1136 return ret;
1135} 1137}
1136 1138
1139/*
1140 * This is a library function for use by filesystem drivers.
1141 *
1142 * The locking rules are governed by the flags parameter:
1143 * - if the flags value contains DIO_LOCKING we use a fancy locking
1144 * scheme for dumb filesystems.
1145 * For writes this function is called under i_mutex and returns with
1146 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1147 * taken and dropped again before returning.
1148 * For reads and writes i_alloc_sem is taken in shared mode and released
1149 * on I/O completion (which may happen asynchronously after returning to
1150 * the caller).
1151 *
1152 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1153 * internal locking but rather rely on the filesystem to synchronize
1154 * direct I/O reads/writes versus each other and truncate.
1155 * For reads and writes both i_mutex and i_alloc_sem are not held on
1156 * entry and are never taken.
1157 */
1137ssize_t 1158ssize_t
1138__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, 1159__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1139 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1160 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1140 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1161 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1141 dio_submit_t submit_io, int flags) 1162 dio_submit_t submit_io, int flags)
@@ -1231,57 +1252,4 @@ __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
1231out: 1252out:
1232 return retval; 1253 return retval;
1233} 1254}
1234EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
1235
1236/*
1237 * This is a library function for use by filesystem drivers.
1238 *
1239 * The locking rules are governed by the flags parameter:
1240 * - if the flags value contains DIO_LOCKING we use a fancy locking
1241 * scheme for dumb filesystems.
1242 * For writes this function is called under i_mutex and returns with
1243 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1244 * taken and dropped again before returning.
1245 * For reads and writes i_alloc_sem is taken in shared mode and released
1246 * on I/O completion (which may happen asynchronously after returning to
1247 * the caller).
1248 *
1249 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1250 * internal locking but rather rely on the filesystem to synchronize
1251 * direct I/O reads/writes versus each other and truncate.
1252 * For reads and writes both i_mutex and i_alloc_sem are not held on
1253 * entry and are never taken.
1254 */
1255ssize_t
1256__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1257 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1258 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1259 dio_submit_t submit_io, int flags)
1260{
1261 ssize_t retval;
1262
1263 retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
1264 offset, nr_segs, get_block, end_io, submit_io, flags);
1265 /*
1266 * In case of error extending write may have instantiated a few
1267 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1268 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
1269 * their own manner. This is a further example of where the old
1270 * truncate sequence is inadequate.
1271 *
1272 * NOTE: filesystems with their own locking have to handle this
1273 * on their own.
1274 */
1275 if (flags & DIO_LOCKING) {
1276 if (unlikely((rw & WRITE) && retval < 0)) {
1277 loff_t isize = i_size_read(inode);
1278 loff_t end = offset + iov_length(iov, nr_segs);
1279
1280 if (end > isize)
1281 vmtruncate(inode, isize);
1282 }
1283 }
1284
1285 return retval;
1286}
1287EXPORT_SYMBOL(__blockdev_direct_IO); 1255EXPORT_SYMBOL(__blockdev_direct_IO);
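
With the _newtrunc variant folded away, a filesystem again has a single entry point to call from its .direct_IO method. A sketch of a typical DIO_LOCKING caller; the examplefs names, including examplefs_get_block, are hypothetical:

    static ssize_t examplefs_direct_IO(int rw, struct kiocb *iocb,
                    const struct iovec *iov, loff_t offset,
                    unsigned long nr_segs)
    {
            struct inode *inode = iocb->ki_filp->f_mapping->host;

            /* DIO_LOCKING: generic code handles i_mutex/i_alloc_sem */
            return __blockdev_direct_IO(rw, iocb, inode,
                                        inode->i_sb->s_bdev, iov, offset,
                                        nr_segs, examplefs_get_block,
                                        NULL, NULL, DIO_LOCKING);
    }
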
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index c6cf2515874..6b42ba807df 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -643,7 +643,8 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
643static const struct file_operations waiters_fops = { 643static const struct file_operations waiters_fops = {
644 .owner = THIS_MODULE, 644 .owner = THIS_MODULE,
645 .open = waiters_open, 645 .open = waiters_open,
646 .read = waiters_read 646 .read = waiters_read,
647 .llseek = default_llseek,
647}; 648};
648 649
649void dlm_delete_debug_file(struct dlm_ls *ls) 650void dlm_delete_debug_file(struct dlm_ls *ls)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 031dbe3a15c..64e5f3efdd8 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1846,6 +1846,9 @@ static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1846 struct dlm_lkb *gr; 1846 struct dlm_lkb *gr;
1847 1847
1848 list_for_each_entry(gr, head, lkb_statequeue) { 1848 list_for_each_entry(gr, head, lkb_statequeue) {
1849 /* skip self when sending basts to convertqueue */
1850 if (gr == lkb)
1851 continue;
1849 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) { 1852 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
1850 queue_bast(r, gr, lkb->lkb_rqmode); 1853 queue_bast(r, gr, lkb->lkb_rqmode);
1851 gr->lkb_highbast = lkb->lkb_rqmode; 1854 gr->lkb_highbast = lkb->lkb_rqmode;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index c0d35c62052..37a34c2c622 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -248,7 +248,7 @@ static struct connection *assoc2con(int assoc_id)
248 248
249 for (i = 0 ; i < CONN_HASH_SIZE; i++) { 249 for (i = 0 ; i < CONN_HASH_SIZE; i++) {
250 hlist_for_each_entry(con, h, &connection_hash[i], list) { 250 hlist_for_each_entry(con, h, &connection_hash[i], list) {
251 if (con && con->sctp_assoc == assoc_id) { 251 if (con->sctp_assoc == assoc_id) {
252 mutex_unlock(&connections_lock); 252 mutex_unlock(&connections_lock);
253 return con; 253 return con;
254 } 254 }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 2c6ad518100..ef17e0169da 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -81,24 +81,11 @@ static struct genl_ops dlm_nl_ops = {
81 81
82int __init dlm_netlink_init(void) 82int __init dlm_netlink_init(void)
83{ 83{
84 int rv; 84 return genl_register_family_with_ops(&family, &dlm_nl_ops, 1);
85
86 rv = genl_register_family(&family);
87 if (rv)
88 return rv;
89
90 rv = genl_register_ops(&family, &dlm_nl_ops);
91 if (rv < 0)
92 goto err;
93 return 0;
94 err:
95 genl_unregister_family(&family);
96 return rv;
97} 85}
98 86
99void dlm_netlink_exit(void) 87void dlm_netlink_exit(void)
100{ 88{
101 genl_unregister_ops(&family, &dlm_nl_ops);
102 genl_unregister_family(&family); 89 genl_unregister_family(&family);
103} 90}
104 91
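
genl_register_family_with_ops() bundles family registration, ops registration, and the error unwinding into one call, which is all dlm_netlink_init() needs. A sketch of the same pattern with a hypothetical family:

    static struct genl_family example_family = {
            .id       = GENL_ID_GENERATE,
            .name     = "example",
            .version  = 1,
            .maxattr  = 1,
    };

    static int example_doit(struct sk_buff *skb, struct genl_info *info)
    {
            return 0;       /* stub handler */
    }

    static struct genl_ops example_ops[] = {
            { .cmd = 1, .doit = example_doit, },
    };

    static int __init example_netlink_init(void)
    {
            /* registers the family and every op, unwinding on failure */
            return genl_register_family_with_ops(&example_family,
                            example_ops, ARRAY_SIZE(example_ops));
    }
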
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d45c02db694..30d8b85febb 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -412,7 +412,8 @@ static const struct file_operations dev_fops = {
412 .read = dev_read, 412 .read = dev_read,
413 .write = dev_write, 413 .write = dev_write,
414 .poll = dev_poll, 414 .poll = dev_poll,
415 .owner = THIS_MODULE 415 .owner = THIS_MODULE,
416 .llseek = noop_llseek,
416}; 417};
417 418
418static struct miscdevice plock_dev_misc = { 419static struct miscdevice plock_dev_misc = {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b6272853130..66d6c16bf44 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1009,6 +1009,7 @@ static const struct file_operations device_fops = {
1009 .write = device_write, 1009 .write = device_write,
1010 .poll = device_poll, 1010 .poll = device_poll,
1011 .owner = THIS_MODULE, 1011 .owner = THIS_MODULE,
1012 .llseek = noop_llseek,
1012}; 1013};
1013 1014
1014static const struct file_operations ctl_device_fops = { 1015static const struct file_operations ctl_device_fops = {
@@ -1017,6 +1018,7 @@ static const struct file_operations ctl_device_fops = {
1017 .read = device_read, 1018 .read = device_read,
1018 .write = device_write, 1019 .write = device_write,
1019 .owner = THIS_MODULE, 1020 .owner = THIS_MODULE,
1021 .llseek = noop_llseek,
1020}; 1022};
1021 1023
1022static struct miscdevice ctl_device = { 1024static struct miscdevice ctl_device = {
@@ -1029,6 +1031,7 @@ static const struct file_operations monitor_device_fops = {
1029 .open = monitor_device_open, 1031 .open = monitor_device_open,
1030 .release = monitor_device_close, 1032 .release = monitor_device_close,
1031 .owner = THIS_MODULE, 1033 .owner = THIS_MODULE,
1034 .llseek = noop_llseek,
1032}; 1035};
1033 1036
1034static struct miscdevice monitor_device = { 1037static struct miscdevice monitor_device = {
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 83c4f600786..2195c213ab2 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
18 18
19 spin_lock(&inode_lock); 19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
22 continue; 22 continue;
23 if (inode->i_mapping->nrpages == 0) 23 if (inode->i_mapping->nrpages == 0)
24 continue; 24 continue;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 1cc087635a5..cbadc1bee6e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -762,7 +762,7 @@ ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
762 762
763/** 763/**
764 * ecryptfs_init_crypt_ctx 764 * ecryptfs_init_crypt_ctx
765 * @crypt_stat: Uninitilized crypt stats structure 765 * @crypt_stat: Uninitialized crypt stats structure
766 * 766 *
767 * Initialize the crypto context. 767 * Initialize the crypto context.
768 * 768 *
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
1793static struct list_head key_tfm_list; 1793static struct list_head key_tfm_list;
1794struct mutex key_tfm_list_mutex; 1794struct mutex key_tfm_list_mutex;
1795 1795
1796int ecryptfs_init_crypto(void) 1796int __init ecryptfs_init_crypto(void)
1797{ 1797{
1798 mutex_init(&key_tfm_list_mutex); 1798 mutex_init(&key_tfm_list_mutex);
1799 INIT_LIST_HEAD(&key_tfm_list); 1799 INIT_LIST_HEAD(&key_tfm_list);
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2170 + encoded_name_no_prefix_size); 2170 + encoded_name_no_prefix_size);
2171 (*encoded_name)[(*encoded_name_size)] = '\0'; 2171 (*encoded_name)[(*encoded_name_size)] = '\0';
2172 (*encoded_name_size)++;
2173 } else { 2172 } else {
2174 rc = -EOPNOTSUPP; 2173 rc = -EOPNOTSUPP;
2175 } 2174 }
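Tagging ecryptfs_init_crypto() with __init (as the kthread, messaging, and miscdev initializers are later in this diff) lets the kernel reclaim the function's text once one-time initialization has run. A sketch of the idiom (all names hypothetical):

#include <linux/init.h>
#include <linux/list.h>

static struct list_head example_tfm_list;       /* hypothetical state */

/*
 * __init marks one-shot setup code whose text is discarded after the
 * init sections are freed; safe only because nothing calls this
 * function again afterwards.
 */
static int __init example_init_crypto(void)
{
        INIT_LIST_HEAD(&example_tfm_list);
        return 0;
}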
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 0032a9f5a3a..413a3c48f0b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -377,6 +377,7 @@ struct ecryptfs_mount_crypt_stat {
377#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 377#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010
378#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 378#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020
379#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 379#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040
380#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080
380 u32 flags; 381 u32 flags;
381 struct list_head global_auth_tok_list; 382 struct list_head global_auth_tok_list;
382 struct mutex global_auth_tok_list_mutex; 383 struct mutex global_auth_tok_list_mutex;
@@ -477,7 +478,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
477static inline struct ecryptfs_file_info * 478static inline struct ecryptfs_file_info *
478ecryptfs_file_to_private(struct file *file) 479ecryptfs_file_to_private(struct file *file)
479{ 480{
480 return (struct ecryptfs_file_info *)file->private_data; 481 return file->private_data;
481} 482}
482 483
483static inline void 484static inline void
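The ecryptfs_kernel.h hunk also drops an explicit cast on file->private_data: in C, void * converts implicitly to any object pointer type, so the cast was pure noise. A sketch of the accessor idiom (struct and names hypothetical):

#include <linux/fs.h>

struct example_file_info {              /* hypothetical per-file state */
        int flags;
};

static inline struct example_file_info *
example_file_to_private(struct file *file)
{
        return file->private_data;      /* void * converts implicitly */
}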
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e8fcf4e2ed7..91da02987bf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/compat.h> 32#include <linux/compat.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/smp_lock.h>
35#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
36 35
37/** 36/**
@@ -199,7 +198,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
199 "the persistent file for the dentry with name " 198 "the persistent file for the dentry with name "
200 "[%s]; rc = [%d]\n", __func__, 199 "[%s]; rc = [%d]\n", __func__,
201 ecryptfs_dentry->d_name.name, rc); 200 ecryptfs_dentry->d_name.name, rc);
202 goto out; 201 goto out_free;
203 } 202 }
204 } 203 }
205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) 204 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
@@ -207,7 +206,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
207 rc = -EPERM; 206 rc = -EPERM;
208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " 207 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
209 "file must hence be opened RO\n", __func__); 208 "file must hence be opened RO\n", __func__);
210 goto out; 209 goto out_free;
211 } 210 }
212 ecryptfs_set_file_lower( 211 ecryptfs_set_file_lower(
213 file, ecryptfs_inode_to_private(inode)->lower_file); 212 file, ecryptfs_inode_to_private(inode)->lower_file);
@@ -284,26 +283,53 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
284 int rc = 0; 283 int rc = 0;
285 struct file *lower_file = NULL; 284 struct file *lower_file = NULL;
286 285
287 lock_kernel();
288 lower_file = ecryptfs_file_to_lower(file); 286 lower_file = ecryptfs_file_to_lower(file);
289 if (lower_file->f_op && lower_file->f_op->fasync) 287 if (lower_file->f_op && lower_file->f_op->fasync)
290 rc = lower_file->f_op->fasync(fd, lower_file, flag); 288 rc = lower_file->f_op->fasync(fd, lower_file, flag);
291 unlock_kernel();
292 return rc; 289 return rc;
293} 290}
294 291
295static int ecryptfs_ioctl(struct inode *inode, struct file *file, 292static long
296 unsigned int cmd, unsigned long arg); 293ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
294{
295 struct file *lower_file = NULL;
296 long rc = -ENOTTY;
297
298 if (ecryptfs_file_to_private(file))
299 lower_file = ecryptfs_file_to_lower(file);
300 if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl)
301 rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
302 return rc;
303}
304
305#ifdef CONFIG_COMPAT
306static long
307ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
308{
309 struct file *lower_file = NULL;
310 long rc = -ENOIOCTLCMD;
311
312 if (ecryptfs_file_to_private(file))
313 lower_file = ecryptfs_file_to_lower(file);
314 if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl)
315 rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
316 return rc;
317}
318#endif
297 319
298const struct file_operations ecryptfs_dir_fops = { 320const struct file_operations ecryptfs_dir_fops = {
299 .readdir = ecryptfs_readdir, 321 .readdir = ecryptfs_readdir,
300 .ioctl = ecryptfs_ioctl, 322 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
323#ifdef CONFIG_COMPAT
324 .compat_ioctl = ecryptfs_compat_ioctl,
325#endif
301 .open = ecryptfs_open, 326 .open = ecryptfs_open,
302 .flush = ecryptfs_flush, 327 .flush = ecryptfs_flush,
303 .release = ecryptfs_release, 328 .release = ecryptfs_release,
304 .fsync = ecryptfs_fsync, 329 .fsync = ecryptfs_fsync,
305 .fasync = ecryptfs_fasync, 330 .fasync = ecryptfs_fasync,
306 .splice_read = generic_file_splice_read, 331 .splice_read = generic_file_splice_read,
332 .llseek = default_llseek,
307}; 333};
308 334
309const struct file_operations ecryptfs_main_fops = { 335const struct file_operations ecryptfs_main_fops = {
@@ -313,7 +339,10 @@ const struct file_operations ecryptfs_main_fops = {
313 .write = do_sync_write, 339 .write = do_sync_write,
314 .aio_write = generic_file_aio_write, 340 .aio_write = generic_file_aio_write,
315 .readdir = ecryptfs_readdir, 341 .readdir = ecryptfs_readdir,
316 .ioctl = ecryptfs_ioctl, 342 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
343#ifdef CONFIG_COMPAT
344 .compat_ioctl = ecryptfs_compat_ioctl,
345#endif
317 .mmap = generic_file_mmap, 346 .mmap = generic_file_mmap,
318 .open = ecryptfs_open, 347 .open = ecryptfs_open,
319 .flush = ecryptfs_flush, 348 .flush = ecryptfs_flush,
@@ -322,20 +351,3 @@ const struct file_operations ecryptfs_main_fops = {
322 .fasync = ecryptfs_fasync, 351 .fasync = ecryptfs_fasync,
323 .splice_read = generic_file_splice_read, 352 .splice_read = generic_file_splice_read,
324}; 353};
325
326static int
327ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
328 unsigned long arg)
329{
330 int rc = 0;
331 struct file *lower_file = NULL;
332
333 if (ecryptfs_file_to_private(file))
334 lower_file = ecryptfs_file_to_lower(file);
335 if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
336 rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
337 lower_file, cmd, arg);
338 else
339 rc = -ENOTTY;
340 return rc;
341}
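The file.c changes retire the old BKL-serialized ->ioctl hook: eCryptfs now forwards through ->unlocked_ioctl (and ->compat_ioctl under CONFIG_COMPAT), returning -ENOTTY or -ENOIOCTLCMD when the lower file has no handler, and fasync no longer takes the big kernel lock. A sketch of the stacking passthrough, modelled on the new ecryptfs_unlocked_ioctl (function name hypothetical):

#include <linux/fs.h>

/*
 * Sketch: forward an ioctl to the file one layer down. -ENOTTY is the
 * conventional "no such ioctl" error for the native entry point.
 */
static long example_passthrough_ioctl(struct file *lower,
                                      unsigned int cmd, unsigned long arg)
{
        if (lower && lower->f_op && lower->f_op->unlocked_ioctl)
                return lower->f_op->unlocked_ioctl(lower, cmd, arg);
        return -ENOTTY;
}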
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 31ef5252f0f..9d1a22d6276 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -32,6 +32,7 @@
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/xattr.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
@@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
70 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 71 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
71 struct dentry *dentry_save; 72 struct dentry *dentry_save;
72 struct vfsmount *vfsmount_save; 73 struct vfsmount *vfsmount_save;
74 unsigned int flags_save;
73 int rc; 75 int rc;
74 76
75 dentry_save = nd->path.dentry; 77 dentry_save = nd->path.dentry;
76 vfsmount_save = nd->path.mnt; 78 vfsmount_save = nd->path.mnt;
79 flags_save = nd->flags;
77 nd->path.dentry = lower_dentry; 80 nd->path.dentry = lower_dentry;
78 nd->path.mnt = lower_mnt; 81 nd->path.mnt = lower_mnt;
82 nd->flags &= ~LOOKUP_OPEN;
79 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); 83 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
80 nd->path.dentry = dentry_save; 84 nd->path.dentry = dentry_save;
81 nd->path.mnt = vfsmount_save; 85 nd->path.mnt = vfsmount_save;
86 nd->flags = flags_save;
82 return rc; 87 return rc;
83} 88}
84 89
@@ -264,7 +269,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
264 printk(KERN_ERR "%s: Out of memory whilst attempting " 269 printk(KERN_ERR "%s: Out of memory whilst attempting "
265 "to allocate ecryptfs_dentry_info struct\n", 270 "to allocate ecryptfs_dentry_info struct\n",
266 __func__); 271 __func__);
267 goto out_dput; 272 goto out_put;
268 } 273 }
269 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); 274 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
270 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); 275 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
@@ -339,14 +344,84 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
339out_free_kmem: 344out_free_kmem:
340 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 345 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
341 goto out; 346 goto out;
342out_dput: 347out_put:
343 dput(lower_dentry); 348 dput(lower_dentry);
349 mntput(lower_mnt);
344 d_drop(ecryptfs_dentry); 350 d_drop(ecryptfs_dentry);
345out: 351out:
346 return rc; 352 return rc;
347} 353}
348 354
349/** 355/**
356 * ecryptfs_new_lower_dentry
357 * @name: The name of the new dentry.
358 * @lower_dir_dentry: Parent directory of the new dentry.
359 * @nd: nameidata from last lookup.
360 *
 361 * Create a new dentry or get it from the lower parent dir.
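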
362 */
363static struct dentry *
364ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
365 struct nameidata *nd)
366{
367 struct dentry *new_dentry;
368 struct dentry *tmp;
369 struct inode *lower_dir_inode;
370
371 lower_dir_inode = lower_dir_dentry->d_inode;
372
373 tmp = d_alloc(lower_dir_dentry, name);
374 if (!tmp)
375 return ERR_PTR(-ENOMEM);
376
377 mutex_lock(&lower_dir_inode->i_mutex);
378 new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
379 mutex_unlock(&lower_dir_inode->i_mutex);
380
381 if (!new_dentry)
382 new_dentry = tmp;
383 else
384 dput(tmp);
385
386 return new_dentry;
387}
388
389
390/**
391 * ecryptfs_lookup_one_lower
392 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
393 * @lower_dir_dentry: lower parent directory
394 * @name: lower file name
395 *
 396 * Get the lower dentry from the VFS. If the lower dentry does not exist yet,
397 * create it.
398 */
399static struct dentry *
400ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
401 struct dentry *lower_dir_dentry, struct qstr *name)
402{
403 struct nameidata nd;
404 struct vfsmount *lower_mnt;
405 int err;
406
407 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
408 ecryptfs_dentry->d_parent));
 409 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name, 0, &nd);
410 mntput(lower_mnt);
411
412 if (!err) {
 413 /* we don't need the mount */
414 mntput(nd.path.mnt);
415 return nd.path.dentry;
416 }
417 if (err != -ENOENT)
418 return ERR_PTR(err);
419
420 /* create a new lower dentry */
421 return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
422}
423
424/**
350 * ecryptfs_lookup 425 * ecryptfs_lookup
351 * @ecryptfs_dir_inode: The eCryptfs directory inode 426 * @ecryptfs_dir_inode: The eCryptfs directory inode
352 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up 427 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
@@ -363,6 +438,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
363 size_t encrypted_and_encoded_name_size; 438 size_t encrypted_and_encoded_name_size;
364 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 439 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
365 struct dentry *lower_dir_dentry, *lower_dentry; 440 struct dentry *lower_dir_dentry, *lower_dentry;
441 struct qstr lower_name;
366 int rc = 0; 442 int rc = 0;
367 443
368 ecryptfs_dentry->d_op = &ecryptfs_dops; 444 ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -373,14 +449,20 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
373 goto out_d_drop; 449 goto out_d_drop;
374 } 450 }
375 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 451 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
376 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 452 lower_name.name = ecryptfs_dentry->d_name.name;
377 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, 453 lower_name.len = ecryptfs_dentry->d_name.len;
378 lower_dir_dentry, 454 lower_name.hash = ecryptfs_dentry->d_name.hash;
379 ecryptfs_dentry->d_name.len); 455 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
380 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 456 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
457 &lower_name);
458 if (rc < 0)
459 goto out_d_drop;
460 }
461 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
462 lower_dir_dentry, &lower_name);
381 if (IS_ERR(lower_dentry)) { 463 if (IS_ERR(lower_dentry)) {
382 rc = PTR_ERR(lower_dentry); 464 rc = PTR_ERR(lower_dentry);
383 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 465 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
384 "[%d] on lower_dentry = [%s]\n", __func__, rc, 466 "[%d] on lower_dentry = [%s]\n", __func__, rc,
385 encrypted_and_encoded_name); 467 encrypted_and_encoded_name);
386 goto out_d_drop; 468 goto out_d_drop;
@@ -402,14 +484,20 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
402 "filename; rc = [%d]\n", __func__, rc); 484 "filename; rc = [%d]\n", __func__, rc);
403 goto out_d_drop; 485 goto out_d_drop;
404 } 486 }
405 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 487 lower_name.name = encrypted_and_encoded_name;
406 lower_dentry = lookup_one_len(encrypted_and_encoded_name, 488 lower_name.len = encrypted_and_encoded_name_size;
407 lower_dir_dentry, 489 lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
408 encrypted_and_encoded_name_size - 1); 490 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
409 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 491 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
492 &lower_name);
493 if (rc < 0)
494 goto out_d_drop;
495 }
496 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
497 lower_dir_dentry, &lower_name);
410 if (IS_ERR(lower_dentry)) { 498 if (IS_ERR(lower_dentry)) {
411 rc = PTR_ERR(lower_dentry); 499 rc = PTR_ERR(lower_dentry);
412 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 500 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
413 "[%d] on lower_dentry = [%s]\n", __func__, rc, 501 "[%d] on lower_dentry = [%s]\n", __func__, rc,
414 encrypted_and_encoded_name); 502 encrypted_and_encoded_name);
415 goto out_d_drop; 503 goto out_d_drop;
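Both lookup paths now build a struct qstr by hand and give the lower filesystem's ->d_hash a chance to rewrite the hash (as case-folding filesystems do) before ecryptfs_lookup_one_lower() resolves the name, instead of calling lookup_one_len() under the lower directory's i_mutex. A sketch of the qstr preparation (helper name hypothetical; the first hunk copies the upper dentry's hash, the second recomputes it as shown here):

#include <linux/dcache.h>

/*
 * Sketch: fill a qstr for a lower-fs lookup. Start from the default
 * hash, then let the lower directory's ->d_hash replace it if the
 * filesystem hashes names specially.
 */
static int example_prepare_lower_name(struct dentry *lower_dir,
                                      const unsigned char *name,
                                      unsigned int len, struct qstr *q)
{
        q->name = name;
        q->len = len;
        q->hash = full_name_hash(name, len);
        if (lower_dir->d_op && lower_dir->d_op->d_hash)
                return lower_dir->d_op->d_hash(lower_dir, q);
        return 0;
}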
@@ -804,10 +892,20 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
804 size_t num_zeros = (PAGE_CACHE_SIZE 892 size_t num_zeros = (PAGE_CACHE_SIZE
805 - (ia->ia_size & ~PAGE_CACHE_MASK)); 893 - (ia->ia_size & ~PAGE_CACHE_MASK));
806 894
895
896 /*
 897 * XXX(truncate) this should really happen at the beginning
 898 * of ->setattr. But the code is too messy to do that as part
 899 * of a larger patch. ecryptfs is also totally missing out
 900 * on the inode_change_ok check at the beginning of
 901 * ->setattr, which would include this.
902 */
903 rc = inode_newsize_ok(inode, ia->ia_size);
904 if (rc)
905 goto out;
906
807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 907 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
808 rc = simple_setsize(inode, ia->ia_size); 908 truncate_setsize(inode, ia->ia_size);
809 if (rc)
810 goto out;
811 lower_ia->ia_size = ia->ia_size; 909 lower_ia->ia_size = ia->ia_size;
812 lower_ia->ia_valid |= ATTR_SIZE; 910 lower_ia->ia_valid |= ATTR_SIZE;
813 goto out; 911 goto out;
@@ -830,7 +928,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
830 goto out; 928 goto out;
831 } 929 }
832 } 930 }
833 simple_setsize(inode, ia->ia_size); 931 truncate_setsize(inode, ia->ia_size);
834 rc = ecryptfs_write_inode_size_to_metadata(inode); 932 rc = ecryptfs_write_inode_size_to_metadata(inode);
835 if (rc) { 933 if (rc) {
836 printk(KERN_ERR "Problem with " 934 printk(KERN_ERR "Problem with "
@@ -1015,10 +1113,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1015 rc = -EOPNOTSUPP; 1113 rc = -EOPNOTSUPP;
1016 goto out; 1114 goto out;
1017 } 1115 }
1018 mutex_lock(&lower_dentry->d_inode->i_mutex); 1116
1019 rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value, 1117 rc = vfs_setxattr(lower_dentry, name, value, size, flags);
1020 size, flags);
1021 mutex_unlock(&lower_dentry->d_inode->i_mutex);
1022out: 1118out:
1023 return rc; 1119 return rc;
1024} 1120}
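The setxattr hunk swaps the open-coded "take lower i_mutex, call i_op->setxattr" sequence for vfs_setxattr(), which performs its own locking plus the permission and security checks the direct call skipped. A sketch of the delegation (wrapper name hypothetical):

#include <linux/xattr.h>

/*
 * Sketch: write an xattr on the lower dentry through the VFS helper.
 * vfs_setxattr() takes the lower inode's i_mutex and runs the
 * security hooks itself, so the caller must not hold that mutex.
 */
static int example_set_lower_xattr(struct dentry *lower_dentry,
                                   const char *name, const void *value,
                                   size_t size, int flags)
{
        return vfs_setxattr(lower_dentry, name, value, size, flags);
}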
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506e..b1f6858a522 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -446,6 +446,7 @@ out:
446 */ 446 */
447static int 447static int
448ecryptfs_find_auth_tok_for_sig( 448ecryptfs_find_auth_tok_for_sig(
449 struct key **auth_tok_key,
449 struct ecryptfs_auth_tok **auth_tok, 450 struct ecryptfs_auth_tok **auth_tok,
450 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 451 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
451 char *sig) 452 char *sig)
@@ -453,12 +454,21 @@ ecryptfs_find_auth_tok_for_sig(
453 struct ecryptfs_global_auth_tok *global_auth_tok; 454 struct ecryptfs_global_auth_tok *global_auth_tok;
454 int rc = 0; 455 int rc = 0;
455 456
457 (*auth_tok_key) = NULL;
456 (*auth_tok) = NULL; 458 (*auth_tok) = NULL;
457 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, 459 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
458 mount_crypt_stat, sig)) { 460 mount_crypt_stat, sig)) {
459 struct key *auth_tok_key;
460 461
461 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, 462 /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
 463 * mount_crypt_stat structure, we prevent the use of auth toks that
 464 * were not inserted through the ecryptfs_add_global_auth_tok
465 * function.
466 */
467 if (mount_crypt_stat->flags
468 & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
469 return -EINVAL;
470
471 rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
462 sig); 472 sig);
463 } else 473 } else
464 (*auth_tok) = global_auth_tok->global_auth_tok; 474 (*auth_tok) = global_auth_tok->global_auth_tok;
@@ -509,12 +519,14 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
509 char *filename, size_t filename_size) 519 char *filename, size_t filename_size)
510{ 520{
511 struct ecryptfs_write_tag_70_packet_silly_stack *s; 521 struct ecryptfs_write_tag_70_packet_silly_stack *s;
522 struct key *auth_tok_key = NULL;
512 int rc = 0; 523 int rc = 0;
513 524
514 s = kmalloc(sizeof(*s), GFP_KERNEL); 525 s = kmalloc(sizeof(*s), GFP_KERNEL);
515 if (!s) { 526 if (!s) {
516 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 527 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
517 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 528 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
529 rc = -ENOMEM;
518 goto out; 530 goto out;
519 } 531 }
520 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 532 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -605,6 +617,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
605 } 617 }
606 dest[s->i++] = s->cipher_code; 618 dest[s->i++] = s->cipher_code;
607 rc = ecryptfs_find_auth_tok_for_sig( 619 rc = ecryptfs_find_auth_tok_for_sig(
620 &auth_tok_key,
608 &s->auth_tok, mount_crypt_stat, 621 &s->auth_tok, mount_crypt_stat,
609 mount_crypt_stat->global_default_fnek_sig); 622 mount_crypt_stat->global_default_fnek_sig);
610 if (rc) { 623 if (rc) {
@@ -752,6 +765,8 @@ out_free_unlock:
752out_unlock: 765out_unlock:
753 mutex_unlock(s->tfm_mutex); 766 mutex_unlock(s->tfm_mutex);
754out: 767out:
768 if (auth_tok_key)
769 key_put(auth_tok_key);
755 kfree(s); 770 kfree(s);
756 return rc; 771 return rc;
757} 772}
@@ -797,6 +812,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
797 char *data, size_t max_packet_size) 812 char *data, size_t max_packet_size)
798{ 813{
799 struct ecryptfs_parse_tag_70_packet_silly_stack *s; 814 struct ecryptfs_parse_tag_70_packet_silly_stack *s;
815 struct key *auth_tok_key = NULL;
800 int rc = 0; 816 int rc = 0;
801 817
802 (*packet_size) = 0; 818 (*packet_size) = 0;
@@ -806,6 +822,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
806 if (!s) { 822 if (!s) {
807 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 823 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
808 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 824 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
825 rc = -ENOMEM;
809 goto out; 826 goto out;
810 } 827 }
811 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 828 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -908,7 +925,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
908 * >= ECRYPTFS_MAX_IV_BYTES. */ 925 * >= ECRYPTFS_MAX_IV_BYTES. */
909 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); 926 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
910 s->desc.info = s->iv; 927 s->desc.info = s->iv;
911 rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, 928 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
929 &s->auth_tok, mount_crypt_stat,
912 s->fnek_sig_hex); 930 s->fnek_sig_hex);
913 if (rc) { 931 if (rc) {
914 printk(KERN_ERR "%s: Error attempting to find auth tok for " 932 printk(KERN_ERR "%s: Error attempting to find auth tok for "
@@ -984,6 +1002,8 @@ out:
984 (*filename_size) = 0; 1002 (*filename_size) = 0;
985 (*filename) = NULL; 1003 (*filename) = NULL;
986 } 1004 }
1005 if (auth_tok_key)
1006 key_put(auth_tok_key);
987 kfree(s); 1007 kfree(s);
988 return rc; 1008 return rc;
989} 1009}
@@ -1555,14 +1575,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1555 ECRYPTFS_VERSION_MAJOR, 1575 ECRYPTFS_VERSION_MAJOR,
1556 ECRYPTFS_VERSION_MINOR); 1576 ECRYPTFS_VERSION_MINOR);
1557 rc = -EINVAL; 1577 rc = -EINVAL;
1558 goto out; 1578 goto out_release_key;
1559 } 1579 }
1560 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD 1580 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
1561 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { 1581 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
1562 printk(KERN_ERR "Invalid auth_tok structure " 1582 printk(KERN_ERR "Invalid auth_tok structure "
1563 "returned from key query\n"); 1583 "returned from key query\n");
1564 rc = -EINVAL; 1584 rc = -EINVAL;
1565 goto out; 1585 goto out_release_key;
1586 }
1587out_release_key:
1588 if (rc) {
1589 key_put(*auth_tok_key);
1590 (*auth_tok_key) = NULL;
1566 } 1591 }
1567out: 1592out:
1568 return rc; 1593 return rc;
@@ -1686,6 +1711,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1686 struct ecryptfs_auth_tok_list_item *auth_tok_list_item; 1711 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
1687 size_t tag_11_contents_size; 1712 size_t tag_11_contents_size;
1688 size_t tag_11_packet_size; 1713 size_t tag_11_packet_size;
1714 struct key *auth_tok_key = NULL;
1689 int rc = 0; 1715 int rc = 0;
1690 1716
1691 INIT_LIST_HEAD(&auth_tok_list); 1717 INIT_LIST_HEAD(&auth_tok_list);
@@ -1782,6 +1808,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1782 * just one will be sufficient to decrypt to get the FEK. */ 1808 * just one will be sufficient to decrypt to get the FEK. */
1783find_next_matching_auth_tok: 1809find_next_matching_auth_tok:
1784 found_auth_tok = 0; 1810 found_auth_tok = 0;
1811 if (auth_tok_key) {
1812 key_put(auth_tok_key);
1813 auth_tok_key = NULL;
1814 }
1785 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { 1815 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
1786 candidate_auth_tok = &auth_tok_list_item->auth_tok; 1816 candidate_auth_tok = &auth_tok_list_item->auth_tok;
1787 if (unlikely(ecryptfs_verbosity > 0)) { 1817 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1798,10 +1828,11 @@ find_next_matching_auth_tok:
1798 rc = -EINVAL; 1828 rc = -EINVAL;
1799 goto out_wipe_list; 1829 goto out_wipe_list;
1800 } 1830 }
1801 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, 1831 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
1832 &matching_auth_tok,
1802 crypt_stat->mount_crypt_stat, 1833 crypt_stat->mount_crypt_stat,
1803 candidate_auth_tok_sig); 1834 candidate_auth_tok_sig);
1804 if (matching_auth_tok) { 1835 if (!rc) {
1805 found_auth_tok = 1; 1836 found_auth_tok = 1;
1806 goto found_matching_auth_tok; 1837 goto found_matching_auth_tok;
1807 } 1838 }
@@ -1864,6 +1895,8 @@ found_matching_auth_tok:
1864out_wipe_list: 1895out_wipe_list:
1865 wipe_auth_tok_list(&auth_tok_list); 1896 wipe_auth_tok_list(&auth_tok_list);
1866out: 1897out:
1898 if (auth_tok_key)
1899 key_put(auth_tok_key);
1867 return rc; 1900 return rc;
1868} 1901}
1869 1902
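The keystore.c changes make the key reference explicit: ecryptfs_find_auth_tok_for_sig() now hands back the struct key it pinned through a new auth_tok_key argument, and every caller gains a key_put() on each exit path (including the label that restarts the candidate loop), closing the leak where the key obtained from the keyring was never released. A sketch of the ownership rule (example_find_key is a hypothetical stand-in for the real lookup):

#include <linux/errno.h>
#include <linux/key.h>

/* Hypothetical stand-in for ecryptfs_find_auth_tok_for_sig(). */
static int example_find_key(struct key **key)
{
        *key = NULL;            /* a real lookup would pin a key here */
        return -ENOKEY;
}

static int example_use_auth_tok(void)
{
        struct key *key;
        int rc;

        rc = example_find_key(&key);    /* pins *key on success */
        if (rc)
                return rc;              /* nothing pinned on failure */
        /* ... use the auth tok the key backs ... */
        key_put(key);                   /* drop the reference exactly once */
        return 0;
}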
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aaf..0851ab6980f 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
86 return 0; 86 return 0;
87} 87}
88 88
89int ecryptfs_init_kthread(void) 89int __init ecryptfs_init_kthread(void)
90{ 90{
91 int rc = 0; 91 int rc = 0;
92 92
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb2..a9dbd62518e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -208,7 +208,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_unlink_sigs, ecryptfs_opt_err }; 211 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
212 ecryptfs_opt_err };
212 213
213static const match_table_t tokens = { 214static const match_table_t tokens = {
214 {ecryptfs_opt_sig, "sig=%s"}, 215 {ecryptfs_opt_sig, "sig=%s"},
@@ -223,6 +224,7 @@ static const match_table_t tokens = {
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, 224 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 225 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
225 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, 226 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
227 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
226 {ecryptfs_opt_err, NULL} 228 {ecryptfs_opt_err, NULL}
227}; 229};
228 230
@@ -406,6 +408,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
406 case ecryptfs_opt_unlink_sigs: 408 case ecryptfs_opt_unlink_sigs:
407 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; 409 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
408 break; 410 break;
411 case ecryptfs_opt_mount_auth_tok_only:
412 mount_crypt_stat->flags |=
413 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
414 break;
409 case ecryptfs_opt_err: 415 case ecryptfs_opt_err:
410 default: 416 default:
411 printk(KERN_WARNING 417 printk(KERN_WARNING
@@ -540,9 +546,8 @@ out:
540 * ecryptfs_interpose to perform most of the linking 546 * ecryptfs_interpose to perform most of the linking
541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) 547 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
542 */ 548 */
543static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, 549static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
544 const char *dev_name, void *raw_data, 550 const char *dev_name, void *raw_data)
545 struct vfsmount *mnt)
546{ 551{
547 struct super_block *s; 552 struct super_block *s;
548 struct ecryptfs_sb_info *sbi; 553 struct ecryptfs_sb_info *sbi;
@@ -607,8 +612,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
607 err = "Reading sb failed"; 612 err = "Reading sb failed";
608 goto out; 613 goto out;
609 } 614 }
610 simple_set_mnt(mnt, s); 615 return dget(s->s_root);
611 return 0;
612 616
613out: 617out:
614 if (sbi) { 618 if (sbi) {
@@ -616,7 +620,7 @@ out:
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi); 620 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 } 621 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc); 622 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
619 return rc; 623 return ERR_PTR(rc);
620} 624}
621 625
622/** 626/**
@@ -639,7 +643,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
639static struct file_system_type ecryptfs_fs_type = { 643static struct file_system_type ecryptfs_fs_type = {
640 .owner = THIS_MODULE, 644 .owner = THIS_MODULE,
641 .name = "ecryptfs", 645 .name = "ecryptfs",
642 .get_sb = ecryptfs_get_sb, 646 .mount = ecryptfs_mount,
643 .kill_sb = ecryptfs_kill_block_super, 647 .kill_sb = ecryptfs_kill_block_super,
644 .fs_flags = 0 648 .fs_flags = 0
645}; 649};
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 46c4dd8dfcc..ab224809051 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -274,7 +274,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
274 struct user_namespace *user_ns, struct pid *pid, 274 struct user_namespace *user_ns, struct pid *pid,
275 u32 seq) 275 u32 seq)
276{ 276{
277 struct ecryptfs_daemon *daemon; 277 struct ecryptfs_daemon *uninitialized_var(daemon);
278 struct ecryptfs_msg_ctx *msg_ctx; 278 struct ecryptfs_msg_ctx *msg_ctx;
279 size_t msg_size; 279 size_t msg_size;
280 struct nsproxy *nsproxy; 280 struct nsproxy *nsproxy;
@@ -473,7 +473,7 @@ sleep:
473 return rc; 473 return rc;
474} 474}
475 475
476int ecryptfs_init_messaging(void) 476int __init ecryptfs_init_messaging(void)
477{ 477{
478 int i; 478 int i;
479 int rc = 0; 479 int rc = 0;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd..940a82e63dc 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -482,6 +482,7 @@ static const struct file_operations ecryptfs_miscdev_fops = {
482 .read = ecryptfs_miscdev_read, 482 .read = ecryptfs_miscdev_read,
483 .write = ecryptfs_miscdev_write, 483 .write = ecryptfs_miscdev_write,
484 .release = ecryptfs_miscdev_release, 484 .release = ecryptfs_miscdev_release,
485 .llseek = noop_llseek,
485}; 486};
486 487
487static struct miscdevice ecryptfs_miscdev = { 488static struct miscdevice ecryptfs_miscdev = {
@@ -500,7 +501,7 @@ static struct miscdevice ecryptfs_miscdev = {
500 * 501 *
501 * Returns zero on success; non-zero otherwise 502 * Returns zero on success; non-zero otherwise
502 */ 503 */
503int ecryptfs_init_ecryptfs_miscdev(void) 504int __init ecryptfs_init_ecryptfs_miscdev(void)
504{ 505{
505 int rc; 506 int rc;
506 507
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0435886e4a9..253732382d3 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -118,11 +118,15 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
118 */ 118 */
119static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) 119static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
120{ 120{
121 return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf); 121 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
122
123 if (!lower_dentry->d_sb->s_op->statfs)
124 return -ENOSYS;
125 return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
122} 126}
123 127
124/** 128/**
125 * ecryptfs_clear_inode 129 * ecryptfs_evict_inode
126 * @inode - The ecryptfs inode 130 * @inode - The ecryptfs inode
127 * 131 *
 128 * Called by iput() when the inode reference count reaches zero 132 * Called by iput() when the inode reference count reaches zero
@@ -131,8 +135,10 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 131 * on the inode free list. We use this to drop our reference to the 135 * on the inode free list. We use this to drop our reference to the
132 * lower inode. 136 * lower inode.
133 */ 137 */
134static void ecryptfs_clear_inode(struct inode *inode) 138static void ecryptfs_evict_inode(struct inode *inode)
135{ 139{
140 truncate_inode_pages(&inode->i_data, 0);
141 end_writeback(inode);
136 iput(ecryptfs_inode_to_lower(inode)); 142 iput(ecryptfs_inode_to_lower(inode));
137} 143}
138 144
@@ -174,6 +180,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
174 seq_printf(m, ",ecryptfs_encrypted_view"); 180 seq_printf(m, ",ecryptfs_encrypted_view");
175 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) 181 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
176 seq_printf(m, ",ecryptfs_unlink_sigs"); 182 seq_printf(m, ",ecryptfs_unlink_sigs");
183 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
184 seq_printf(m, ",ecryptfs_mount_auth_tok_only");
177 185
178 return 0; 186 return 0;
179} 187}
@@ -184,6 +192,6 @@ const struct super_operations ecryptfs_sops = {
184 .drop_inode = generic_delete_inode, 192 .drop_inode = generic_delete_inode,
185 .statfs = ecryptfs_statfs, 193 .statfs = ecryptfs_statfs,
186 .remount_fs = NULL, 194 .remount_fs = NULL,
187 .clear_inode = ecryptfs_clear_inode, 195 .evict_inode = ecryptfs_evict_inode,
188 .show_options = ecryptfs_show_options 196 .show_options = ecryptfs_show_options
189}; 197};
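super.c converts from the old ->clear_inode to the new ->evict_inode, under which the filesystem itself truncates the inode's page cache and calls end_writeback() before its own teardown; statfs now delegates through the lower super_block explicitly. A sketch of the eviction contract for a stacking filesystem (the lower-inode accessor is hypothetical):

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical accessor for the stacked lower inode. */
static inline struct inode *example_lower_inode(struct inode *inode)
{
        return inode->i_private;
}

/*
 * Sketch of ->evict_inode: drop the page cache, mark writeback done,
 * then release whatever the upper inode pinned (here, a lower inode).
 */
static void example_evict_inode(struct inode *inode)
{
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
        iput(example_lower_inode(inode));
}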
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f0494281081..5073a07652c 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -20,16 +20,16 @@
20static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); 20static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
21static int efs_fill_super(struct super_block *s, void *d, int silent); 21static int efs_fill_super(struct super_block *s, void *d, int silent);
22 22
23static int efs_get_sb(struct file_system_type *fs_type, 23static struct dentry *efs_mount(struct file_system_type *fs_type,
24 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 24 int flags, const char *dev_name, void *data)
25{ 25{
26 return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt); 26 return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
27} 27}
28 28
29static struct file_system_type efs_fs_type = { 29static struct file_system_type efs_fs_type = {
30 .owner = THIS_MODULE, 30 .owner = THIS_MODULE,
31 .name = "efs", 31 .name = "efs",
32 .get_sb = efs_get_sb, 32 .mount = efs_mount,
33 .kill_sb = kill_block_super, 33 .kill_sb = kill_block_super,
34 .fs_flags = FS_REQUIRES_DEV, 34 .fs_flags = FS_REQUIRES_DEV,
35}; 35};
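The efs hunk is the mechanical ->get_sb to ->mount conversion seen across this series: the new hook returns the root dentry (or an ERR_PTR) instead of filling in a vfsmount, and get_sb_bdev() becomes mount_bdev(). A sketch of the pattern for a hypothetical block-based filesystem:

#include <linux/fs.h>
#include <linux/module.h>

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
        return -EINVAL;         /* a real fs would parse the on-disk sb */
}

static struct dentry *example_mount(struct file_system_type *fs_type,
                                    int flags, const char *dev_name,
                                    void *data)
{
        /* mount_bdev() replaces get_sb_bdev(); no vfsmount to fill in */
        return mount_bdev(fs_type, flags, dev_name, data,
                          example_fill_super);
}

static struct file_system_type example_fs_type = {
        .owner    = THIS_MODULE,
        .name     = "example",
        .mount    = example_mount,
        .kill_sb  = kill_block_super,
        .fs_flags = FS_REQUIRES_DEV,
};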
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6bd3f76fdf8..e0194b3e14d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -293,6 +293,7 @@ static const struct file_operations eventfd_fops = {
293 .poll = eventfd_poll, 293 .poll = eventfd_poll,
294 .read = eventfd_read, 294 .read = eventfd_read,
295 .write = eventfd_write, 295 .write = eventfd_write,
296 .llseek = noop_llseek,
296}; 297};
297 298
298/** 299/**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3817149919c..8cf07242067 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -77,9 +77,6 @@
77/* Maximum number of nesting allowed inside epoll sets */ 77/* Maximum number of nesting allowed inside epoll sets */
78#define EP_MAX_NESTS 4 78#define EP_MAX_NESTS 4
79 79
80/* Maximum msec timeout value storeable in a long int */
81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
82
83#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 80#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
84 81
85#define EP_UNACTIVE_PTR ((void *) -1L) 82#define EP_UNACTIVE_PTR ((void *) -1L)
@@ -674,7 +671,8 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
674/* File callbacks that implement the eventpoll file behaviour */ 671/* File callbacks that implement the eventpoll file behaviour */
675static const struct file_operations eventpoll_fops = { 672static const struct file_operations eventpoll_fops = {
676 .release = ep_eventpoll_release, 673 .release = ep_eventpoll_release,
677 .poll = ep_eventpoll_poll 674 .poll = ep_eventpoll_poll,
675 .llseek = noop_llseek,
678}; 676};
679 677
 680/* Fast test to see if the file is an eventpoll file */ 678
@@ -1116,18 +1114,22 @@ static int ep_send_events(struct eventpoll *ep,
1116static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1114static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1117 int maxevents, long timeout) 1115 int maxevents, long timeout)
1118{ 1116{
1119 int res, eavail; 1117 int res, eavail, timed_out = 0;
1120 unsigned long flags; 1118 unsigned long flags;
1121 long jtimeout; 1119 long slack;
1122 wait_queue_t wait; 1120 wait_queue_t wait;
1123 1121 struct timespec end_time;
1124 /* 1122 ktime_t expires, *to = NULL;
1125 * Calculate the timeout by checking for the "infinite" value (-1) 1123
1126 * and the overflow condition. The passed timeout is in milliseconds, 1124 if (timeout > 0) {
1127 * that why (t * HZ) / 1000. 1125 ktime_get_ts(&end_time);
1128 */ 1126 timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
1129 jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ? 1127 slack = select_estimate_accuracy(&end_time);
1130 MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000; 1128 to = &expires;
1129 *to = timespec_to_ktime(end_time);
1130 } else if (timeout == 0) {
1131 timed_out = 1;
1132 }
1131 1133
1132retry: 1134retry:
1133 spin_lock_irqsave(&ep->lock, flags); 1135 spin_lock_irqsave(&ep->lock, flags);
@@ -1149,7 +1151,7 @@ retry:
1149 * to TASK_INTERRUPTIBLE before doing the checks. 1151 * to TASK_INTERRUPTIBLE before doing the checks.
1150 */ 1152 */
1151 set_current_state(TASK_INTERRUPTIBLE); 1153 set_current_state(TASK_INTERRUPTIBLE);
1152 if (!list_empty(&ep->rdllist) || !jtimeout) 1154 if (!list_empty(&ep->rdllist) || timed_out)
1153 break; 1155 break;
1154 if (signal_pending(current)) { 1156 if (signal_pending(current)) {
1155 res = -EINTR; 1157 res = -EINTR;
@@ -1157,7 +1159,9 @@ retry:
1157 } 1159 }
1158 1160
1159 spin_unlock_irqrestore(&ep->lock, flags); 1161 spin_unlock_irqrestore(&ep->lock, flags);
1160 jtimeout = schedule_timeout(jtimeout); 1162 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
1163 timed_out = 1;
1164
1161 spin_lock_irqsave(&ep->lock, flags); 1165 spin_lock_irqsave(&ep->lock, flags);
1162 } 1166 }
1163 __remove_wait_queue(&ep->wq, &wait); 1167 __remove_wait_queue(&ep->wq, &wait);
@@ -1175,7 +1179,7 @@ retry:
1175 * more luck. 1179 * more luck.
1176 */ 1180 */
1177 if (!res && eavail && 1181 if (!res && eavail &&
1178 !(res = ep_send_events(ep, events, maxevents)) && jtimeout) 1182 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1179 goto retry; 1183 goto retry;
1180 1184
1181 return res; 1185 return res;
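ep_poll() moves from a jiffies-granular schedule_timeout() to an absolute high-resolution deadline with a select-style slack, so millisecond timeouts are honoured precisely and the old EP_MAX_MSTIMEO overflow clamp can go. A sketch of the deadline computation the new code performs (function name hypothetical):

#include <linux/ktime.h>
#include <linux/time.h>

/*
 * Sketch: turn a relative timeout in milliseconds into the absolute
 * ktime_t that schedule_hrtimeout_range(..., HRTIMER_MODE_ABS) waits
 * on; timeout <= 0 is handled separately (0 = poll, < 0 = infinite).
 */
static ktime_t example_epoll_deadline(long timeout_ms)
{
        struct timespec end_time;

        ktime_get_ts(&end_time);
        timespec_add_ns(&end_time, (u64)timeout_ms * NSEC_PER_MSEC);
        return timespec_to_ktime(end_time);
}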
diff --git a/fs/exec.c b/fs/exec.c
index e19de6a8033..99d33a1371e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -28,7 +28,6 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/stat.h> 29#include <linux/stat.h>
30#include <linux/fcntl.h> 30#include <linux/fcntl.h>
31#include <linux/smp_lock.h>
32#include <linux/swap.h> 31#include <linux/swap.h>
33#include <linux/string.h> 32#include <linux/string.h>
34#include <linux/init.h> 33#include <linux/init.h>
@@ -55,6 +54,7 @@
55#include <linux/fsnotify.h> 54#include <linux/fsnotify.h>
56#include <linux/fs_struct.h> 55#include <linux/fs_struct.h>
57#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
58 58
59#include <asm/uaccess.h> 59#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
@@ -66,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core";
66unsigned int core_pipe_limit; 66unsigned int core_pipe_limit;
67int suid_dumpable = 0; 67int suid_dumpable = 0;
68 68
69struct core_name {
70 char *corename;
71 int used, size;
72};
73static atomic_t call_count = ATOMIC_INIT(1);
74
69/* The maximal length of core_pattern is also specified in sysctl.c */ 75/* The maximal length of core_pattern is also specified in sysctl.c */
70 76
71static LIST_HEAD(formats); 77static LIST_HEAD(formats);
@@ -129,7 +135,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
129 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) 135 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
130 goto exit; 136 goto exit;
131 137
132 fsnotify_open(file->f_path.dentry); 138 fsnotify_open(file);
133 139
134 error = -ENOEXEC; 140 error = -ENOEXEC;
135 if(file->f_op) { 141 if(file->f_op) {
@@ -362,13 +368,13 @@ err:
362/* 368/*
363 * count() counts the number of strings in array ARGV. 369 * count() counts the number of strings in array ARGV.
364 */ 370 */
365static int count(char __user * __user * argv, int max) 371static int count(const char __user * const __user * argv, int max)
366{ 372{
367 int i = 0; 373 int i = 0;
368 374
369 if (argv != NULL) { 375 if (argv != NULL) {
370 for (;;) { 376 for (;;) {
371 char __user * p; 377 const char __user * p;
372 378
373 if (get_user(p, argv)) 379 if (get_user(p, argv))
374 return -EFAULT; 380 return -EFAULT;
@@ -377,6 +383,9 @@ static int count(char __user * __user * argv, int max)
377 argv++; 383 argv++;
378 if (i++ >= max) 384 if (i++ >= max)
379 return -E2BIG; 385 return -E2BIG;
386
387 if (fatal_signal_pending(current))
388 return -ERESTARTNOHAND;
380 cond_resched(); 389 cond_resched();
381 } 390 }
382 } 391 }
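count() here, and copy_strings() just below, walk user-supplied argv arrays of essentially unbounded length, so besides cond_resched() they now bail out with -ERESTARTNOHAND as soon as a fatal signal is pending, letting an OOM-killed or SIGKILLed execve die promptly. A sketch of the loop discipline (function name hypothetical):

#include <linux/errno.h>
#include <linux/sched.h>

/*
 * Sketch: any long, user-controlled loop should both yield the CPU
 * and abort on a fatal signal; -ERESTARTNOHAND never reaches
 * userspace because the task is about to die anyway.
 */
static int example_scan_user_array(unsigned long nr_entries)
{
        unsigned long i;

        for (i = 0; i < nr_entries; i++) {
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
                /* ... fetch and examine entry i ... */
        }
        return 0;
}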
@@ -388,7 +397,7 @@ static int count(char __user * __user * argv, int max)
 388 * process's memory to the new process's stack. The call to get_user_pages() 397 * process's memory to the new process's stack. The call to get_user_pages()
389 * ensures the destination page is created and not swapped out. 398 * ensures the destination page is created and not swapped out.
390 */ 399 */
391static int copy_strings(int argc, char __user * __user * argv, 400static int copy_strings(int argc, const char __user *const __user *argv,
392 struct linux_binprm *bprm) 401 struct linux_binprm *bprm)
393{ 402{
394 struct page *kmapped_page = NULL; 403 struct page *kmapped_page = NULL;
@@ -397,7 +406,7 @@ static int copy_strings(int argc, char __user * __user * argv,
397 int ret; 406 int ret;
398 407
399 while (argc-- > 0) { 408 while (argc-- > 0) {
400 char __user *str; 409 const char __user *str;
401 int len; 410 int len;
402 unsigned long pos; 411 unsigned long pos;
403 412
@@ -420,6 +429,12 @@ static int copy_strings(int argc, char __user * __user * argv,
420 while (len > 0) { 429 while (len > 0) {
421 int offset, bytes_to_copy; 430 int offset, bytes_to_copy;
422 431
432 if (fatal_signal_pending(current)) {
433 ret = -ERESTARTNOHAND;
434 goto out;
435 }
436 cond_resched();
437
423 offset = pos % PAGE_SIZE; 438 offset = pos % PAGE_SIZE;
424 if (offset == 0) 439 if (offset == 0)
425 offset = PAGE_SIZE; 440 offset = PAGE_SIZE;
@@ -471,12 +486,13 @@ out:
471/* 486/*
472 * Like copy_strings, but get argv and its values from kernel memory. 487 * Like copy_strings, but get argv and its values from kernel memory.
473 */ 488 */
474int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) 489int copy_strings_kernel(int argc, const char *const *argv,
490 struct linux_binprm *bprm)
475{ 491{
476 int r; 492 int r;
477 mm_segment_t oldfs = get_fs(); 493 mm_segment_t oldfs = get_fs();
478 set_fs(KERNEL_DS); 494 set_fs(KERNEL_DS);
479 r = copy_strings(argc, (char __user * __user *)argv, bprm); 495 r = copy_strings(argc, (const char __user *const __user *)argv, bprm);
480 set_fs(oldfs); 496 set_fs(oldfs);
481 return r; 497 return r;
482} 498}
@@ -594,6 +610,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
594#else 610#else
595 stack_top = arch_align_stack(stack_top); 611 stack_top = arch_align_stack(stack_top);
596 stack_top = PAGE_ALIGN(stack_top); 612 stack_top = PAGE_ALIGN(stack_top);
613
614 if (unlikely(stack_top < mmap_min_addr) ||
615 unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
616 return -ENOMEM;
617
597 stack_shift = vma->vm_end - stack_top; 618 stack_shift = vma->vm_end - stack_top;
598 619
599 bprm->p -= stack_shift; 620 bprm->p -= stack_shift;
@@ -653,6 +674,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
653 else 674 else
654 stack_base = vma->vm_start - stack_expand; 675 stack_base = vma->vm_start - stack_expand;
655#endif 676#endif
677 current->mm->start_stack = bprm->p;
656 ret = expand_stack(vma, stack_base); 678 ret = expand_stack(vma, stack_base);
657 if (ret) 679 if (ret)
658 ret = -EFAULT; 680 ret = -EFAULT;
@@ -683,7 +705,7 @@ struct file *open_exec(const char *name)
683 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) 705 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
684 goto exit; 706 goto exit;
685 707
686 fsnotify_open(file->f_path.dentry); 708 fsnotify_open(file);
687 709
688 err = deny_write_access(file); 710 err = deny_write_access(file);
689 if (err) 711 if (err)
@@ -744,6 +766,10 @@ static int exec_mmap(struct mm_struct *mm)
744 tsk->mm = mm; 766 tsk->mm = mm;
745 tsk->active_mm = mm; 767 tsk->active_mm = mm;
746 activate_mm(active_mm, mm); 768 activate_mm(active_mm, mm);
769 if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
770 atomic_dec(&old_mm->oom_disable_count);
771 atomic_inc(&tsk->mm->oom_disable_count);
772 }
747 task_unlock(tsk); 773 task_unlock(tsk);
748 arch_pick_mmap_layout(mm); 774 arch_pick_mmap_layout(mm);
749 if (old_mm) { 775 if (old_mm) {
@@ -983,7 +1009,7 @@ int flush_old_exec(struct linux_binprm * bprm)
983 1009
984 bprm->mm = NULL; /* We're using it now */ 1010 bprm->mm = NULL; /* We're using it now */
985 1011
986 current->flags &= ~PF_RANDOMIZE; 1012 current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
987 flush_thread(); 1013 flush_thread();
988 current->personality &= ~bprm->per_clear; 1014 current->personality &= ~bprm->per_clear;
989 1015
@@ -997,7 +1023,7 @@ EXPORT_SYMBOL(flush_old_exec);
997void setup_new_exec(struct linux_binprm * bprm) 1023void setup_new_exec(struct linux_binprm * bprm)
998{ 1024{
999 int i, ch; 1025 int i, ch;
1000 char * name; 1026 const char *name;
1001 char tcomm[sizeof(current->comm)]; 1027 char tcomm[sizeof(current->comm)];
1002 1028
1003 arch_pick_mmap_layout(current->mm); 1029 arch_pick_mmap_layout(current->mm);
@@ -1063,14 +1089,14 @@ EXPORT_SYMBOL(setup_new_exec);
1063 */ 1089 */
1064int prepare_bprm_creds(struct linux_binprm *bprm) 1090int prepare_bprm_creds(struct linux_binprm *bprm)
1065{ 1091{
1066 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1092 if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1067 return -ERESTARTNOINTR; 1093 return -ERESTARTNOINTR;
1068 1094
1069 bprm->cred = prepare_exec_creds(); 1095 bprm->cred = prepare_exec_creds();
1070 if (likely(bprm->cred)) 1096 if (likely(bprm->cred))
1071 return 0; 1097 return 0;
1072 1098
1073 mutex_unlock(&current->cred_guard_mutex); 1099 mutex_unlock(&current->signal->cred_guard_mutex);
1074 return -ENOMEM; 1100 return -ENOMEM;
1075} 1101}
1076 1102
@@ -1078,7 +1104,7 @@ void free_bprm(struct linux_binprm *bprm)
1078{ 1104{
1079 free_arg_pages(bprm); 1105 free_arg_pages(bprm);
1080 if (bprm->cred) { 1106 if (bprm->cred) {
1081 mutex_unlock(&current->cred_guard_mutex); 1107 mutex_unlock(&current->signal->cred_guard_mutex);
1082 abort_creds(bprm->cred); 1108 abort_creds(bprm->cred);
1083 } 1109 }
1084 kfree(bprm); 1110 kfree(bprm);
@@ -1099,13 +1125,13 @@ void install_exec_creds(struct linux_binprm *bprm)
1099 * credentials; any time after this it may be unlocked. 1125 * credentials; any time after this it may be unlocked.
1100 */ 1126 */
1101 security_bprm_committed_creds(bprm); 1127 security_bprm_committed_creds(bprm);
1102 mutex_unlock(&current->cred_guard_mutex); 1128 mutex_unlock(&current->signal->cred_guard_mutex);
1103} 1129}
1104EXPORT_SYMBOL(install_exec_creds); 1130EXPORT_SYMBOL(install_exec_creds);
1105 1131
1106/* 1132/*
1107 * determine how safe it is to execute the proposed program 1133 * determine how safe it is to execute the proposed program
1108 * - the caller must hold current->cred_guard_mutex to protect against 1134 * - the caller must hold ->cred_guard_mutex to protect against
1109 * PTRACE_ATTACH 1135 * PTRACE_ATTACH
1110 */ 1136 */
1111int check_unsafe_exec(struct linux_binprm *bprm) 1137int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1117,7 +1143,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
1117 bprm->unsafe = tracehook_unsafe_exec(p); 1143 bprm->unsafe = tracehook_unsafe_exec(p);
1118 1144
1119 n_fs = 1; 1145 n_fs = 1;
1120 write_lock(&p->fs->lock); 1146 spin_lock(&p->fs->lock);
1121 rcu_read_lock(); 1147 rcu_read_lock();
1122 for (t = next_thread(p); t != p; t = next_thread(t)) { 1148 for (t = next_thread(p); t != p; t = next_thread(t)) {
1123 if (t->fs == p->fs) 1149 if (t->fs == p->fs)
@@ -1134,7 +1160,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
1134 res = 1; 1160 res = 1;
1135 } 1161 }
1136 } 1162 }
1137 write_unlock(&p->fs->lock); 1163 spin_unlock(&p->fs->lock);
1138 1164
1139 return res; 1165 return res;
1140} 1166}
@@ -1316,9 +1342,9 @@ EXPORT_SYMBOL(search_binary_handler);
1316/* 1342/*
1317 * sys_execve() executes a new program. 1343 * sys_execve() executes a new program.
1318 */ 1344 */
1319int do_execve(char * filename, 1345int do_execve(const char * filename,
1320 char __user *__user *argv, 1346 const char __user *const __user *argv,
1321 char __user *__user *envp, 1347 const char __user *const __user *envp,
1322 struct pt_regs * regs) 1348 struct pt_regs * regs)
1323{ 1349{
1324 struct linux_binprm *bprm; 1350 struct linux_binprm *bprm;
@@ -1386,7 +1412,6 @@ int do_execve(char * filename,
1386 if (retval < 0) 1412 if (retval < 0)
1387 goto out; 1413 goto out;
1388 1414
1389 current->flags &= ~PF_KTHREAD;
1390 retval = search_binary_handler(bprm,regs); 1415 retval = search_binary_handler(bprm,regs);
1391 if (retval < 0) 1416 if (retval < 0)
1392 goto out; 1417 goto out;
@@ -1439,127 +1464,148 @@ void set_binfmt(struct linux_binfmt *new)
1439 1464
1440EXPORT_SYMBOL(set_binfmt); 1465EXPORT_SYMBOL(set_binfmt);
1441 1466
1467static int expand_corename(struct core_name *cn)
1468{
1469 char *old_corename = cn->corename;
1470
1471 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
1472 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
1473
1474 if (!cn->corename) {
1475 kfree(old_corename);
1476 return -ENOMEM;
1477 }
1478
1479 return 0;
1480}
1481
1482static int cn_printf(struct core_name *cn, const char *fmt, ...)
1483{
1484 char *cur;
1485 int need;
1486 int ret;
1487 va_list arg;
1488
1489 va_start(arg, fmt);
1490 need = vsnprintf(NULL, 0, fmt, arg);
1491 va_end(arg);
1492
1493 if (likely(need < cn->size - cn->used - 1))
1494 goto out_printf;
1495
1496 ret = expand_corename(cn);
1497 if (ret)
1498 goto expand_fail;
1499
1500out_printf:
1501 cur = cn->corename + cn->used;
1502 va_start(arg, fmt);
1503 vsnprintf(cur, need + 1, fmt, arg);
1504 va_end(arg);
1505 cn->used += need;
1506 return 0;
1507
1508expand_fail:
1509 return ret;
1510}
1511
1442/* format_corename will inspect the pattern parameter, and output a 1512/* format_corename will inspect the pattern parameter, and output a
1443 * name into corename, which must have space for at least 1513 * name into corename, which must have space for at least
1444 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 1514 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1445 */ 1515 */
1446static int format_corename(char *corename, long signr) 1516static int format_corename(struct core_name *cn, long signr)
1447{ 1517{
1448 const struct cred *cred = current_cred(); 1518 const struct cred *cred = current_cred();
1449 const char *pat_ptr = core_pattern; 1519 const char *pat_ptr = core_pattern;
1450 int ispipe = (*pat_ptr == '|'); 1520 int ispipe = (*pat_ptr == '|');
1451 char *out_ptr = corename;
1452 char *const out_end = corename + CORENAME_MAX_SIZE;
1453 int rc;
1454 int pid_in_pattern = 0; 1521 int pid_in_pattern = 0;
1522 int err = 0;
1523
1524 cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
1525 cn->corename = kmalloc(cn->size, GFP_KERNEL);
1526 cn->used = 0;
1527
1528 if (!cn->corename)
1529 return -ENOMEM;
1455 1530
1456 /* Repeat as long as we have more pattern to process and more output 1531 /* Repeat as long as we have more pattern to process and more output
1457 space */ 1532 space */
1458 while (*pat_ptr) { 1533 while (*pat_ptr) {
1459 if (*pat_ptr != '%') { 1534 if (*pat_ptr != '%') {
1460 if (out_ptr == out_end) 1535 if (*pat_ptr == 0)
1461 goto out; 1536 goto out;
1462 *out_ptr++ = *pat_ptr++; 1537 err = cn_printf(cn, "%c", *pat_ptr++);
1463 } else { 1538 } else {
1464 switch (*++pat_ptr) { 1539 switch (*++pat_ptr) {
1540 /* single % at the end, drop that */
1465 case 0: 1541 case 0:
1466 goto out; 1542 goto out;
1467 /* Double percent, output one percent */ 1543 /* Double percent, output one percent */
1468 case '%': 1544 case '%':
1469 if (out_ptr == out_end) 1545 err = cn_printf(cn, "%c", '%');
1470 goto out;
1471 *out_ptr++ = '%';
1472 break; 1546 break;
1473 /* pid */ 1547 /* pid */
1474 case 'p': 1548 case 'p':
1475 pid_in_pattern = 1; 1549 pid_in_pattern = 1;
1476 rc = snprintf(out_ptr, out_end - out_ptr, 1550 err = cn_printf(cn, "%d",
1477 "%d", task_tgid_vnr(current)); 1551 task_tgid_vnr(current));
1478 if (rc > out_end - out_ptr)
1479 goto out;
1480 out_ptr += rc;
1481 break; 1552 break;
1482 /* uid */ 1553 /* uid */
1483 case 'u': 1554 case 'u':
1484 rc = snprintf(out_ptr, out_end - out_ptr, 1555 err = cn_printf(cn, "%d", cred->uid);
1485 "%d", cred->uid);
1486 if (rc > out_end - out_ptr)
1487 goto out;
1488 out_ptr += rc;
1489 break; 1556 break;
1490 /* gid */ 1557 /* gid */
1491 case 'g': 1558 case 'g':
1492 rc = snprintf(out_ptr, out_end - out_ptr, 1559 err = cn_printf(cn, "%d", cred->gid);
1493 "%d", cred->gid);
1494 if (rc > out_end - out_ptr)
1495 goto out;
1496 out_ptr += rc;
1497 break; 1560 break;
1498 /* signal that caused the coredump */ 1561 /* signal that caused the coredump */
1499 case 's': 1562 case 's':
1500 rc = snprintf(out_ptr, out_end - out_ptr, 1563 err = cn_printf(cn, "%ld", signr);
1501 "%ld", signr);
1502 if (rc > out_end - out_ptr)
1503 goto out;
1504 out_ptr += rc;
1505 break; 1564 break;
1506 /* UNIX time of coredump */ 1565 /* UNIX time of coredump */
1507 case 't': { 1566 case 't': {
1508 struct timeval tv; 1567 struct timeval tv;
1509 do_gettimeofday(&tv); 1568 do_gettimeofday(&tv);
1510 rc = snprintf(out_ptr, out_end - out_ptr, 1569 err = cn_printf(cn, "%lu", tv.tv_sec);
1511 "%lu", tv.tv_sec);
1512 if (rc > out_end - out_ptr)
1513 goto out;
1514 out_ptr += rc;
1515 break; 1570 break;
1516 } 1571 }
1517 /* hostname */ 1572 /* hostname */
1518 case 'h': 1573 case 'h':
1519 down_read(&uts_sem); 1574 down_read(&uts_sem);
1520 rc = snprintf(out_ptr, out_end - out_ptr, 1575 err = cn_printf(cn, "%s",
1521 "%s", utsname()->nodename); 1576 utsname()->nodename);
1522 up_read(&uts_sem); 1577 up_read(&uts_sem);
1523 if (rc > out_end - out_ptr)
1524 goto out;
1525 out_ptr += rc;
1526 break; 1578 break;
1527 /* executable */ 1579 /* executable */
1528 case 'e': 1580 case 'e':
1529 rc = snprintf(out_ptr, out_end - out_ptr, 1581 err = cn_printf(cn, "%s", current->comm);
1530 "%s", current->comm);
1531 if (rc > out_end - out_ptr)
1532 goto out;
1533 out_ptr += rc;
1534 break; 1582 break;
1535 /* core limit size */ 1583 /* core limit size */
1536 case 'c': 1584 case 'c':
1537 rc = snprintf(out_ptr, out_end - out_ptr, 1585 err = cn_printf(cn, "%lu",
1538 "%lu", rlimit(RLIMIT_CORE)); 1586 rlimit(RLIMIT_CORE));
1539 if (rc > out_end - out_ptr)
1540 goto out;
1541 out_ptr += rc;
1542 break; 1587 break;
1543 default: 1588 default:
1544 break; 1589 break;
1545 } 1590 }
1546 ++pat_ptr; 1591 ++pat_ptr;
1547 } 1592 }
1593
1594 if (err)
1595 return err;
1548 } 1596 }
1597
1549 /* Backward compatibility with core_uses_pid: 1598 /* Backward compatibility with core_uses_pid:
1550 * 1599 *
1551 * If core_pattern does not include a %p (as is the default) 1600 * If core_pattern does not include a %p (as is the default)
1552 * and core_uses_pid is set, then .%pid will be appended to 1601 * and core_uses_pid is set, then .%pid will be appended to
1553 * the filename. Do not do this for piped commands. */ 1602 * the filename. Do not do this for piped commands. */
1554 if (!ispipe && !pid_in_pattern && core_uses_pid) { 1603 if (!ispipe && !pid_in_pattern && core_uses_pid) {
1555 rc = snprintf(out_ptr, out_end - out_ptr, 1604 err = cn_printf(cn, ".%d", task_tgid_vnr(current));
1556 ".%d", task_tgid_vnr(current)); 1605 if (err)
1557 if (rc > out_end - out_ptr) 1606 return err;
1558 goto out;
1559 out_ptr += rc;
1560 } 1607 }
1561out: 1608out:
1562 *out_ptr = 0;
1563 return ispipe; 1609 return ispipe;
1564} 1610}
1565 1611
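Throughout this hunk, the fixed-buffer snprintf()/out_ptr arithmetic on the left-hand side is replaced by cn_printf() against a heap-backed struct core_name, so an over-long expansion grows the buffer instead of silently truncating at CORENAME_MAX_SIZE. The helper itself is defined earlier in the patch, outside the hunks shown here; the sketch below is only a reconstruction consistent with the call sites above — the struct layout and the expand_corename() growth strategy are assumptions, not quoted from the patch.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <stdarg.h>

/* Assumed layout: a growable corename buffer with fill/size counters.
 * cn->size is assumed to start non-zero (e.g. CORENAME_MAX_SIZE). */
struct core_name {
	char *corename;
	int used, size;
};

static int expand_corename(struct core_name *cn)
{
	/* Assumption: double the allocation, keeping the old contents.
	 * krealloc() leaves the old buffer valid on failure; the caller
	 * kfree()s cn->corename on every exit path (see fail_unlock). */
	char *new = krealloc(cn->corename, cn->size * 2, GFP_KERNEL);

	if (!new)
		return -ENOMEM;
	cn->corename = new;
	cn->size *= 2;
	return 0;
}

static int cn_printf(struct core_name *cn, const char *fmt, ...)
{
	va_list arg;
	int need, ret;

	/* Measure first: vsnprintf(NULL, 0, ...) returns the needed length. */
	va_start(arg, fmt);
	need = vsnprintf(NULL, 0, fmt, arg);
	va_end(arg);

	while (need >= cn->size - cn->used) {
		ret = expand_corename(cn);
		if (ret)
			return ret;	/* -ENOMEM surfaces via format_corename() */
	}

	va_start(arg, fmt);
	vsnprintf(cn->corename + cn->used, need + 1, fmt, arg);
	va_end(arg);
	cn->used += need;
	return 0;
}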
@@ -1836,7 +1882,7 @@ static int umh_pipe_setup(struct subprocess_info *info)
1836void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1882void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1837{ 1883{
1838 struct core_state core_state; 1884 struct core_state core_state;
1839 char corename[CORENAME_MAX_SIZE + 1]; 1885 struct core_name cn;
1840 struct mm_struct *mm = current->mm; 1886 struct mm_struct *mm = current->mm;
1841 struct linux_binfmt * binfmt; 1887 struct linux_binfmt * binfmt;
1842 const struct cred *old_cred; 1888 const struct cred *old_cred;
@@ -1891,13 +1937,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1891 */ 1937 */
1892 clear_thread_flag(TIF_SIGPENDING); 1938 clear_thread_flag(TIF_SIGPENDING);
1893 1939
1894 /* 1940 ispipe = format_corename(&cn, signr);
1895 * lock_kernel() because format_corename() is controlled by sysctl, which 1941
1896 * uses lock_kernel() 1942 if (ispipe == -ENOMEM) {
1897 */ 1943 printk(KERN_WARNING "format_corename failed\n");
1898 lock_kernel(); 1944 printk(KERN_WARNING "Aborting core\n");
1899 ispipe = format_corename(corename, signr); 1945 goto fail_corename;
1900 unlock_kernel(); 1946 }
1901 1947
1902 if (ispipe) { 1948 if (ispipe) {
1903 int dump_count; 1949 int dump_count;
@@ -1934,7 +1980,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1934 goto fail_dropcount; 1980 goto fail_dropcount;
1935 } 1981 }
1936 1982
1937 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); 1983 helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
1938 if (!helper_argv) { 1984 if (!helper_argv) {
1939 printk(KERN_WARNING "%s failed to allocate memory\n", 1985 printk(KERN_WARNING "%s failed to allocate memory\n",
1940 __func__); 1986 __func__);
@@ -1947,7 +1993,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1947 argv_free(helper_argv); 1993 argv_free(helper_argv);
1948 if (retval) { 1994 if (retval) {
1949 printk(KERN_INFO "Core dump to %s pipe failed\n", 1995 printk(KERN_INFO "Core dump to %s pipe failed\n",
1950 corename); 1996 cn.corename);
1951 goto close_fail; 1997 goto close_fail;
1952 } 1998 }
1953 } else { 1999 } else {
@@ -1956,7 +2002,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1956 if (cprm.limit < binfmt->min_coredump) 2002 if (cprm.limit < binfmt->min_coredump)
1957 goto fail_unlock; 2003 goto fail_unlock;
1958 2004
1959 cprm.file = filp_open(corename, 2005 cprm.file = filp_open(cn.corename,
1960 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 2006 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1961 0600); 2007 0600);
1962 if (IS_ERR(cprm.file)) 2008 if (IS_ERR(cprm.file))
@@ -1998,6 +2044,8 @@ fail_dropcount:
1998 if (ispipe) 2044 if (ispipe)
1999 atomic_dec(&core_dump_count); 2045 atomic_dec(&core_dump_count);
2000fail_unlock: 2046fail_unlock:
2047 kfree(cn.corename);
2048fail_corename:
2001 coredump_finish(mm); 2049 coredump_finish(mm);
2002 revert_creds(old_cred); 2050 revert_creds(old_cred);
2003fail_creds: 2051fail_creds:
@@ -2005,3 +2053,43 @@ fail_creds:
2005fail: 2053fail:
2006 return; 2054 return;
2007} 2055}
2056
2057/*
2058 * Core dumping helper functions. These are the only things you should
2059 * do on a core-file: use only these functions to write out all the
2060 * necessary info.
2061 */
2062int dump_write(struct file *file, const void *addr, int nr)
2063{
2064 return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
2065}
2066EXPORT_SYMBOL(dump_write);
2067
2068int dump_seek(struct file *file, loff_t off)
2069{
2070 int ret = 1;
2071
2072 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
2073 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
2074 return 0;
2075 } else {
2076 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
2077
2078 if (!buf)
2079 return 0;
2080 while (off > 0) {
2081 unsigned long n = off;
2082
2083 if (n > PAGE_SIZE)
2084 n = PAGE_SIZE;
2085 if (!dump_write(file, buf, n)) {
2086 ret = 0;
2087 break;
2088 }
2089 off -= n;
2090 }
2091 free_page((unsigned long)buf);
2092 }
2093 return ret;
2094}
2095EXPORT_SYMBOL(dump_seek);
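Both exported helpers return nonzero on success and 0 on failure. As a hedged usage sketch — the function and its arguments are illustrative, not from the patch — a binfmt core writer would combine them like this: emit a bounded record, then leave a hole, letting dump_seek() seek when ->llseek works and fall back to writing zero-filled pages on non-seekable targets such as a pipe to a helper.

#include <linux/fs.h>

/* Illustrative only: emit 'len' bytes of 'rec', then skip a 'gap' hole. */
static int emit_record_and_gap(struct file *file, const void *rec, int len,
			       loff_t gap)
{
	if (!dump_write(file, rec, len))	/* checked, bounded write */
		return 0;
	/* Seeks when possible, else writes zeroed pages to fill the gap. */
	return dump_seek(file, gap);
}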
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d91e9d829bc..dcc941d82d6 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
420 err = exofs_write_begin(NULL, page->mapping, pos, len, 420 err = exofs_write_begin(NULL, page->mapping, pos, len,
421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
422 if (err) 422 if (err)
423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n", 423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
424 err); 424 err);
425 425
426 de->inode_no = cpu_to_le64(inode->i_ino); 426 de->inode_no = cpu_to_le64(inode->i_ino);
@@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, 556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
557 &page, NULL); 557 &page, NULL);
558 if (err) 558 if (err)
559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n", 559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
560 err); 560 err);
561 if (pde) 561 if (pde)
562 pde->rec_len = cpu_to_le16(to - from); 562 pde->rec_len = cpu_to_le16(to - from);
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 22721b2fd89..2dc925fa101 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -256,7 +256,6 @@ static inline int exofs_oi_read(struct exofs_i_info *oi,
256} 256}
257 257
258/* inode.c */ 258/* inode.c */
259void exofs_truncate(struct inode *inode);
260int exofs_setattr(struct dentry *, struct iattr *); 259int exofs_setattr(struct dentry *, struct iattr *);
261int exofs_write_begin(struct file *file, struct address_space *mapping, 260int exofs_write_begin(struct file *file, struct address_space *mapping,
262 loff_t pos, unsigned len, unsigned flags, 261 loff_t pos, unsigned len, unsigned flags,
@@ -264,7 +263,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
264extern struct inode *exofs_iget(struct super_block *, unsigned long); 263extern struct inode *exofs_iget(struct super_block *, unsigned long);
265struct inode *exofs_new_inode(struct inode *, int); 264struct inode *exofs_new_inode(struct inode *, int);
266extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); 265extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
267extern void exofs_delete_inode(struct inode *); 266extern void exofs_evict_inode(struct inode *);
268 267
269/* dir.c: */ 268/* dir.c: */
270int exofs_add_link(struct dentry *, struct inode *); 269int exofs_add_link(struct dentry *, struct inode *);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index fef6899be39..b905c79b4f0 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -30,9 +30,6 @@
30 * along with exofs; if not, write to the Free Software 30 * along with exofs; if not, write to the Free Software
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33
34#include <linux/buffer_head.h>
35
36#include "exofs.h" 33#include "exofs.h"
37 34
38static int exofs_release_file(struct inode *inode, struct file *filp) 35static int exofs_release_file(struct inode *inode, struct file *filp)
@@ -40,19 +37,23 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 37 return 0;
41} 38}
42 39
40/* exofs_file_fsync - flush the inode to disk
41 *
42 * Note, in exofs all metadata is written as part of inode, regardless.
43 * The writeout is synchronous
44 */
43static int exofs_file_fsync(struct file *filp, int datasync) 45static int exofs_file_fsync(struct file *filp, int datasync)
44{ 46{
45 int ret; 47 int ret;
46 struct address_space *mapping = filp->f_mapping; 48 struct inode *inode = filp->f_mapping->host;
47 struct inode *inode = mapping->host;
48 struct super_block *sb; 49 struct super_block *sb;
49 50
50 ret = filemap_write_and_wait(mapping); 51 if (!(inode->i_state & I_DIRTY))
51 if (ret) 52 return 0;
52 return ret; 53 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
54 return 0;
53 55
54 /* sync the inode attributes */ 56 ret = sync_inode_metadata(inode, 1);
55 ret = write_inode_now(inode, 1);
56 57
57 /* This is a good place to write the sb */ 58 /* This is a good place to write the sb */
58 /* TODO: Sechedule an sb-sync on create */ 59 /* TODO: Sechedule an sb-sync on create */
@@ -65,9 +66,9 @@ static int exofs_file_fsync(struct file *filp, int datasync)
65 66
66static int exofs_flush(struct file *file, fl_owner_t id) 67static int exofs_flush(struct file *file, fl_owner_t id)
67{ 68{
68 exofs_file_fsync(file, 1); 69 int ret = vfs_fsync(file, 0);
69 /* TODO: Flush the OSD target */ 70 /* TODO: Flush the OSD target */
70 return 0; 71 return ret;
71} 72}
72 73
73const struct file_operations exofs_file_operations = { 74const struct file_operations exofs_file_operations = {
@@ -86,6 +87,5 @@ const struct file_operations exofs_file_operations = {
86}; 87};
87 88
88const struct inode_operations exofs_file_inode_operations = { 89const struct inode_operations exofs_file_inode_operations = {
89 .truncate = exofs_truncate,
90 .setattr = exofs_setattr, 90 .setattr = exofs_setattr,
91}; 91};
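The rewritten exofs_file_fsync() above drops the filemap_write_and_wait()/write_inode_now() pair in favour of an early-out on clean inodes plus one synchronous metadata writeout (the VFS has already flushed data pages by the time ->fsync runs), and exofs_flush() now funnels through vfs_fsync() so both paths share the same checks. The shape of that fast path, as a hedged standalone sketch using the 2.6.36-era i_state flags:

#include <linux/fs.h>

static int fsync_fastpath(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;

	if (!(inode->i_state & I_DIRTY))
		return 0;	/* nothing dirty at all: no I/O needed */
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return 0;	/* fdatasync may skip pure timestamp updates */

	/* Synchronous inode writeout; data pages were handled by the VFS. */
	return sync_inode_metadata(inode, 1);
}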
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 4bb6ef822e4..42685424817 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -32,9 +32,6 @@
32 */ 32 */
33 33
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/writeback.h>
36#include <linux/buffer_head.h>
37#include <scsi/scsi_device.h>
38 35
39#include "exofs.h" 36#include "exofs.h"
40 37
@@ -57,6 +54,9 @@ struct page_collect {
57 unsigned nr_pages; 54 unsigned nr_pages;
58 unsigned long length; 55 unsigned long length;
59 loff_t pg_first; /* keep 64bit also in 32-arches */ 56 loff_t pg_first; /* keep 64bit also in 32-arches */
57 bool read_4_write; /* This means two things: that the read is sync
58 * And the pages should not be unlocked.
59 */
60}; 60};
61 61
62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -74,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
74 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
75 pcol->length = 0; 75 pcol->length = 0;
76 pcol->pg_first = -1; 76 pcol->pg_first = -1;
77 pcol->read_4_write = false;
77} 78}
78 79
79static void _pcol_reset(struct page_collect *pcol) 80static void _pcol_reset(struct page_collect *pcol)
@@ -184,7 +185,7 @@ static void update_write_page(struct page *page, int ret)
184/* Called at the end of reads, to optionally unlock pages and update their 185/* Called at the end of reads, to optionally unlock pages and update their
185 * status. 186 * status.
186 */ 187 */
187static int __readpages_done(struct page_collect *pcol, bool do_unlock) 188static int __readpages_done(struct page_collect *pcol)
188{ 189{
189 int i; 190 int i;
190 u64 resid; 191 u64 resid;
@@ -220,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
220 page_stat ? "bad_bytes" : "good_bytes"); 221 page_stat ? "bad_bytes" : "good_bytes");
221 222
222 ret = update_read_page(page, page_stat); 223 ret = update_read_page(page, page_stat);
223 if (do_unlock) 224 if (!pcol->read_4_write)
224 unlock_page(page); 225 unlock_page(page);
225 length += PAGE_SIZE; 226 length += PAGE_SIZE;
226 } 227 }
@@ -235,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
235{ 236{
236 struct page_collect *pcol = p; 237 struct page_collect *pcol = p;
237 238
238 __readpages_done(pcol, true); 239 __readpages_done(pcol);
239 atomic_dec(&pcol->sbi->s_curr_pending); 240 atomic_dec(&pcol->sbi->s_curr_pending);
240 kfree(pcol); 241 kfree(pcol);
241} 242}
@@ -256,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
256 } 257 }
257} 258}
258 259
259static int read_exec(struct page_collect *pcol, bool is_sync) 260static int read_exec(struct page_collect *pcol)
260{ 261{
261 struct exofs_i_info *oi = exofs_i(pcol->inode); 262 struct exofs_i_info *oi = exofs_i(pcol->inode);
262 struct exofs_io_state *ios = pcol->ios; 263 struct exofs_io_state *ios = pcol->ios;
@@ -266,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
266 if (!pcol->pages) 267 if (!pcol->pages)
267 return 0; 268 return 0;
268 269
269 /* see comment in _readpage() about sync reads */
270 WARN_ON(is_sync && (pcol->nr_pages != 1));
271
272 ios->pages = pcol->pages; 270 ios->pages = pcol->pages;
273 ios->nr_pages = pcol->nr_pages; 271 ios->nr_pages = pcol->nr_pages;
274 ios->length = pcol->length; 272 ios->length = pcol->length;
275 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; 273 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
276 274
277 if (is_sync) { 275 if (pcol->read_4_write) {
278 exofs_oi_read(oi, pcol->ios); 276 exofs_oi_read(oi, pcol->ios);
279 return __readpages_done(pcol, false); 277 return __readpages_done(pcol);
280 } 278 }
281 279
282 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 280 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -302,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
302 return 0; 300 return 0;
303 301
304err: 302err:
305 if (!is_sync) 303 if (!pcol->read_4_write)
306 _unlock_pcol_pages(pcol, ret, READ); 304 _unlock_pcol_pages(pcol, ret, READ);
307 305
308 pcol_free(pcol); 306 pcol_free(pcol);
@@ -350,11 +348,12 @@ static int readpage_strip(void *data, struct page *page)
350 if (PageError(page)) 348 if (PageError(page))
351 ClearPageError(page); 349 ClearPageError(page);
352 350
353 unlock_page(page); 351 if (!pcol->read_4_write)
352 unlock_page(page);
354 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 353 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
355 " splitting\n", inode->i_ino, page->index); 354 " splitting\n", inode->i_ino, page->index);
356 355
357 return read_exec(pcol, false); 356 return read_exec(pcol);
358 } 357 }
359 358
360try_again: 359try_again:
@@ -364,7 +363,7 @@ try_again:
364 } else if (unlikely((pcol->pg_first + pcol->nr_pages) != 363 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
365 page->index)) { 364 page->index)) {
366 /* Discontinuity detected, split the request */ 365 /* Discontinuity detected, split the request */
367 ret = read_exec(pcol, false); 366 ret = read_exec(pcol);
368 if (unlikely(ret)) 367 if (unlikely(ret))
369 goto fail; 368 goto fail;
370 goto try_again; 369 goto try_again;
@@ -389,7 +388,7 @@ try_again:
389 page, len, pcol->nr_pages, pcol->length); 388 page, len, pcol->nr_pages, pcol->length);
390 389
391 /* split the request, and start again with current page */ 390 /* split the request, and start again with current page */
392 ret = read_exec(pcol, false); 391 ret = read_exec(pcol);
393 if (unlikely(ret)) 392 if (unlikely(ret))
394 goto fail; 393 goto fail;
395 394
@@ -418,26 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
418 return ret; 417 return ret;
419 } 418 }
420 419
421 return read_exec(&pcol, false); 420 return read_exec(&pcol);
422} 421}
423 422
424static int _readpage(struct page *page, bool is_sync) 423static int _readpage(struct page *page, bool read_4_write)
425{ 424{
426 struct page_collect pcol; 425 struct page_collect pcol;
427 int ret; 426 int ret;
428 427
429 _pcol_init(&pcol, 1, page->mapping->host); 428 _pcol_init(&pcol, 1, page->mapping->host);
430 429
431 /* readpage_strip might call read_exec(,is_sync==false) at several 430 pcol.read_4_write = read_4_write;
432 * places but not if we have a single page.
433 */
434 ret = readpage_strip(&pcol, page); 431 ret = readpage_strip(&pcol, page);
435 if (ret) { 432 if (ret) {
436 EXOFS_ERR("_readpage => %d\n", ret); 433 EXOFS_ERR("_readpage => %d\n", ret);
437 return ret; 434 return ret;
438 } 435 }
439 436
440 return read_exec(&pcol, is_sync); 437 return read_exec(&pcol);
441} 438}
442 439
443/* 440/*
@@ -508,7 +505,7 @@ static int write_exec(struct page_collect *pcol)
508 505
509 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
510 if (!pcol_copy) { 507 if (!pcol_copy) {
511 EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); 508 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
512 ret = -ENOMEM; 509 ret = -ENOMEM;
513 goto err; 510 goto err;
514 } 511 }
@@ -524,7 +521,7 @@ static int write_exec(struct page_collect *pcol)
524 521
525 ret = exofs_oi_write(oi, ios); 522 ret = exofs_oi_write(oi, ios);
526 if (unlikely(ret)) { 523 if (unlikely(ret)) {
527 EXOFS_ERR("write_exec: exofs_oi_write() Faild\n"); 524 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
528 goto err; 525 goto err;
529 } 526 }
530 527
@@ -625,7 +622,7 @@ try_again:
625 /* split the request, next loop will start again */ 622 /* split the request, next loop will start again */
626 ret = write_exec(pcol); 623 ret = write_exec(pcol);
627 if (unlikely(ret)) { 624 if (unlikely(ret)) {
628 EXOFS_DBGMSG("write_exec faild => %d", ret); 625 EXOFS_DBGMSG("write_exec failed => %d", ret);
629 goto fail; 626 goto fail;
630 } 627 }
631 628
@@ -697,6 +694,13 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
697 return write_exec(&pcol); 694 return write_exec(&pcol);
698} 695}
699 696
697/* i_mutex held using inode->i_size directly */
698static void _write_failed(struct inode *inode, loff_t to)
699{
700 if (to > inode->i_size)
701 truncate_pagecache(inode, to, inode->i_size);
702}
703
700int exofs_write_begin(struct file *file, struct address_space *mapping, 704int exofs_write_begin(struct file *file, struct address_space *mapping,
701 loff_t pos, unsigned len, unsigned flags, 705 loff_t pos, unsigned len, unsigned flags,
702 struct page **pagep, void **fsdata) 706 struct page **pagep, void **fsdata)
@@ -709,8 +713,8 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
709 ret = simple_write_begin(file, mapping, pos, len, flags, pagep, 713 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
710 fsdata); 714 fsdata);
711 if (ret) { 715 if (ret) {
712 EXOFS_DBGMSG("simple_write_begin faild\n"); 716 EXOFS_DBGMSG("simple_write_begin failed\n");
713 return ret; 717 goto out;
714 } 718 }
715 719
716 page = *pagep; 720 page = *pagep;
@@ -722,9 +726,12 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
722 if (ret) { 726 if (ret) {
723 /*SetPageError was done by _readpage. Is it ok?*/ 727 /*SetPageError was done by _readpage. Is it ok?*/
724 unlock_page(page); 728 unlock_page(page);
725 EXOFS_DBGMSG("__readpage_filler faild\n"); 729 EXOFS_DBGMSG("__readpage_filler failed\n");
726 } 730 }
727 } 731 }
732out:
733 if (unlikely(ret))
734 _write_failed(mapping->host, pos + len);
728 735
729 return ret; 736 return ret;
730} 737}
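exofs_write_begin() now routes every failure through a single exit that calls _write_failed(), so pagecache instantiated past the old EOF by a failed write gets trimmed back; exofs_write_end() (next hunk) gains the same cleanup. The pattern in isolation — the three-argument truncate_pagecache() is the 2.6.36-era signature:

#include <linux/fs.h>
#include <linux/mm.h>

/* Trim pagecache a failed write may have instantiated beyond i_size. */
static void write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	if (to > inode->i_size)
		truncate_pagecache(inode, to, inode->i_size);
}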
@@ -750,6 +757,10 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
750 int ret; 757 int ret;
751 758
752 ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); 759 ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata);
760 if (unlikely(ret))
761 _write_failed(inode, pos + len);
762
763 /* TODO: once simple_write_end marks inode dirty remove */
753 if (i_size != inode->i_size) 764 if (i_size != inode->i_size)
754 mark_inode_dirty(inode); 765 mark_inode_dirty(inode);
755 return ret; 766 return ret;
@@ -759,15 +770,13 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
759{ 770{
760 EXOFS_DBGMSG("page 0x%lx\n", page->index); 771 EXOFS_DBGMSG("page 0x%lx\n", page->index);
761 WARN_ON(1); 772 WARN_ON(1);
762 return try_to_free_buffers(page); 773 return 0;
763} 774}
764 775
765static void exofs_invalidatepage(struct page *page, unsigned long offset) 776static void exofs_invalidatepage(struct page *page, unsigned long offset)
766{ 777{
767 EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page)); 778 EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
768 WARN_ON(1); 779 WARN_ON(1);
769
770 block_invalidatepage(page, offset);
771} 780}
772 781
773const struct address_space_operations exofs_aops = { 782const struct address_space_operations exofs_aops = {
@@ -808,87 +817,55 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
808 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); 817 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
809} 818}
810 819
811/*
812 * get_block_t - Fill in a buffer_head
813 * An OSD takes care of block allocation so we just fake an allocation by
814 * putting in the inode's sector_t in the buffer_head.
815 * TODO: What about the case of create==0 and @iblock does not exist in the
816 * object?
817 */
818static int exofs_get_block(struct inode *inode, sector_t iblock,
819 struct buffer_head *bh_result, int create)
820{
821 map_bh(bh_result, inode->i_sb, iblock);
822 return 0;
823}
824
825const struct osd_attr g_attr_logical_length = ATTR_DEF( 820const struct osd_attr g_attr_logical_length = ATTR_DEF(
826 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 821 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
827 822
828static int _do_truncate(struct inode *inode) 823static int _do_truncate(struct inode *inode, loff_t newsize)
829{ 824{
830 struct exofs_i_info *oi = exofs_i(inode); 825 struct exofs_i_info *oi = exofs_i(inode);
831 loff_t isize = i_size_read(inode);
832 int ret; 826 int ret;
833 827
834 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 828 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
835 829
836 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); 830 ret = exofs_oi_truncate(oi, (u64)newsize);
831 if (likely(!ret))
832 truncate_setsize(inode, newsize);
837 833
838 ret = exofs_oi_truncate(oi, (u64)isize); 834 EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n",
839 EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize); 835 inode->i_ino, newsize, ret);
840 return ret; 836 return ret;
841} 837}
842 838
843/* 839/*
844 * Truncate a file to the specified size - all we have to do is set the size 840 * Set inode attributes - update size attribute on OSD if needed,
845 * attribute. We make sure the object exists first. 841 * otherwise just call generic functions.
846 */
847void exofs_truncate(struct inode *inode)
848{
849 struct exofs_i_info *oi = exofs_i(inode);
850 int ret;
851
852 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
853 || S_ISLNK(inode->i_mode)))
854 return;
855 if (exofs_inode_is_fast_symlink(inode))
856 return;
857 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
858 return;
859
860 /* if we are about to truncate an object, and it hasn't been
861 * created yet, wait
862 */
863 if (unlikely(wait_obj_created(oi)))
864 goto fail;
865
866 ret = _do_truncate(inode);
867 if (ret)
868 goto fail;
869
870out:
871 mark_inode_dirty(inode);
872 return;
873fail:
874 make_bad_inode(inode);
875 goto out;
876}
877
878/*
879 * Set inode attributes - just call generic functions.
880 */ 842 */
881int exofs_setattr(struct dentry *dentry, struct iattr *iattr) 843int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
882{ 844{
883 struct inode *inode = dentry->d_inode; 845 struct inode *inode = dentry->d_inode;
884 int error; 846 int error;
885 847
848 /* if we are about to modify an object, and it hasn't been
849 * created yet, wait
850 */
851 error = wait_obj_created(exofs_i(inode));
852 if (unlikely(error))
853 return error;
854
886 error = inode_change_ok(inode, iattr); 855 error = inode_change_ok(inode, iattr);
887 if (error) 856 if (unlikely(error))
888 return error; 857 return error;
889 858
890 error = inode_setattr(inode, iattr); 859 if ((iattr->ia_valid & ATTR_SIZE) &&
891 return error; 860 iattr->ia_size != i_size_read(inode)) {
861 error = _do_truncate(inode, iattr->ia_size);
862 if (unlikely(error))
863 return error;
864 }
865
866 setattr_copy(inode, iattr);
867 mark_inode_dirty(inode);
868 return 0;
892} 869}
893 870
894static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( 871static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
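With ->truncate gone from exofs_file_inode_operations, size changes are now driven entirely from ->setattr: wait for the object, validate, perform the on-disk truncate, then setattr_copy() + mark_inode_dirty() instead of the removed inode_setattr(). The generic shape of this post-2.6.36 truncate sequence, as a hedged template (a real filesystem does its on-disk work where the comment sits):

#include <linux/fs.h>
#include <linux/mm.h>

static int setattr_template(struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, iattr);
	if (error)
		return error;

	if ((iattr->ia_valid & ATTR_SIZE) &&
	    iattr->ia_size != i_size_read(inode)) {
		/* fs-specific on-disk truncate goes here, then: */
		truncate_setsize(inode, iattr->ia_size);
	}

	setattr_copy(inode, iattr);
	mark_inode_dirty(inode);
	return 0;
}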
@@ -1053,6 +1030,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1053 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); 1030 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
1054 } 1031 }
1055 1032
1033 inode->i_mapping->backing_dev_info = sb->s_bdi;
1056 if (S_ISREG(inode->i_mode)) { 1034 if (S_ISREG(inode->i_mode)) {
1057 inode->i_op = &exofs_file_inode_operations; 1035 inode->i_op = &exofs_file_inode_operations;
1058 inode->i_fop = &exofs_file_operations; 1036 inode->i_fop = &exofs_file_operations;
@@ -1089,8 +1067,10 @@ bad_inode:
1089int __exofs_wait_obj_created(struct exofs_i_info *oi) 1067int __exofs_wait_obj_created(struct exofs_i_info *oi)
1090{ 1068{
1091 if (!obj_created(oi)) { 1069 if (!obj_created(oi)) {
1070 EXOFS_DBGMSG("!obj_created\n");
1092 BUG_ON(!obj_2bcreated(oi)); 1071 BUG_ON(!obj_2bcreated(oi));
1093 wait_event(oi->i_wq, obj_created(oi)); 1072 wait_event(oi->i_wq, obj_created(oi));
1073 EXOFS_DBGMSG("wait_event done\n");
1094 } 1074 }
1095 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1075 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1096} 1076}
@@ -1112,7 +1092,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
1112 atomic_dec(&sbi->s_curr_pending); 1092 atomic_dec(&sbi->s_curr_pending);
1113 1093
1114 if (unlikely(ret)) { 1094 if (unlikely(ret)) {
1115 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", 1095 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
1116 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1096 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
1117 /*TODO: When FS is corrupted creation can fail, object already 1097 /*TODO: When FS is corrupted creation can fail, object already
1118 * exist. Get rid of this asynchronous creation, if exist 1098 * exist. Get rid of this asynchronous creation, if exist
@@ -1124,7 +1104,6 @@ static void create_done(struct exofs_io_state *ios, void *p)
1124 1104
1125 set_obj_created(oi); 1105 set_obj_created(oi);
1126 1106
1127 atomic_dec(&inode->i_count);
1128 wake_up(&oi->i_wq); 1107 wake_up(&oi->i_wq);
1129} 1108}
1130 1109
@@ -1152,6 +1131,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1152 1131
1153 sbi = sb->s_fs_info; 1132 sbi = sb->s_fs_info;
1154 1133
1134 inode->i_mapping->backing_dev_info = sb->s_bdi;
1155 sb->s_dirt = 1; 1135 sb->s_dirt = 1;
1156 inode_init_owner(inode, dir, mode); 1136 inode_init_owner(inode, dir, mode);
1157 inode->i_ino = sbi->s_nextid++; 1137 inode->i_ino = sbi->s_nextid++;
@@ -1174,17 +1154,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1174 ios->obj.id = exofs_oi_objno(oi); 1154 ios->obj.id = exofs_oi_objno(oi);
1175 exofs_make_credential(oi->i_cred, &ios->obj); 1155 exofs_make_credential(oi->i_cred, &ios->obj);
1176 1156
1177 /* increment the refcount so that the inode will still be around when we
1178 * reach the callback
1179 */
1180 atomic_inc(&inode->i_count);
1181
1182 ios->done = create_done; 1157 ios->done = create_done;
1183 ios->private = inode; 1158 ios->private = inode;
1184 ios->cred = oi->i_cred; 1159 ios->cred = oi->i_cred;
1185 ret = exofs_sbi_create(ios); 1160 ret = exofs_sbi_create(ios);
1186 if (ret) { 1161 if (ret) {
1187 atomic_dec(&inode->i_count);
1188 exofs_put_io_state(ios); 1162 exofs_put_io_state(ios);
1189 return ERR_PTR(ret); 1163 return ERR_PTR(ret);
1190 } 1164 }
@@ -1232,7 +1206,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1232 1206
1233 args = kzalloc(sizeof(*args), GFP_KERNEL); 1207 args = kzalloc(sizeof(*args), GFP_KERNEL);
1234 if (!args) { 1208 if (!args) {
1235 EXOFS_DBGMSG("Faild kzalloc of args\n"); 1209 EXOFS_DBGMSG("Failed kzalloc of args\n");
1236 return -ENOMEM; 1210 return -ENOMEM;
1237 } 1211 }
1238 1212
@@ -1274,12 +1248,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1274 ios->out_attr_len = 1; 1248 ios->out_attr_len = 1;
1275 ios->out_attr = &attr; 1249 ios->out_attr = &attr;
1276 1250
1277 if (!obj_created(oi)) { 1251 wait_obj_created(oi);
1278 EXOFS_DBGMSG("!obj_created\n");
1279 BUG_ON(!obj_2bcreated(oi));
1280 wait_event(oi->i_wq, obj_created(oi));
1281 EXOFS_DBGMSG("wait_event done\n");
1282 }
1283 1252
1284 if (!do_sync) { 1253 if (!do_sync) {
1285 args->sbi = sbi; 1254 args->sbi = sbi;
@@ -1325,7 +1294,7 @@ static void delete_done(struct exofs_io_state *ios, void *p)
1325 * from the OSD here. We make sure the object was created before we try and 1294 * from the OSD here. We make sure the object was created before we try and
1326 * delete it. 1295 * delete it.
1327 */ 1296 */
1328void exofs_delete_inode(struct inode *inode) 1297void exofs_evict_inode(struct inode *inode)
1329{ 1298{
1330 struct exofs_i_info *oi = exofs_i(inode); 1299 struct exofs_i_info *oi = exofs_i(inode);
1331 struct super_block *sb = inode->i_sb; 1300 struct super_block *sb = inode->i_sb;
@@ -1335,30 +1304,27 @@ void exofs_delete_inode(struct inode *inode)
1335 1304
1336 truncate_inode_pages(&inode->i_data, 0); 1305 truncate_inode_pages(&inode->i_data, 0);
1337 1306
1338 if (is_bad_inode(inode)) 1307 /* TODO: should do better here */
1308 if (inode->i_nlink || is_bad_inode(inode))
1339 goto no_delete; 1309 goto no_delete;
1340 1310
1341 mark_inode_dirty(inode);
1342 exofs_update_inode(inode, inode_needs_sync(inode));
1343
1344 inode->i_size = 0; 1311 inode->i_size = 0;
1345 if (inode->i_blocks) 1312 end_writeback(inode);
1346 exofs_truncate(inode);
1347 1313
1348 clear_inode(inode); 1314 /* if we are deleting an obj that hasn't been created yet, wait.
1315 * This also makes sure that create_done cannot be called with an
1316 * already evicted inode.
1317 */
1318 wait_obj_created(oi);
1319 /* ignore the error, attempt a remove anyway */
1349 1320
1321 /* Now Remove the OSD objects */
1350 ret = exofs_get_io_state(&sbi->layout, &ios); 1322 ret = exofs_get_io_state(&sbi->layout, &ios);
1351 if (unlikely(ret)) { 1323 if (unlikely(ret)) {
1352 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1324 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
1353 return; 1325 return;
1354 } 1326 }
1355 1327
1356 /* if we are deleting an obj that hasn't been created yet, wait */
1357 if (!obj_created(oi)) {
1358 BUG_ON(!obj_2bcreated(oi));
1359 wait_event(oi->i_wq, obj_created(oi));
1360 }
1361
1362 ios->obj.id = exofs_oi_objno(oi); 1328 ios->obj.id = exofs_oi_objno(oi);
1363 ios->done = delete_done; 1329 ios->done = delete_done;
1364 ios->private = sbi; 1330 ios->private = sbi;
@@ -1374,5 +1340,5 @@ void exofs_delete_inode(struct inode *inode)
1374 return; 1340 return;
1375 1341
1376no_delete: 1342no_delete:
1377 clear_inode(inode); 1343 end_writeback(inode);
1378} 1344}
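The delete_inode → evict_inode conversion above changes the contract: ->evict_inode() runs for every inode at final iput(), deletion is keyed off i_nlink rather than a separate callback, and end_writeback() replaces clear_inode(). A hedged skeleton of the new hook, following the ordering exofs uses above:

#include <linux/fs.h>
#include <linux/mm.h>

static void example_evict_inode(struct inode *inode)
{
	/* Decide up front: unlink-delete vs. plain cache eviction. */
	int want_delete = !inode->i_nlink && !is_bad_inode(inode);

	truncate_inode_pages(&inode->i_data, 0);
	end_writeback(inode);		/* was clear_inode() before 2.6.36 */

	if (want_delete) {
		/* release the on-disk/backing-store objects here */
	}
}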
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 4337cad7777..f74a2ec027a 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -55,7 +55,7 @@ int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
55 55
56 ret = osd_finalize_request(or, 0, cred, NULL); 56 ret = osd_finalize_request(or, 0, cred, NULL);
57 if (unlikely(ret)) { 57 if (unlikely(ret)) {
58 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); 58 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
59 goto out; 59 goto out;
60 } 60 }
61 61
@@ -79,7 +79,7 @@ int exofs_get_io_state(struct exofs_layout *layout,
79 */ 79 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); 80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) { 81 if (unlikely(!ios)) {
82 EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", 82 EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs)); 83 exofs_io_state_size(layout->s_numdevs));
84 *pios = NULL; 84 *pios = NULL;
85 return -ENOMEM; 85 return -ENOMEM;
@@ -172,7 +172,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
172 172
173 ret = osd_finalize_request(or, 0, ios->cred, NULL); 173 ret = osd_finalize_request(or, 0, ios->cred, NULL);
174 if (unlikely(ret)) { 174 if (unlikely(ret)) {
175 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", 175 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret); 176 ret);
177 return ret; 177 return ret;
178 } 178 }
@@ -305,8 +305,6 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
305struct _striping_info { 305struct _striping_info {
306 u64 obj_offset; 306 u64 obj_offset;
307 u64 group_length; 307 u64 group_length;
308 u64 total_group_length;
309 u64 Major;
310 unsigned dev; 308 unsigned dev;
311 unsigned unit_off; 309 unsigned unit_off;
312}; 310};
@@ -343,8 +341,6 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
343 (M * group_depth * stripe_unit); 341 (M * group_depth * stripe_unit);
344 342
345 si->group_length = T - H; 343 si->group_length = T - H;
346 si->total_group_length = T;
347 si->Major = M;
348} 344}
349 345
350static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, 346static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
@@ -365,7 +361,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
365 361
366 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
367 if (unlikely(!per_dev->bio)) { 363 if (unlikely(!per_dev->bio)) {
368 EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", 364 EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
369 bio_size); 365 bio_size);
370 return -ENOMEM; 366 return -ENOMEM;
371 } 367 }
@@ -392,20 +388,19 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
392} 388}
393 389
394static int _prepare_one_group(struct exofs_io_state *ios, u64 length, 390static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
395 struct _striping_info *si, unsigned first_comp) 391 struct _striping_info *si)
396{ 392{
397 unsigned stripe_unit = ios->layout->stripe_unit; 393 unsigned stripe_unit = ios->layout->stripe_unit;
398 unsigned mirrors_p1 = ios->layout->mirrors_p1; 394 unsigned mirrors_p1 = ios->layout->mirrors_p1;
399 unsigned devs_in_group = ios->layout->group_width * mirrors_p1; 395 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
400 unsigned dev = si->dev; 396 unsigned dev = si->dev;
401 unsigned first_dev = dev - (dev % devs_in_group); 397 unsigned first_dev = dev - (dev % devs_in_group);
402 unsigned comp = first_comp + (dev - first_dev);
403 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; 398 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
404 unsigned cur_pg = ios->pages_consumed; 399 unsigned cur_pg = ios->pages_consumed;
405 int ret = 0; 400 int ret = 0;
406 401
407 while (length) { 402 while (length) {
408 struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; 403 struct exofs_per_dev_state *per_dev = &ios->per_dev[dev];
409 unsigned cur_len, page_off = 0; 404 unsigned cur_len, page_off = 0;
410 405
411 if (!per_dev->length) { 406 if (!per_dev->length) {
@@ -424,11 +419,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
424 cur_len = stripe_unit; 419 cur_len = stripe_unit;
425 } 420 }
426 421
427 if (max_comp < comp) 422 if (max_comp < dev)
428 max_comp = comp; 423 max_comp = dev;
429
430 dev += mirrors_p1;
431 dev = (dev % devs_in_group) + first_dev;
432 } else { 424 } else {
433 cur_len = stripe_unit; 425 cur_len = stripe_unit;
434 } 426 }
@@ -440,8 +432,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
440 if (unlikely(ret)) 432 if (unlikely(ret))
441 goto out; 433 goto out;
442 434
443 comp += mirrors_p1; 435 dev += mirrors_p1;
444 comp = (comp % devs_in_group) + first_comp; 436 dev = (dev % devs_in_group) + first_dev;
445 437
446 length -= cur_len; 438 length -= cur_len;
447 } 439 }
@@ -454,18 +446,15 @@ out:
454static int _prepare_for_striping(struct exofs_io_state *ios) 446static int _prepare_for_striping(struct exofs_io_state *ios)
455{ 447{
456 u64 length = ios->length; 448 u64 length = ios->length;
449 u64 offset = ios->offset;
457 struct _striping_info si; 450 struct _striping_info si;
458 unsigned devs_in_group = ios->layout->group_width *
459 ios->layout->mirrors_p1;
460 unsigned first_comp = 0;
461 int ret = 0; 451 int ret = 0;
462 452
463 _calc_stripe_info(ios, ios->offset, &si);
464
465 if (!ios->pages) { 453 if (!ios->pages) {
466 if (ios->kern_buff) { 454 if (ios->kern_buff) {
467 struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; 455 struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
468 456
457 _calc_stripe_info(ios, ios->offset, &si);
469 per_dev->offset = si.obj_offset; 458 per_dev->offset = si.obj_offset;
470 per_dev->dev = si.dev; 459 per_dev->dev = si.dev;
471 460
@@ -479,26 +468,17 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
479 } 468 }
480 469
481 while (length) { 470 while (length) {
471 _calc_stripe_info(ios, offset, &si);
472
482 if (length < si.group_length) 473 if (length < si.group_length)
483 si.group_length = length; 474 si.group_length = length;
484 475
485 ret = _prepare_one_group(ios, si.group_length, &si, first_comp); 476 ret = _prepare_one_group(ios, si.group_length, &si);
486 if (unlikely(ret)) 477 if (unlikely(ret))
487 goto out; 478 goto out;
488 479
480 offset += si.group_length;
489 length -= si.group_length; 481 length -= si.group_length;
490
491 si.group_length = si.total_group_length;
492 si.unit_off = 0;
493 ++si.Major;
494 si.obj_offset = si.Major * ios->layout->stripe_unit *
495 ios->layout->group_depth;
496
497 si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
498 si.dev %= ios->layout->s_numdevs;
499
500 first_comp += devs_in_group;
501 first_comp %= ios->layout->s_numdevs;
502 } 482 }
503 483
504out: 484out:
@@ -584,7 +564,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
584 master_dev->bio->bi_max_vecs); 564 master_dev->bio->bi_max_vecs);
585 if (unlikely(!bio)) { 565 if (unlikely(!bio)) {
586 EXOFS_DBGMSG( 566 EXOFS_DBGMSG(
587 "Faild to allocate BIO size=%u\n", 567 "Failed to allocate BIO size=%u\n",
588 master_dev->bio->bi_max_vecs); 568 master_dev->bio->bi_max_vecs);
589 ret = -ENOMEM; 569 ret = -ENOMEM;
590 goto out; 570 goto out;
@@ -599,7 +579,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
599 } else { 579 } else {
600 bio = master_dev->bio; 580 bio = master_dev->bio;
601 /* FIXME: bio_set_dir() */ 581 /* FIXME: bio_set_dir() */
602 bio->bi_rw |= (1 << BIO_RW); 582 bio->bi_rw |= REQ_WRITE;
603 } 583 }
604 584
605 osd_req_write(or, &ios->obj, per_dev->offset, bio, 585 osd_req_write(or, &ios->obj, per_dev->offset, bio,
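Taken together, the ios.c hunks above replace the hand-advanced _striping_info state (total_group_length, Major, and the dev/first_comp modular arithmetic) with one rule: recompute the mapping from the running file offset on every group. Consolidated, the simplified _prepare_for_striping() loop reads like this sketch of the code above:

/* Consolidated view of the loop in _prepare_for_striping(), file-local
 * helpers as in fs/exofs/ios.c. */
static int striping_loop(struct exofs_io_state *ios)
{
	u64 length = ios->length;
	u64 offset = ios->offset;
	struct _striping_info si;
	int ret = 0;

	while (length) {
		_calc_stripe_info(ios, offset, &si);	/* fresh per group */

		if (length < si.group_length)
			si.group_length = length;

		ret = _prepare_one_group(ios, si.group_length, &si);
		if (unlikely(ret))
			break;

		offset += si.group_length;	/* advance, then remap */
		length -= si.group_length;
	}
	return ret;
}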
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c23686..264e95d0283 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
153 153
154 inode->i_ctime = CURRENT_TIME; 154 inode->i_ctime = CURRENT_TIME;
155 inode_inc_link_count(inode); 155 inode_inc_link_count(inode);
156 atomic_inc(&inode->i_count); 156 ihold(inode);
157 157
158 return exofs_add_nondir(dentry, inode); 158 return exofs_add_nondir(dentry, inode);
159} 159}
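atomic_inc(&inode->i_count) becomes ihold() here, keeping inode refcounting behind one audited helper, while the exofs_new_inode()/create_done() hunks earlier drop their raw i_count manipulation entirely. A hedged sketch of the resulting ->link fast path (error handling elided; the real exofs_add_nondir() does more):

#include <linux/fs.h>
#include <linux/dcache.h>

static int example_link(struct dentry *old_dentry, struct inode *dir,
			struct dentry *dentry)
{
	struct inode *inode = old_dentry->d_inode;

	inode->i_ctime = CURRENT_TIME;
	inode_inc_link_count(inode);
	ihold(inode);			/* reference for the new dentry */

	d_instantiate(dentry, inode);
	return 0;
}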
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 03149b9a517..79c3ae6e045 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -31,7 +31,6 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/smp_lock.h>
35#include <linux/string.h> 34#include <linux/string.h>
36#include <linux/parser.h> 35#include <linux/parser.h>
37#include <linux/vfs.h> 36#include <linux/vfs.h>
@@ -660,19 +659,19 @@ free_bdi:
660/* 659/*
661 * Set up the superblock (calls exofs_fill_super eventually) 660 * Set up the superblock (calls exofs_fill_super eventually)
662 */ 661 */
663static int exofs_get_sb(struct file_system_type *type, 662static struct dentry *exofs_mount(struct file_system_type *type,
664 int flags, const char *dev_name, 663 int flags, const char *dev_name,
665 void *data, struct vfsmount *mnt) 664 void *data)
666{ 665{
667 struct exofs_mountopt opts; 666 struct exofs_mountopt opts;
668 int ret; 667 int ret;
669 668
670 ret = parse_options(data, &opts); 669 ret = parse_options(data, &opts);
671 if (ret) 670 if (ret)
672 return ret; 671 return ERR_PTR(ret);
673 672
674 opts.dev_name = dev_name; 673 opts.dev_name = dev_name;
675 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt); 674 return mount_nodev(type, flags, &opts, exofs_fill_super);
676} 675}
677 676
678/* 677/*
@@ -743,7 +742,7 @@ static const struct super_operations exofs_sops = {
743 .alloc_inode = exofs_alloc_inode, 742 .alloc_inode = exofs_alloc_inode,
744 .destroy_inode = exofs_destroy_inode, 743 .destroy_inode = exofs_destroy_inode,
745 .write_inode = exofs_write_inode, 744 .write_inode = exofs_write_inode,
746 .delete_inode = exofs_delete_inode, 745 .evict_inode = exofs_evict_inode,
747 .put_super = exofs_put_super, 746 .put_super = exofs_put_super,
748 .write_super = exofs_write_super, 747 .write_super = exofs_write_super,
749 .sync_fs = exofs_sync_fs, 748 .sync_fs = exofs_sync_fs,
@@ -810,7 +809,7 @@ static const struct export_operations exofs_export_ops = {
810static struct file_system_type exofs_type = { 809static struct file_system_type exofs_type = {
811 .owner = THIS_MODULE, 810 .owner = THIS_MODULE,
812 .name = "exofs", 811 .name = "exofs",
813 .get_sb = exofs_get_sb, 812 .mount = exofs_mount,
814 .kill_sb = generic_shutdown_super, 813 .kill_sb = generic_shutdown_super,
815}; 814};
816 815
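The .get_sb → .mount conversion changes the signature: instead of filling in a vfsmount and returning int, the handler returns the root dentry (errors travel as ERR_PTR), and mount_nodev() replaces get_sb_nodev(). A minimal hedged template of the new shape; example_fill_super() is a placeholder, not exofs code:

#include <linux/fs.h>
#include <linux/err.h>
#include <linux/module.h>

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	return -EINVAL;			/* real superblock setup goes here */
}

static struct dentry *example_mount(struct file_system_type *type, int flags,
				    const char *dev_name, void *data)
{
	/* option-parsing failures now return ERR_PTR(), not int */
	return mount_nodev(type, flags, data, example_fill_super);
}

static struct file_system_type example_type = {
	.owner	 = THIS_MODULE,
	.name	 = "example",
	.mount	 = example_mount,
	.kill_sb = generic_shutdown_super,
};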
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a6..51b304056f1 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -74,21 +74,20 @@ static struct dentry *
74find_disconnected_root(struct dentry *dentry) 74find_disconnected_root(struct dentry *dentry)
75{ 75{
76 dget(dentry); 76 dget(dentry);
77 spin_lock(&dentry->d_lock); 77 while (!IS_ROOT(dentry)) {
78 while (!IS_ROOT(dentry) && 78 struct dentry *parent = dget_parent(dentry);
79 (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) { 79
80 struct dentry *parent = dentry->d_parent; 80 if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
81 dget(parent); 81 dput(parent);
82 spin_unlock(&dentry->d_lock); 82 break;
83 }
84
83 dput(dentry); 85 dput(dentry);
84 dentry = parent; 86 dentry = parent;
85 spin_lock(&dentry->d_lock);
86 } 87 }
87 spin_unlock(&dentry->d_lock);
88 return dentry; 88 return dentry;
89} 89}
90 90
91
92/* 91/*
93 * Make sure target_dir is fully connected to the dentry tree. 92 * Make sure target_dir is fully connected to the dentry tree.
94 * 93 *
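The rewritten find_disconnected_root() above stops holding d_lock across iterations: dget_parent() takes a stable reference to the parent, so the walk needs no lock at all. The general ancestor-walk pattern, as a hedged sketch (the real code above also stops at the first parent that is not DCACHE_DISCONNECTED):

#include <linux/dcache.h>

/* Walk to the root, holding a reference at every step; caller dput()s. */
static struct dentry *walk_to_root(struct dentry *dentry)
{
	dget(dentry);
	while (!IS_ROOT(dentry)) {
		struct dentry *parent = dget_parent(dentry);	/* stable ref */

		dput(dentry);		/* release the child, keep climbing */
		dentry = parent;
	}
	return dentry;
}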
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e8766a39677..0d06f4e7569 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -571,7 +571,7 @@ do_more:
571error_return: 571error_return:
572 brelse(bitmap_bh); 572 brelse(bitmap_bh);
573 release_blocks(sb, freed); 573 release_blocks(sb, freed);
574 dquot_free_block(inode, freed); 574 dquot_free_block_nodirty(inode, freed);
575} 575}
576 576
577/** 577/**
@@ -646,10 +646,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
646 return here; 646 return here;
647} 647}
648 648
649/* 649/**
650 * ext2_try_to_allocate() 650 * ext2_try_to_allocate()
651 * @sb: superblock 651 * @sb: superblock
652 * @handle: handle to this transaction
653 * @group: given allocation block group 652 * @group: given allocation block group
654 * @bitmap_bh: bufferhead holds the block bitmap 653 * @bitmap_bh: bufferhead holds the block bitmap
655 * @grp_goal: given target block within the group 654 * @grp_goal: given target block within the group
@@ -1418,7 +1417,8 @@ allocated:
1418 1417
1419 *errp = 0; 1418 *errp = 0;
1420 brelse(bitmap_bh); 1419 brelse(bitmap_bh);
1421 dquot_free_block(inode, *count-num); 1420 dquot_free_block_nodirty(inode, *count-num);
1421 mark_inode_dirty(inode);
1422 *count = num; 1422 *count = num;
1423 return ret_block; 1423 return ret_block;
1424 1424
@@ -1428,8 +1428,10 @@ out:
1428 /* 1428 /*
1429 * Undo the block allocation 1429 * Undo the block allocation
1430 */ 1430 */
1431 if (!performed_allocation) 1431 if (!performed_allocation) {
1432 dquot_free_block(inode, *count); 1432 dquot_free_block_nodirty(inode, *count);
1433 mark_inode_dirty(inode);
1434 }
1433 brelse(bitmap_bh); 1435 brelse(bitmap_bh);
1434 return 0; 1436 return 0;
1435} 1437}
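dquot_free_block() is swapped for dquot_free_block_nodirty() plus an explicit mark_inode_dirty() here; the inode.c hunks further down apply the same split, so a batch of frees dirties the inode once, afterwards, rather than implicitly on every call. The pattern in isolation, hedged:

#include <linux/fs.h>
#include <linux/quotaops.h>

/* Release a quota claim without implicitly dirtying, then dirty once. */
static void release_blocks_quota(struct inode *inode, unsigned freed)
{
	dquot_free_block_nodirty(inode, freed);
	if (freed)
		mark_inode_dirty(inode);
}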
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7516957273e..2709b34206a 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
98 if (IS_DIRSYNC(dir)) { 98 if (IS_DIRSYNC(dir)) {
99 err = write_one_page(page, 1); 99 err = write_one_page(page, 1);
100 if (!err) 100 if (!err)
101 err = ext2_sync_inode(dir); 101 err = sync_inode_metadata(dir, 1);
102 } else { 102 } else {
103 unlock_page(page); 103 unlock_page(page);
104 } 104 }
@@ -448,6 +448,11 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
448 return res; 448 return res;
449} 449}
450 450
451static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
452{
453 return __block_write_begin(page, pos, len, ext2_get_block);
454}
455
451/* Releases the page */ 456/* Releases the page */
452void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, 457void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
453 struct page *page, struct inode *inode, int update_times) 458 struct page *page, struct inode *inode, int update_times)
@@ -458,8 +463,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
458 int err; 463 int err;
459 464
460 lock_page(page); 465 lock_page(page);
461 err = __ext2_write_begin(NULL, page->mapping, pos, len, 466 err = ext2_prepare_chunk(page, pos, len);
462 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
463 BUG_ON(err); 467 BUG_ON(err);
464 de->inode = cpu_to_le32(inode->i_ino); 468 de->inode = cpu_to_le32(inode->i_ino);
465 ext2_set_de_type(de, inode); 469 ext2_set_de_type(de, inode);
@@ -542,8 +546,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
542got_it: 546got_it:
543 pos = page_offset(page) + 547 pos = page_offset(page) +
544 (char*)de - (char*)page_address(page); 548 (char*)de - (char*)page_address(page);
545 err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0, 549 err = ext2_prepare_chunk(page, pos, rec_len);
546 &page, NULL);
547 if (err) 550 if (err)
548 goto out_unlock; 551 goto out_unlock;
549 if (de->inode) { 552 if (de->inode) {
@@ -576,8 +579,7 @@ out_unlock:
576 */ 579 */
577int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) 580int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
578{ 581{
579 struct address_space *mapping = page->mapping; 582 struct inode *inode = page->mapping->host;
580 struct inode *inode = mapping->host;
581 char *kaddr = page_address(page); 583 char *kaddr = page_address(page);
582 unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); 584 unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1);
583 unsigned to = ((char *)dir - kaddr) + 585 unsigned to = ((char *)dir - kaddr) +
@@ -601,8 +603,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
601 from = (char*)pde - (char*)page_address(page); 603 from = (char*)pde - (char*)page_address(page);
602 pos = page_offset(page) + from; 604 pos = page_offset(page) + from;
603 lock_page(page); 605 lock_page(page);
604 err = __ext2_write_begin(NULL, page->mapping, pos, to - from, 0, 606 err = ext2_prepare_chunk(page, pos, to - from);
605 &page, NULL);
606 BUG_ON(err); 607 BUG_ON(err);
607 if (pde) 608 if (pde)
608 pde->rec_len = ext2_rec_len_to_disk(to - from); 609 pde->rec_len = ext2_rec_len_to_disk(to - from);
@@ -621,8 +622,7 @@ out:
621 */ 622 */
622int ext2_make_empty(struct inode *inode, struct inode *parent) 623int ext2_make_empty(struct inode *inode, struct inode *parent)
623{ 624{
624 struct address_space *mapping = inode->i_mapping; 625 struct page *page = grab_cache_page(inode->i_mapping, 0);
625 struct page *page = grab_cache_page(mapping, 0);
626 unsigned chunk_size = ext2_chunk_size(inode); 626 unsigned chunk_size = ext2_chunk_size(inode);
627 struct ext2_dir_entry_2 * de; 627 struct ext2_dir_entry_2 * de;
628 int err; 628 int err;
@@ -631,8 +631,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
631 if (!page) 631 if (!page)
632 return -ENOMEM; 632 return -ENOMEM;
633 633
634 err = __ext2_write_begin(NULL, page->mapping, 0, chunk_size, 0, 634 err = ext2_prepare_chunk(page, 0, chunk_size);
635 &page, NULL);
636 if (err) { 635 if (err) {
637 unlock_page(page); 636 unlock_page(page);
638 goto fail; 637 goto fail;
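With __ext2_write_begin() dropped from the public interface (see the ext2.h and inode.c hunks below), directory updates now go through the file-local ext2_prepare_chunk() wrapper around __block_write_begin(). A hedged usage sketch, valid inside fs/ext2/dir.c where ext2_commit_chunk() is visible:

/* Update 'len' bytes of a directory page in place (page comes in locked). */
static int example_update_chunk(struct page *page, loff_t pos, unsigned len)
{
	int err = ext2_prepare_chunk(page, pos, len);	/* map the buffers */

	if (err)
		return err;
	/* ... rewrite the entry bytes at 'pos' here ... */
	return ext2_commit_chunk(page, pos, len);	/* writes and unlocks */
}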
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 52b34f1d273..6346a2acf32 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -119,17 +119,13 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
119/* inode.c */ 119/* inode.c */
120extern struct inode *ext2_iget (struct super_block *, unsigned long); 120extern struct inode *ext2_iget (struct super_block *, unsigned long);
121extern int ext2_write_inode (struct inode *, struct writeback_control *); 121extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_evict_inode(struct inode *);
123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 123extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern int ext2_setattr (struct dentry *, struct iattr *); 124extern int ext2_setattr (struct dentry *, struct iattr *);
126extern void ext2_set_inode_flags(struct inode *inode); 125extern void ext2_set_inode_flags(struct inode *inode);
127extern void ext2_get_inode_flags(struct ext2_inode_info *); 126extern void ext2_get_inode_flags(struct ext2_inode_info *);
128extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 127extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
129 u64 start, u64 len); 128 u64 start, u64 len);
130int __ext2_write_begin(struct file *file, struct address_space *mapping,
131 loff_t pos, unsigned len, unsigned flags,
132 struct page **pagep, void **fsdata);
133 129
134/* ioctl.c */ 130/* ioctl.c */
135extern long ext2_ioctl(struct file *, unsigned int, unsigned long); 131extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 938dbc739d0..ad70479aabf 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -118,19 +118,14 @@ void ext2_free_inode (struct inode * inode)
118 * Note: we must free any quota before locking the superblock, 118 * Note: we must free any quota before locking the superblock,
119 * as writing the quota to disk may need the lock as well. 119 * as writing the quota to disk may need the lock as well.
120 */ 120 */
121 if (!is_bad_inode(inode)) { 121 /* Quota is already initialized in iput() */
122 /* Quota is already initialized in iput() */ 122 ext2_xattr_delete_inode(inode);
123 ext2_xattr_delete_inode(inode); 123 dquot_free_inode(inode);
124 dquot_free_inode(inode); 124 dquot_drop(inode);
125 dquot_drop(inode);
126 }
127 125
128 es = EXT2_SB(sb)->s_es; 126 es = EXT2_SB(sb)->s_es;
129 is_directory = S_ISDIR(inode->i_mode); 127 is_directory = S_ISDIR(inode->i_mode);
130 128
131 /* Do this BEFORE marking the inode not in use or returning an error */
132 clear_inode (inode);
133
134 if (ino < EXT2_FIRST_INO(sb) || 129 if (ino < EXT2_FIRST_INO(sb) ||
135 ino > le32_to_cpu(es->s_inodes_count)) { 130 ino > le32_to_cpu(es->s_inodes_count)) {
136 ext2_error (sb, "ext2_free_inode", 131 ext2_error (sb, "ext2_free_inode",
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 3675088cb88..40ad210a504 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -69,26 +69,42 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to)
69/* 69/*
70 * Called at the last iput() if i_nlink is zero. 70 * Called at the last iput() if i_nlink is zero.
71 */ 71 */
72void ext2_delete_inode (struct inode * inode) 72void ext2_evict_inode(struct inode * inode)
73{ 73{
74 if (!is_bad_inode(inode)) 74 struct ext2_block_alloc_info *rsv;
75 int want_delete = 0;
76
77 if (!inode->i_nlink && !is_bad_inode(inode)) {
78 want_delete = 1;
75 dquot_initialize(inode); 79 dquot_initialize(inode);
80 } else {
81 dquot_drop(inode);
82 }
83
76 truncate_inode_pages(&inode->i_data, 0); 84 truncate_inode_pages(&inode->i_data, 0);
77 85
78 if (is_bad_inode(inode)) 86 if (want_delete) {
79 goto no_delete; 87 /* set dtime */
80 EXT2_I(inode)->i_dtime = get_seconds(); 88 EXT2_I(inode)->i_dtime = get_seconds();
81 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
82 __ext2_write_inode(inode, inode_needs_sync(inode)); 90 __ext2_write_inode(inode, inode_needs_sync(inode));
91 /* truncate to 0 */
92 inode->i_size = 0;
93 if (inode->i_blocks)
94 ext2_truncate_blocks(inode, 0);
95 }
83 96
84 inode->i_size = 0; 97 invalidate_inode_buffers(inode);
85 if (inode->i_blocks) 98 end_writeback(inode);
86 ext2_truncate_blocks(inode, 0);
87 ext2_free_inode (inode);
88 99
89 return; 100 ext2_discard_reservation(inode);
90no_delete: 101 rsv = EXT2_I(inode)->i_block_alloc_info;
91 clear_inode(inode); /* We must guarantee clearing of inode... */ 102 EXT2_I(inode)->i_block_alloc_info = NULL;
103 if (unlikely(rsv))
104 kfree(rsv);
105
106 if (want_delete)
107 ext2_free_inode(inode);
92} 108}
93 109
94typedef struct { 110typedef struct {
@@ -423,6 +439,8 @@ static int ext2_alloc_blocks(struct inode *inode,
423failed_out: 439failed_out:
424 for (i = 0; i <index; i++) 440 for (i = 0; i <index; i++)
425 ext2_free_blocks(inode, new_blocks[i], 1); 441 ext2_free_blocks(inode, new_blocks[i], 1);
442 if (index)
443 mark_inode_dirty(inode);
426 return ret; 444 return ret;
427} 445}
428 446
@@ -440,7 +458,7 @@ failed_out:
440 * the same format as ext2_get_branch() would do. We are calling it after 458 * the same format as ext2_get_branch() would do. We are calling it after
441 * we had read the existing part of chain and partial points to the last 459 * we had read the existing part of chain and partial points to the last
442 * triple of that (one with zero ->key). Upon the exit we have the same 460 * triple of that (one with zero ->key). Upon the exit we have the same
443 * picture as after the successful ext2_get_block(), excpet that in one 461 * picture as after the successful ext2_get_block(), except that in one
444 * place chain is disconnected - *branch->p is still zero (we did not 462 * place chain is disconnected - *branch->p is still zero (we did not
445 * set the last link), but branch->key contains the number that should 463 * set the last link), but branch->key contains the number that should
446 * be placed into *branch->p to fill that gap. 464 * be placed into *branch->p to fill that gap.
@@ -644,7 +662,7 @@ static int ext2_get_blocks(struct inode *inode,
644 mutex_lock(&ei->truncate_mutex); 662 mutex_lock(&ei->truncate_mutex);
645 /* 663 /*
646 * If the indirect block is missing while we are reading 664 * If the indirect block is missing while we are reading
647 * the chain(ext3_get_branch() returns -EAGAIN err), or 665 * the chain(ext2_get_branch() returns -EAGAIN err), or
648 * if the chain has been changed after we grab the semaphore, 666 * if the chain has been changed after we grab the semaphore,
649 * (either because another process truncated this branch, or 667 * (either because another process truncated this branch, or
650 * another get_block allocated this branch) re-grab the chain to see if 668 * another get_block allocated this branch) re-grab the chain to see if
@@ -765,14 +783,6 @@ ext2_readpages(struct file *file, struct address_space *mapping,
765 return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); 783 return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
766} 784}
767 785
768int __ext2_write_begin(struct file *file, struct address_space *mapping,
769 loff_t pos, unsigned len, unsigned flags,
770 struct page **pagep, void **fsdata)
771{
772 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
773 pagep, fsdata, ext2_get_block);
774}
775
776static int 786static int
777ext2_write_begin(struct file *file, struct address_space *mapping, 787ext2_write_begin(struct file *file, struct address_space *mapping,
778 loff_t pos, unsigned len, unsigned flags, 788 loff_t pos, unsigned len, unsigned flags,
@@ -780,8 +790,8 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
780{ 790{
781 int ret; 791 int ret;
782 792
783 *pagep = NULL; 793 ret = block_write_begin(mapping, pos, len, flags, pagep,
784 ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata); 794 ext2_get_block);
785 if (ret < 0) 795 if (ret < 0)
786 ext2_write_failed(mapping, pos + len); 796 ext2_write_failed(mapping, pos + len);
787 return ret; 797 return ret;
@@ -806,13 +816,8 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
806{ 816{
807 int ret; 817 int ret;
808 818
809 /* 819 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
810 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework 820 ext2_get_block);
811 * directory handling code to pass around offsets rather than struct
812 * pages in order to make this work easily.
813 */
814 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
815 fsdata, ext2_get_block);
816 if (ret < 0) 821 if (ret < 0)
817 ext2_write_failed(mapping, pos + len); 822 ext2_write_failed(mapping, pos + len);
818 return ret; 823 return ret;
@@ -838,7 +843,7 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
838 struct inode *inode = mapping->host; 843 struct inode *inode = mapping->host;
839 ssize_t ret; 844 ssize_t ret;
840 845
841 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev, 846 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
842 iov, offset, nr_segs, ext2_get_block, NULL); 847 iov, offset, nr_segs, ext2_get_block, NULL);
843 if (ret < 0 && (rw & WRITE)) 848 if (ret < 0 && (rw & WRITE))
844 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); 849 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
@@ -1006,8 +1011,8 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
1006 else if (block_to_free == nr - count) 1011 else if (block_to_free == nr - count)
1007 count++; 1012 count++;
1008 else { 1013 else {
1009 mark_inode_dirty(inode);
1010 ext2_free_blocks (inode, block_to_free, count); 1014 ext2_free_blocks (inode, block_to_free, count);
1015 mark_inode_dirty(inode);
1011 free_this: 1016 free_this:
1012 block_to_free = nr; 1017 block_to_free = nr;
1013 count = 1; 1018 count = 1;
@@ -1015,8 +1020,8 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
1015 } 1020 }
1016 } 1021 }
1017 if (count > 0) { 1022 if (count > 0) {
1018 mark_inode_dirty(inode);
1019 ext2_free_blocks (inode, block_to_free, count); 1023 ext2_free_blocks (inode, block_to_free, count);
1024 mark_inode_dirty(inode);
1020 } 1025 }
1021} 1026}
1022 1027
@@ -1169,15 +1174,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
1169 __ext2_truncate_blocks(inode, offset); 1174 __ext2_truncate_blocks(inode, offset);
1170} 1175}
1171 1176
1172int ext2_setsize(struct inode *inode, loff_t newsize) 1177static int ext2_setsize(struct inode *inode, loff_t newsize)
1173{ 1178{
1174 loff_t oldsize;
1175 int error; 1179 int error;
1176 1180
1177 error = inode_newsize_ok(inode, newsize);
1178 if (error)
1179 return error;
1180
1181 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1181 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1182 S_ISLNK(inode->i_mode))) 1182 S_ISLNK(inode->i_mode)))
1183 return -EINVAL; 1183 return -EINVAL;
@@ -1197,16 +1197,13 @@ int ext2_setsize(struct inode *inode, loff_t newsize)
1197 if (error) 1197 if (error)
1198 return error; 1198 return error;
1199 1199
1200 oldsize = inode->i_size; 1200 truncate_setsize(inode, newsize);
1201 i_size_write(inode, newsize);
1202 truncate_pagecache(inode, oldsize, newsize);
1203
1204 __ext2_truncate_blocks(inode, newsize); 1201 __ext2_truncate_blocks(inode, newsize);
1205 1202
1206 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1207 if (inode_needs_sync(inode)) { 1204 if (inode_needs_sync(inode)) {
1208 sync_mapping_buffers(inode->i_mapping); 1205 sync_mapping_buffers(inode->i_mapping);
1209 ext2_sync_inode (inode); 1206 sync_inode_metadata(inode, 1);
1210 } else { 1207 } else {
1211 mark_inode_dirty(inode); 1208 mark_inode_dirty(inode);
1212 } 1209 }
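truncate_setsize() folds the deleted three-line sequence into a single VFS helper. At the time of this change the helper was, in essence, the same two calls behind one name; this is reconstructed from memory of mm/truncate.c, so treat it as a sketch rather than a quotation:

void truncate_setsize(struct inode *inode, loff_t newsize)
{
        loff_t oldsize = inode->i_size;

        i_size_write(inode, newsize);
        truncate_pagecache(inode, oldsize, newsize);
}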
@@ -1526,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1526 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1527} 1524}
1528 1525
1529int ext2_sync_inode(struct inode *inode)
1530{
1531 struct writeback_control wbc = {
1532 .sync_mode = WB_SYNC_ALL,
1533 .nr_to_write = 0, /* sys_fsync did this */
1534 };
1535 return sync_inode(inode, &wbc);
1536}
1537
1538int ext2_setattr(struct dentry *dentry, struct iattr *iattr) 1526int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1539{ 1527{
1540 struct inode *inode = dentry->d_inode; 1528 struct inode *inode = dentry->d_inode;
@@ -1557,7 +1545,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1557 if (error) 1545 if (error)
1558 return error; 1546 return error;
1559 } 1547 }
1560 generic_setattr(inode, iattr); 1548 setattr_copy(inode, iattr);
1561 if (iattr->ia_valid & ATTR_MODE) 1549 if (iattr->ia_valid & ATTR_MODE)
1562 error = ext2_acl_chmod(inode); 1550 error = ext2_acl_chmod(inode);
1563 mark_inode_dirty(inode); 1551 mark_inode_dirty(inode);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f..f8aecd2e329 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
206 206
207 inode->i_ctime = CURRENT_TIME_SEC; 207 inode->i_ctime = CURRENT_TIME_SEC;
208 inode_inc_link_count(inode); 208 inode_inc_link_count(inode);
209 atomic_inc(&inode->i_count); 209 ihold(inode);
210 210
211 err = ext2_add_link(dentry, inode); 211 err = ext2_add_link(dentry, inode);
212 if (!err) { 212 if (!err) {
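ihold() replaces the open-coded atomic_inc() of i_count, so every site that takes an extra inode reference goes through one helper the VFS can instrument. As introduced it is little more than the increment plus a sanity check (again a sketch from memory):

void ihold(struct inode *inode)
{
        WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}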
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7ff43f4a59c..d89e0b6a2d7 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -195,17 +195,6 @@ static void destroy_inodecache(void)
195 kmem_cache_destroy(ext2_inode_cachep); 195 kmem_cache_destroy(ext2_inode_cachep);
196} 196}
197 197
198static void ext2_clear_inode(struct inode *inode)
199{
200 struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
201
202 dquot_drop(inode);
203 ext2_discard_reservation(inode);
204 EXT2_I(inode)->i_block_alloc_info = NULL;
205 if (unlikely(rsv))
206 kfree(rsv);
207}
208
209static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) 198static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
210{ 199{
211 struct super_block *sb = vfs->mnt_sb; 200 struct super_block *sb = vfs->mnt_sb;
@@ -299,13 +288,12 @@ static const struct super_operations ext2_sops = {
299 .alloc_inode = ext2_alloc_inode, 288 .alloc_inode = ext2_alloc_inode,
300 .destroy_inode = ext2_destroy_inode, 289 .destroy_inode = ext2_destroy_inode,
301 .write_inode = ext2_write_inode, 290 .write_inode = ext2_write_inode,
302 .delete_inode = ext2_delete_inode, 291 .evict_inode = ext2_evict_inode,
303 .put_super = ext2_put_super, 292 .put_super = ext2_put_super,
304 .write_super = ext2_write_super, 293 .write_super = ext2_write_super,
305 .sync_fs = ext2_sync_fs, 294 .sync_fs = ext2_sync_fs,
306 .statfs = ext2_statfs, 295 .statfs = ext2_statfs,
307 .remount_fs = ext2_remount, 296 .remount_fs = ext2_remount,
308 .clear_inode = ext2_clear_inode,
309 .show_options = ext2_show_options, 297 .show_options = ext2_show_options,
310#ifdef CONFIG_QUOTA 298#ifdef CONFIG_QUOTA
311 .quota_read = ext2_quota_read, 299 .quota_read = ext2_quota_read,
@@ -759,15 +747,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
759 __le32 features; 747 __le32 features;
760 int err; 748 int err;
761 749
750 err = -ENOMEM;
762 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 751 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
763 if (!sbi) 752 if (!sbi)
764 return -ENOMEM; 753 goto failed_unlock;
765 754
766 sbi->s_blockgroup_lock = 755 sbi->s_blockgroup_lock =
767 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 756 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
768 if (!sbi->s_blockgroup_lock) { 757 if (!sbi->s_blockgroup_lock) {
769 kfree(sbi); 758 kfree(sbi);
770 return -ENOMEM; 759 goto failed_unlock;
771 } 760 }
772 sb->s_fs_info = sbi; 761 sb->s_fs_info = sbi;
773 sbi->s_sb_block = sb_block; 762 sbi->s_sb_block = sb_block;
@@ -1119,6 +1108,7 @@ failed_sbi:
1119 sb->s_fs_info = NULL; 1108 sb->s_fs_info = NULL;
1120 kfree(sbi->s_blockgroup_lock); 1109 kfree(sbi->s_blockgroup_lock);
1121 kfree(sbi); 1110 kfree(sbi);
1111failed_unlock:
1122 return ret; 1112 return ret;
1123} 1113}
1124 1114
@@ -1231,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1231 } 1221 }
1232 1222
1233 es = sbi->s_es; 1223 es = sbi->s_es;
1234 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1224 if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
1235 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1236 invalidate_inodes(sb)) {
1237 ext2_msg(sb, KERN_WARNING, "warning: refusing change of " 1225 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1238 "xip flag with busy inodes while remounting"); 1226 "xip flag with busy inodes while remounting");
1239 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1227 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
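The rewritten condition uses the standard XOR idiom (a bit set in old ^ new is a bit whose value changed) and drops the invalidate_inodes(sb) test entirely, so the warning now fires on any attempted XIP change. A standalone demonstration of the idiom; the flag value below is a stand-in for this demo, not ext2's real EXT2_MOUNT_XIP definition:

#include <assert.h>

#define MOUNT_XIP 0x0004        /* hypothetical flag bit for the demo */

int main(void)
{
        unsigned int old_opt = 0x0005;  /* XIP was set   */
        unsigned int new_opt = 0x0001;  /* XIP now clear */

        assert((old_opt ^ new_opt) & MOUNT_XIP);        /* change detected */

        new_opt = old_opt;                              /* nothing changed */
        assert(!((old_opt ^ new_opt) & MOUNT_XIP));
        return 0;
}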
@@ -1368,10 +1356,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1368 return 0; 1356 return 0;
1369} 1357}
1370 1358
1371static int ext2_get_sb(struct file_system_type *fs_type, 1359static struct dentry *ext2_mount(struct file_system_type *fs_type,
1372 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1360 int flags, const char *dev_name, void *data)
1373{ 1361{
1374 return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt); 1362 return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
1375} 1363}
1376 1364
1377#ifdef CONFIG_QUOTA 1365#ifdef CONFIG_QUOTA
@@ -1485,7 +1473,7 @@ out:
1485static struct file_system_type ext2_fs_type = { 1473static struct file_system_type ext2_fs_type = {
1486 .owner = THIS_MODULE, 1474 .owner = THIS_MODULE,
1487 .name = "ext2", 1475 .name = "ext2",
1488 .get_sb = ext2_get_sb, 1476 .mount = ext2_mount,
1489 .kill_sb = kill_block_super, 1477 .kill_sb = kill_block_super,
1490 .fs_flags = FS_REQUIRES_DEV, 1478 .fs_flags = FS_REQUIRES_DEV,
1491}; 1479};
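The ->get_sb() to ->mount() conversion is mechanical for block-device filesystems: the hook now returns the root dentry obtained from mount_bdev() instead of filling in a vfsmount. The general shape, with hypothetical names standing in for the per-filesystem ones:

static struct dentry *foofs_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
{
        return mount_bdev(fs_type, flags, dev_name, data, foofs_fill_super);
}

static struct file_system_type foofs_type = {
        .owner          = THIS_MODULE,
        .name           = "foofs",
        .mount          = foofs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
};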
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 7c3915780b1..f84700be327 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -674,6 +674,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
674 new_bh = sb_getblk(sb, block); 674 new_bh = sb_getblk(sb, block);
675 if (!new_bh) { 675 if (!new_bh) {
676 ext2_free_blocks(inode, block, 1); 676 ext2_free_blocks(inode, block, 1);
677 mark_inode_dirty(inode);
677 error = -EIO; 678 error = -EIO;
678 goto cleanup; 679 goto cleanup;
679 } 680 }
@@ -698,13 +699,15 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
698 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; 699 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
699 inode->i_ctime = CURRENT_TIME_SEC; 700 inode->i_ctime = CURRENT_TIME_SEC;
700 if (IS_SYNC(inode)) { 701 if (IS_SYNC(inode)) {
701 error = ext2_sync_inode (inode); 702 error = sync_inode_metadata(inode, 1);
702 /* In case sync failed due to ENOSPC the inode was actually 703 /* In case sync failed due to ENOSPC the inode was actually
703 * written (only some dirty data were not) so we just proceed 704 * written (only some dirty data were not) so we just proceed
704 * as if nothing happened and cleanup the unused block */ 705 * as if nothing happened and cleanup the unused block */
705 if (error && error != -ENOSPC) { 706 if (error && error != -ENOSPC) {
706 if (new_bh && new_bh != old_bh) 707 if (new_bh && new_bh != old_bh) {
707 dquot_free_block(inode, 1); 708 dquot_free_block_nodirty(inode, 1);
709 mark_inode_dirty(inode);
710 }
708 goto cleanup; 711 goto cleanup;
709 } 712 }
710 } else 713 } else
@@ -727,6 +730,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
727 mb_cache_entry_free(ce); 730 mb_cache_entry_free(ce);
728 ea_bdebug(old_bh, "freeing"); 731 ea_bdebug(old_bh, "freeing");
729 ext2_free_blocks(inode, old_bh->b_blocknr, 1); 732 ext2_free_blocks(inode, old_bh->b_blocknr, 1);
733 mark_inode_dirty(inode);
730 /* We let our caller release old_bh, so we 734 /* We let our caller release old_bh, so we
731 * need to duplicate the buffer before. */ 735 * need to duplicate the buffer before. */
732 get_bh(old_bh); 736 get_bh(old_bh);
@@ -736,7 +740,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
736 le32_add_cpu(&HDR(old_bh)->h_refcount, -1); 740 le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
737 if (ce) 741 if (ce)
738 mb_cache_entry_release(ce); 742 mb_cache_entry_release(ce);
739 dquot_free_block(inode, 1); 743 dquot_free_block_nodirty(inode, 1);
744 mark_inode_dirty(inode);
740 mark_buffer_dirty(old_bh); 745 mark_buffer_dirty(old_bh);
741 ea_bdebug(old_bh, "refcount now=%d", 746 ea_bdebug(old_bh, "refcount now=%d",
742 le32_to_cpu(HDR(old_bh)->h_refcount)); 747 le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -799,7 +804,7 @@ ext2_xattr_delete_inode(struct inode *inode)
799 mark_buffer_dirty(bh); 804 mark_buffer_dirty(bh);
800 if (IS_SYNC(inode)) 805 if (IS_SYNC(inode))
801 sync_dirty_buffer(bh); 806 sync_dirty_buffer(bh);
802 dquot_free_block(inode, 1); 807 dquot_free_block_nodirty(inode, 1);
803 } 808 }
804 EXT2_I(inode)->i_file_acl = 0; 809 EXT2_I(inode)->i_file_acl = 0;
805 810
@@ -838,7 +843,7 @@ ext2_xattr_cache_insert(struct buffer_head *bh)
838 ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); 843 ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
839 if (!ce) 844 if (!ce)
840 return -ENOMEM; 845 return -ENOMEM;
841 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); 846 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
842 if (error) { 847 if (error) {
843 mb_cache_entry_free(ce); 848 mb_cache_entry_free(ce);
844 if (error == -EBUSY) { 849 if (error == -EBUSY) {
@@ -912,8 +917,8 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
912 return NULL; /* never share */ 917 return NULL; /* never share */
913 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 918 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
914again: 919again:
915 ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, 920 ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
916 inode->i_sb->s_bdev, hash); 921 hash);
917 while (ce) { 922 while (ce) {
918 struct buffer_head *bh; 923 struct buffer_head *bh;
919 924
@@ -945,7 +950,7 @@ again:
945 unlock_buffer(bh); 950 unlock_buffer(bh);
946 brelse(bh); 951 brelse(bh);
947 } 952 }
948 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); 953 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
949 } 954 }
950 return NULL; 955 return NULL;
951} 956}
@@ -1021,9 +1026,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
1021int __init 1026int __init
1022init_ext2_xattr(void) 1027init_ext2_xattr(void)
1023{ 1028{
1024 ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL, 1029 ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
1025 sizeof(struct mb_cache_entry) +
1026 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1027 if (!ext2_xattr_cache) 1030 if (!ext2_xattr_cache)
1028 return -ENOMEM; 1031 return -ENOMEM;
1029 return 0; 1032 return 0;
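The mbcache rework behind these hunks collapses the multi-index scheme to a single index per cache, so the entry-size arithmetic disappears from mb_cache_create() and the index argument disappears from the lookups. Collected from the new-side lines above, the API as used after the patch (the final 6 is carried over from the old call, presumably the hash-bucket order):

ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);

error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);

ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
                               hash);
ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);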
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 522b15498f4..e8c6ba0e4a3 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -31,6 +31,7 @@ config EXT3_FS
31config EXT3_DEFAULTS_TO_ORDERED 31config EXT3_DEFAULTS_TO_ORDERED
32 bool "Default to 'data=ordered' in ext3" 32 bool "Default to 'data=ordered' in ext3"
33 depends on EXT3_FS 33 depends on EXT3_FS
34 default y
34 help 35 help
35 The journal mode options for ext3 have different tradeoffs 36 The journal mode options for ext3 have different tradeoffs
36 between when data is guaranteed to be on disk and 37 between when data is guaranteed to be on disk and
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 4a32511f4de..b3db2264942 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -792,9 +792,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
792 if (here < 0) 792 if (here < 0)
793 here = 0; 793 here = 0;
794 794
795 p = ((char *)bh->b_data) + (here >> 3); 795 p = bh->b_data + (here >> 3);
796 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); 796 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
797 next = (r - ((char *)bh->b_data)) << 3; 797 next = (r - bh->b_data) << 3;
798 798
799 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) 799 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
800 return next; 800 return next;
@@ -810,8 +810,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
810 810
811/** 811/**
812 * claim_block() 812 * claim_block()
813 * @lock: the spin lock for this block group
813 * @block: the free block (group relative) to allocate 814 * @block: the free block (group relative) to allocate
814 * @bh: the bufferhead containts the block group bitmap 815 * @bh: the buffer_head contains the block group bitmap
815 * 816 *
816 * We think we can allocate this block in this bitmap. Try to set the bit. 817 * We think we can allocate this block in this bitmap. Try to set the bit.
817 * If that succeeds then check that nobody has allocated and then freed the 818 * If that succeeds then check that nobody has allocated and then freed the
@@ -956,9 +957,11 @@ fail_access:
956 * but we will shift to the place where start_block is, 957 * but we will shift to the place where start_block is,
957 * then start from there, when looking for a reservable space. 958 * then start from there, when looking for a reservable space.
958 * 959 *
959 * @size: the target new reservation window size 960 * @my_rsv: the reservation window
960 * 961 *
961 * @group_first_block: the first block we consider to start 962 * @sb: the super block
963 *
964 * @start_block: the first block we consider to start
962 * the real search from 965 * the real search from
963 * 966 *
964 * @last_block: 967 * @last_block:
@@ -1084,7 +1087,7 @@ static int find_next_reservable_window(
1084 * 1087 *
1085 * failed: we failed to find a reservation window in this group 1088 * failed: we failed to find a reservation window in this group
1086 * 1089 *
1087 * @rsv: the reservation 1090 * @my_rsv: the reservation window
1088 * 1091 *
1089 * @grp_goal: The goal (group-relative). It is where the search for a 1092 * @grp_goal: The goal (group-relative). It is where the search for a
1090 * free reservable space should start from. 1093 * free reservable space should start from.
@@ -1273,8 +1276,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1273 * @group: given allocation block group 1276 * @group: given allocation block group
1274 * @bitmap_bh: bufferhead holds the block bitmap 1277 * @bitmap_bh: bufferhead holds the block bitmap
1275 * @grp_goal: given target block within the group 1278 * @grp_goal: given target block within the group
1276 * @count: target number of blocks to allocate
1277 * @my_rsv: reservation window 1279 * @my_rsv: reservation window
1280 * @count: target number of blocks to allocate
1278 * @errp: pointer to store the error code 1281 * @errp: pointer to store the error code
1279 * 1282 *
1280 * This is the main function used to allocate a new block and its reservation 1283 * This is the main function used to allocate a new block and its reservation
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a..09b13bb34c9 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
90 * storage 90 * storage
91 */ 91 */
92 if (needs_barrier) 92 if (needs_barrier)
93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
94 BLKDEV_IFL_WAIT);
95 return ret; 94 return ret;
96} 95}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 498021eb88f..9724aef2246 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -119,20 +119,8 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
119 ino = inode->i_ino; 119 ino = inode->i_ino;
120 ext3_debug ("freeing inode %lu\n", ino); 120 ext3_debug ("freeing inode %lu\n", ino);
121 121
122 /*
123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well.
125 */
126 dquot_initialize(inode);
127 ext3_xattr_delete_inode(handle, inode);
128 dquot_free_inode(inode);
129 dquot_drop(inode);
130
131 is_directory = S_ISDIR(inode->i_mode); 122 is_directory = S_ISDIR(inode->i_mode);
132 123
133 /* Do this BEFORE marking the inode not in use or returning an error */
134 clear_inode (inode);
135
136 es = EXT3_SB(sb)->s_es; 124 es = EXT3_SB(sb)->s_es;
137 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 125 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
138 ext3_error (sb, "ext3_free_inode", 126 ext3_error (sb, "ext3_free_inode",
@@ -582,9 +570,14 @@ got:
582 ei->i_state_flags = 0; 570 ei->i_state_flags = 0;
583 ext3_set_inode_state(inode, EXT3_STATE_NEW); 571 ext3_set_inode_state(inode, EXT3_STATE_NEW);
584 572
585 ei->i_extra_isize = 573 /* See comment in ext3_iget for explanation */
586 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 574 if (ino >= EXT3_FIRST_INO(sb) + 1 &&
587 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 575 EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
576 ei->i_extra_isize =
577 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
578 } else {
579 ei->i_extra_isize = 0;
580 }
588 581
589 ret = inode; 582 ret = inode;
590 dquot_initialize(inode); 583 dquot_initialize(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 735f0190ec2..a9580617edd 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -190,18 +190,28 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
190} 190}
191 191
192/* 192/*
193 * Called at the last iput() if i_nlink is zero. 193 * Called at inode eviction from icache
194 */ 194 */
195void ext3_delete_inode (struct inode * inode) 195void ext3_evict_inode (struct inode *inode)
196{ 196{
197 struct ext3_block_alloc_info *rsv;
197 handle_t *handle; 198 handle_t *handle;
199 int want_delete = 0;
198 200
199 if (!is_bad_inode(inode)) 201 if (!inode->i_nlink && !is_bad_inode(inode)) {
200 dquot_initialize(inode); 202 dquot_initialize(inode);
203 want_delete = 1;
204 }
201 205
202 truncate_inode_pages(&inode->i_data, 0); 206 truncate_inode_pages(&inode->i_data, 0);
203 207
204 if (is_bad_inode(inode)) 208 ext3_discard_reservation(inode);
209 rsv = EXT3_I(inode)->i_block_alloc_info;
210 EXT3_I(inode)->i_block_alloc_info = NULL;
211 if (unlikely(rsv))
212 kfree(rsv);
213
214 if (!want_delete)
205 goto no_delete; 215 goto no_delete;
206 216
207 handle = start_transaction(inode); 217 handle = start_transaction(inode);
@@ -238,15 +248,22 @@ void ext3_delete_inode (struct inode * inode)
238 * having errors), but we can't free the inode if the mark_dirty 248 * having errors), but we can't free the inode if the mark_dirty
239 * fails. 249 * fails.
240 */ 250 */
241 if (ext3_mark_inode_dirty(handle, inode)) 251 if (ext3_mark_inode_dirty(handle, inode)) {
242 /* If that failed, just do the required in-core inode clear. */ 252 /* If that failed, just dquot_drop() and be done with that */
243 clear_inode(inode); 253 dquot_drop(inode);
244 else 254 end_writeback(inode);
255 } else {
256 ext3_xattr_delete_inode(handle, inode);
257 dquot_free_inode(inode);
258 dquot_drop(inode);
259 end_writeback(inode);
245 ext3_free_inode(handle, inode); 260 ext3_free_inode(handle, inode);
261 }
246 ext3_journal_stop(handle); 262 ext3_journal_stop(handle);
247 return; 263 return;
248no_delete: 264no_delete:
249 clear_inode(inode); /* We must guarantee clearing of inode... */ 265 end_writeback(inode);
266 dquot_drop(inode);
250} 267}
251 268
252typedef struct { 269typedef struct {
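Stitching the two ext3 hunks together, the eviction path reads as follows on the new side; the middle of the function is unchanged by this patch and is elided:

void ext3_evict_inode (struct inode *inode)
{
        struct ext3_block_alloc_info *rsv;
        handle_t *handle;
        int want_delete = 0;

        if (!inode->i_nlink && !is_bad_inode(inode)) {
                dquot_initialize(inode);
                want_delete = 1;
        }

        truncate_inode_pages(&inode->i_data, 0);

        ext3_discard_reservation(inode);
        rsv = EXT3_I(inode)->i_block_alloc_info;
        EXT3_I(inode)->i_block_alloc_info = NULL;
        if (unlikely(rsv))
                kfree(rsv);

        if (!want_delete)
                goto no_delete;

        handle = start_transaction(inode);
        /* ... unchanged middle of the function, not shown in this diff ... */

        if (ext3_mark_inode_dirty(handle, inode)) {
                /* If that failed, just dquot_drop() and be done with that */
                dquot_drop(inode);
                end_writeback(inode);
        } else {
                ext3_xattr_delete_inode(handle, inode);
                dquot_free_inode(inode);
                dquot_drop(inode);
                end_writeback(inode);
                ext3_free_inode(handle, inode);
        }
        ext3_journal_stop(handle);
        return;
no_delete:
        end_writeback(inode);
        dquot_drop(inode);
}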
@@ -481,7 +498,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
481} 498}
482 499
483/** 500/**
484 * ext3_blks_to_allocate: Look up the block map and count the number 501 * ext3_blks_to_allocate - Look up the block map and count the number
485 * of direct blocks need to be allocated for the given branch. 502 * of direct blocks need to be allocated for the given branch.
486 * 503 *
487 * @branch: chain of indirect blocks 504 * @branch: chain of indirect blocks
@@ -519,14 +536,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
519} 536}
520 537
521/** 538/**
522 * ext3_alloc_blocks: multiple allocate blocks needed for a branch 539 * ext3_alloc_blocks - multiple allocate blocks needed for a branch
540 * @handle: handle for this transaction
541 * @inode: owner
542 * @goal: preferred place for allocation
523 * @indirect_blks: the number of blocks we need to allocate for indirect 543 * @indirect_blks: the number of blocks we need to allocate for indirect
524 * blocks 544 * blocks
525 * 545 * @blks: number of blocks we need to allocate for direct blocks
526 * @new_blocks: on return it will store the new block numbers for 546 * @new_blocks: on return it will store the new block numbers for
527 * the indirect blocks(if needed) and the first direct block, 547 * the indirect blocks(if needed) and the first direct block,
528 * @blks: on return it will store the total number of allocated 548 * @err: here we store the error value
529 * direct blocks 549 *
550 * return the number of direct blocks allocated
530 */ 551 */
531static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, 552static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
532 ext3_fsblk_t goal, int indirect_blks, int blks, 553 ext3_fsblk_t goal, int indirect_blks, int blks,
@@ -581,9 +602,11 @@ failed_out:
581 602
582/** 603/**
583 * ext3_alloc_branch - allocate and set up a chain of blocks. 604 * ext3_alloc_branch - allocate and set up a chain of blocks.
605 * @handle: handle for this transaction
584 * @inode: owner 606 * @inode: owner
585 * @indirect_blks: number of allocated indirect blocks 607 * @indirect_blks: number of allocated indirect blocks
586 * @blks: number of allocated direct blocks 608 * @blks: number of allocated direct blocks
609 * @goal: preferred place for allocation
587 * @offsets: offsets (in the blocks) to store the pointers to next. 610 * @offsets: offsets (in the blocks) to store the pointers to next.
588 * @branch: place to store the chain in. 611 * @branch: place to store the chain in.
589 * 612 *
@@ -683,10 +706,9 @@ failed:
683 706
684/** 707/**
685 * ext3_splice_branch - splice the allocated branch onto inode. 708 * ext3_splice_branch - splice the allocated branch onto inode.
709 * @handle: handle for this transaction
686 * @inode: owner 710 * @inode: owner
687 * @block: (logical) number of block we are adding 711 * @block: (logical) number of block we are adding
688 * @chain: chain of indirect blocks (with a missing link - see
689 * ext3_alloc_branch)
690 * @where: location of missing link 712 * @where: location of missing link
691 * @num: number of indirect blocks we are adding 713 * @num: number of indirect blocks we are adding
692 * @blks: number of direct blocks we are adding 714 * @blks: number of direct blocks we are adding
@@ -1149,9 +1171,25 @@ static int walk_page_buffers( handle_t *handle,
1149static int do_journal_get_write_access(handle_t *handle, 1171static int do_journal_get_write_access(handle_t *handle,
1150 struct buffer_head *bh) 1172 struct buffer_head *bh)
1151{ 1173{
1174 int dirty = buffer_dirty(bh);
1175 int ret;
1176
1152 if (!buffer_mapped(bh) || buffer_freed(bh)) 1177 if (!buffer_mapped(bh) || buffer_freed(bh))
1153 return 0; 1178 return 0;
1154 return ext3_journal_get_write_access(handle, bh); 1179 /*
1180 * __block_prepare_write() could have dirtied some buffers. Clean
1181 * the dirty bit as jbd2_journal_get_write_access() could complain
1182 * otherwise about fs integrity issues. Setting of the dirty bit
1183 * by __block_prepare_write() isn't a real problem here as we clear
1184 * the bit before releasing a page lock and thus writeback cannot
1185 * ever write the buffer.
1186 */
1187 if (dirty)
1188 clear_buffer_dirty(bh);
1189 ret = ext3_journal_get_write_access(handle, bh);
1190 if (!ret && dirty)
1191 ret = ext3_journal_dirty_metadata(handle, bh);
1192 return ret;
1155} 1193}
1156 1194
1157/* 1195/*
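For reference, the complete helper after this change, transcribed from the new-side column; the long rationale comment is abbreviated here, see the hunk above for the full text:

static int do_journal_get_write_access(handle_t *handle,
                                        struct buffer_head *bh)
{
        int dirty = buffer_dirty(bh);
        int ret;

        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        /*
         * Clear a dirty bit set by __block_prepare_write() before asking
         * the journal for write access, then re-dirty the buffer through
         * the journal if the bit was set; see the full comment above.
         */
        if (dirty)
                clear_buffer_dirty(bh);
        ret = ext3_journal_get_write_access(handle, bh);
        if (!ret && dirty)
                ret = ext3_journal_dirty_metadata(handle, bh);
        return ret;
}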
@@ -1196,8 +1234,7 @@ retry:
1196 ret = PTR_ERR(handle); 1234 ret = PTR_ERR(handle);
1197 goto out; 1235 goto out;
1198 } 1236 }
1199 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1237 ret = __block_write_begin(page, pos, len, ext3_get_block);
1200 ext3_get_block);
1201 if (ret) 1238 if (ret)
1202 goto write_begin_failed; 1239 goto write_begin_failed;
1203 1240
@@ -1625,10 +1662,7 @@ static int ext3_writeback_writepage(struct page *page,
1625 goto out_fail; 1662 goto out_fail;
1626 } 1663 }
1627 1664
1628 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) 1665 ret = block_write_full_page(page, ext3_get_block, wbc);
1629 ret = nobh_writepage(page, ext3_get_block, wbc);
1630 else
1631 ret = block_write_full_page(page, ext3_get_block, wbc);
1632 1666
1633 err = ext3_journal_stop(handle); 1667 err = ext3_journal_stop(handle);
1634 if (!ret) 1668 if (!ret)
@@ -1667,8 +1701,8 @@ static int ext3_journalled_writepage(struct page *page,
1667 * doesn't seem much point in redirtying the page here. 1701 * doesn't seem much point in redirtying the page here.
1668 */ 1702 */
1669 ClearPageChecked(page); 1703 ClearPageChecked(page);
1670 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 1704 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
1671 ext3_get_block); 1705 ext3_get_block);
1672 if (ret != 0) { 1706 if (ret != 0) {
1673 ext3_journal_stop(handle); 1707 ext3_journal_stop(handle);
1674 goto out_unlock; 1708 goto out_unlock;
@@ -1785,6 +1819,17 @@ retry:
1785 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1819 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1786 offset, nr_segs, 1820 offset, nr_segs,
1787 ext3_get_block, NULL); 1821 ext3_get_block, NULL);
1822 /*
1823 * In case of error extending write may have instantiated a few
1824 * blocks outside i_size. Trim these off again.
1825 */
1826 if (unlikely((rw & WRITE) && ret < 0)) {
1827 loff_t isize = i_size_read(inode);
1828 loff_t end = offset + iov_length(iov, nr_segs);
1829
1830 if (end > isize)
1831 vmtruncate(inode, isize);
1832 }
1788 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1833 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1789 goto retry; 1834 goto retry;
1790 1835
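In isolation, the error path added above: a failed extending write through direct IO may already have instantiated blocks beyond i_size, and truncating back to the current size releases them:

        if (unlikely((rw & WRITE) && ret < 0)) {
                loff_t isize = i_size_read(inode);
                loff_t end = offset + iov_length(iov, nr_segs);

                if (end > isize)
                        vmtruncate(inode, isize);       /* trim past-EOF blocks */
        }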
@@ -1922,17 +1967,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1922 length = blocksize - (offset & (blocksize - 1)); 1967 length = blocksize - (offset & (blocksize - 1));
1923 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1968 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1924 1969
1925 /*
1926 * For "nobh" option, we can only work if we don't need to
1927 * read-in the page - otherwise we create buffers to do the IO.
1928 */
1929 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1930 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1931 zero_user(page, offset, length);
1932 set_page_dirty(page);
1933 goto unlock;
1934 }
1935
1936 if (!page_has_buffers(page)) 1970 if (!page_has_buffers(page))
1937 create_empty_buffers(page, blocksize, 0); 1971 create_empty_buffers(page, blocksize, 0);
1938 1972
@@ -2284,27 +2318,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2284 depth); 2318 depth);
2285 2319
2286 /* 2320 /*
2287 * We've probably journalled the indirect block several
2288 * times during the truncate. But it's no longer
2289 * needed and we now drop it from the transaction via
2290 * journal_revoke().
2291 *
2292 * That's easy if it's exclusively part of this
2293 * transaction. But if it's part of the committing
2294 * transaction then journal_forget() will simply
2295 * brelse() it. That means that if the underlying
2296 * block is reallocated in ext3_get_block(),
2297 * unmap_underlying_metadata() will find this block
2298 * and will try to get rid of it. damn, damn.
2299 *
2300 * If this block has already been committed to the
2301 * journal, a revoke record will be written. And
2302 * revoke records must be emitted *before* clearing
2303 * this block's bit in the bitmaps.
2304 */
2305 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2306
2307 /*
2308 * Everything below this pointer has been 2321 * Everything below this pointer has been
2309 * released. Now let this top-of-subtree go. 2322 * released. Now let this top-of-subtree go.
2310 * 2323 *
@@ -2327,6 +2340,31 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2327 truncate_restart_transaction(handle, inode); 2340 truncate_restart_transaction(handle, inode);
2328 } 2341 }
2329 2342
2343 /*
2344 * We've probably journalled the indirect block several
2345 * times during the truncate. But it's no longer
2346 * needed and we now drop it from the transaction via
2347 * journal_revoke().
2348 *
2349 * That's easy if it's exclusively part of this
2350 * transaction. But if it's part of the committing
2351 * transaction then journal_forget() will simply
2352 * brelse() it. That means that if the underlying
2353 * block is reallocated in ext3_get_block(),
2354 * unmap_underlying_metadata() will find this block
2355 * and will try to get rid of it. damn, damn. Thus
2356 * we don't allow a block to be reallocated until
2357 * a transaction freeing it has fully committed.
2358 *
2359 * We also have to make sure journal replay after a
2360 * crash does not overwrite non-journaled data blocks
2361 * with old metadata when the block got reallocated for
2362 * data. Thus we have to store a revoke record for a
2363 * block in the same transaction in which we free the
2364 * block.
2365 */
2366 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2367
2330 ext3_free_blocks(handle, inode, nr, 1); 2368 ext3_free_blocks(handle, inode, nr, 1);
2331 2369
2332 if (parent_bh) { 2370 if (parent_bh) {
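The point of moving the comment block and the ext3_forget() call is ordering: the revoke record is now emitted after any truncate_restart_transaction(), so it lands in the same transaction as the block free. Condensed from the lines above, the tail of the loop body becomes:

                truncate_restart_transaction(handle, inode);
        }

        /* revoke in the same transaction that frees the block */
        ext3_forget(handle, 1, inode, bh, bh->b_blocknr);

        ext3_free_blocks(handle, inode, nr, 1);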
@@ -2497,7 +2535,6 @@ void ext3_truncate(struct inode *inode)
2497 */ 2535 */
2498 } else { 2536 } else {
2499 /* Shared branch grows from an indirect block */ 2537 /* Shared branch grows from an indirect block */
2500 BUFFER_TRACE(partial->bh, "get_write_access");
2501 ext3_free_branches(handle, inode, partial->bh, 2538 ext3_free_branches(handle, inode, partial->bh,
2502 partial->p, 2539 partial->p,
2503 partial->p+1, (chain+n-1) - partial); 2540 partial->p+1, (chain+n-1) - partial);
@@ -2554,7 +2591,7 @@ out_stop:
2554 * If this was a simple ftruncate(), and the file will remain alive 2591 * If this was a simple ftruncate(), and the file will remain alive
2555 * then we need to clear up the orphan record which we created above. 2592 * then we need to clear up the orphan record which we created above.
2556 * However, if this was a real unlink then we were called by 2593 * However, if this was a real unlink then we were called by
2557 * ext3_delete_inode(), and we allow that function to clean up the 2594 * ext3_evict_inode(), and we allow that function to clean up the
2558 * orphan info for us. 2595 * orphan info for us.
2559 */ 2596 */
2560 if (inode->i_nlink) 2597 if (inode->i_nlink)
@@ -3198,9 +3235,17 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3198 ext3_journal_stop(handle); 3235 ext3_journal_stop(handle);
3199 } 3236 }
3200 3237
3201 rc = inode_setattr(inode, attr); 3238 if ((attr->ia_valid & ATTR_SIZE) &&
3239 attr->ia_size != i_size_read(inode)) {
3240 rc = vmtruncate(inode, attr->ia_size);
3241 if (rc)
3242 goto err_out;
3243 }
3244
3245 setattr_copy(inode, attr);
3246 mark_inode_dirty(inode);
3202 3247
3203 if (!rc && (ia_valid & ATTR_MODE)) 3248 if (ia_valid & ATTR_MODE)
3204 rc = ext3_acl_chmod(inode); 3249 rc = ext3_acl_chmod(inode);
3205 3250
3206err_out: 3251err_out:
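With inode_setattr() gone from the VFS, the two things it used to do are now spelled out: an explicit vmtruncate() when the size changes, then setattr_copy() plus mark_inode_dirty() for the remaining attributes. The new-side sequence, transcribed:

        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
                rc = vmtruncate(inode, attr->ia_size);
                if (rc)
                        goto err_out;
        }

        setattr_copy(inode, attr);
        mark_inode_dirty(inode);

        if (ia_valid & ATTR_MODE)
                rc = ext3_acl_chmod(inode);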
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index ee184084ca4..bce9dce639b 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1447,7 +1447,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1447 struct inode *inode) 1447 struct inode *inode)
1448{ 1448{
1449 struct inode *dir = dentry->d_parent->d_inode; 1449 struct inode *dir = dentry->d_parent->d_inode;
1450 unsigned long offset;
1451 struct buffer_head * bh; 1450 struct buffer_head * bh;
1452 struct ext3_dir_entry_2 *de; 1451 struct ext3_dir_entry_2 *de;
1453 struct super_block * sb; 1452 struct super_block * sb;
@@ -1469,7 +1468,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1469 ext3_mark_inode_dirty(handle, dir); 1468 ext3_mark_inode_dirty(handle, dir);
1470 } 1469 }
1471 blocks = dir->i_size >> sb->s_blocksize_bits; 1470 blocks = dir->i_size >> sb->s_blocksize_bits;
1472 for (block = 0, offset = 0; block < blocks; block++) { 1471 for (block = 0; block < blocks; block++) {
1473 bh = ext3_bread(handle, dir, block, 0, &retval); 1472 bh = ext3_bread(handle, dir, block, 0, &retval);
1474 if(!bh) 1473 if(!bh)
1475 return retval; 1474 return retval;
@@ -2261,7 +2260,7 @@ retry:
2261 2260
2262 inode->i_ctime = CURRENT_TIME_SEC; 2261 inode->i_ctime = CURRENT_TIME_SEC;
2263 inc_nlink(inode); 2262 inc_nlink(inode);
2264 atomic_inc(&inode->i_count); 2263 ihold(inode);
2265 2264
2266 err = ext3_add_entry(handle, dentry, inode); 2265 err = ext3_add_entry(handle, dentry, inode);
2267 if (!err) { 2266 if (!err) {
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 54351ac7cef..e746d30b123 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -964,7 +964,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
964 ext3_fsblk_t n_blocks_count) 964 ext3_fsblk_t n_blocks_count)
965{ 965{
966 ext3_fsblk_t o_blocks_count; 966 ext3_fsblk_t o_blocks_count;
967 unsigned long o_groups_count;
968 ext3_grpblk_t last; 967 ext3_grpblk_t last;
969 ext3_grpblk_t add; 968 ext3_grpblk_t add;
970 struct buffer_head * bh; 969 struct buffer_head * bh;
@@ -976,10 +975,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
976 * yet: we're going to revalidate es->s_blocks_count after 975 * yet: we're going to revalidate es->s_blocks_count after
977 * taking the s_resize_lock below. */ 976 * taking the s_resize_lock below. */
978 o_blocks_count = le32_to_cpu(es->s_blocks_count); 977 o_blocks_count = le32_to_cpu(es->s_blocks_count);
979 o_groups_count = EXT3_SB(sb)->s_groups_count;
980 978
981 if (test_opt(sb, DEBUG)) 979 if (test_opt(sb, DEBUG))
982 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", 980 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
981 " upto "E3FSBLK" blocks\n",
983 o_blocks_count, n_blocks_count); 982 o_blocks_count, n_blocks_count);
984 983
985 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 984 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -987,7 +986,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
987 986
988 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 987 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
989 printk(KERN_ERR "EXT3-fs: filesystem on %s:" 988 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
990 " too large to resize to %lu blocks safely\n", 989 " too large to resize to "E3FSBLK" blocks safely\n",
991 sb->s_id, n_blocks_count); 990 sb->s_id, n_blocks_count);
992 if (sizeof(sector_t) < 8) 991 if (sizeof(sector_t) < 8)
993 ext3_warning(sb, __func__, 992 ext3_warning(sb, __func__,
@@ -1067,11 +1066,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1067 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1066 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1068 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1067 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1069 mutex_unlock(&EXT3_SB(sb)->s_resize_lock); 1068 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1070 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, 1069 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
1071 o_blocks_count + add); 1070 o_blocks_count, o_blocks_count + add);
1072 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1071 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1073 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, 1072 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
1074 o_blocks_count + add); 1073 o_blocks_count, o_blocks_count + add);
1075 if ((err = ext3_journal_stop(handle))) 1074 if ((err = ext3_journal_stop(handle)))
1076 goto exit_put; 1075 goto exit_put;
1077 if (test_opt(sb, DEBUG)) 1076 if (test_opt(sb, DEBUG))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6c953bb255e..2fedaf8b501 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -411,9 +411,6 @@ static void ext3_put_super (struct super_block * sb)
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
415 lock_kernel();
416
417 ext3_xattr_put_super(sb); 414 ext3_xattr_put_super(sb);
418 err = journal_destroy(sbi->s_journal); 415 err = journal_destroy(sbi->s_journal);
419 sbi->s_journal = NULL; 416 sbi->s_journal = NULL;
@@ -462,8 +459,6 @@ static void ext3_put_super (struct super_block * sb)
462 sb->s_fs_info = NULL; 459 sb->s_fs_info = NULL;
463 kfree(sbi->s_blockgroup_lock); 460 kfree(sbi->s_blockgroup_lock);
464 kfree(sbi); 461 kfree(sbi);
465
466 unlock_kernel();
467} 462}
468 463
469static struct kmem_cache *ext3_inode_cachep; 464static struct kmem_cache *ext3_inode_cachep;
@@ -527,17 +522,6 @@ static void destroy_inodecache(void)
527 kmem_cache_destroy(ext3_inode_cachep); 522 kmem_cache_destroy(ext3_inode_cachep);
528} 523}
529 524
530static void ext3_clear_inode(struct inode *inode)
531{
532 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
533
534 dquot_drop(inode);
535 ext3_discard_reservation(inode);
536 EXT3_I(inode)->i_block_alloc_info = NULL;
537 if (unlikely(rsv))
538 kfree(rsv);
539}
540
541static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb) 525static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
542{ 526{
543#if defined(CONFIG_QUOTA) 527#if defined(CONFIG_QUOTA)
@@ -661,9 +645,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
661 */ 645 */
662 seq_puts(seq, ",barrier="); 646 seq_puts(seq, ",barrier=");
663 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); 647 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
664 if (test_opt(sb, NOBH))
665 seq_puts(seq, ",nobh");
666
667 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); 648 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
668 if (test_opt(sb, DATA_ERR_ABORT)) 649 if (test_opt(sb, DATA_ERR_ABORT))
669 seq_puts(seq, ",data_err=abort"); 650 seq_puts(seq, ",data_err=abort");
@@ -783,14 +764,13 @@ static const struct super_operations ext3_sops = {
783 .destroy_inode = ext3_destroy_inode, 764 .destroy_inode = ext3_destroy_inode,
784 .write_inode = ext3_write_inode, 765 .write_inode = ext3_write_inode,
785 .dirty_inode = ext3_dirty_inode, 766 .dirty_inode = ext3_dirty_inode,
786 .delete_inode = ext3_delete_inode, 767 .evict_inode = ext3_evict_inode,
787 .put_super = ext3_put_super, 768 .put_super = ext3_put_super,
788 .sync_fs = ext3_sync_fs, 769 .sync_fs = ext3_sync_fs,
789 .freeze_fs = ext3_freeze, 770 .freeze_fs = ext3_freeze,
790 .unfreeze_fs = ext3_unfreeze, 771 .unfreeze_fs = ext3_unfreeze,
791 .statfs = ext3_statfs, 772 .statfs = ext3_statfs,
792 .remount_fs = ext3_remount, 773 .remount_fs = ext3_remount,
793 .clear_inode = ext3_clear_inode,
794 .show_options = ext3_show_options, 774 .show_options = ext3_show_options,
795#ifdef CONFIG_QUOTA 775#ifdef CONFIG_QUOTA
796 .quota_read = ext3_quota_read, 776 .quota_read = ext3_quota_read,
@@ -1255,10 +1235,12 @@ set_qf_format:
1255 *n_blocks_count = option; 1235 *n_blocks_count = option;
1256 break; 1236 break;
1257 case Opt_nobh: 1237 case Opt_nobh:
1258 set_opt(sbi->s_mount_opt, NOBH); 1238 ext3_msg(sb, KERN_WARNING,
1239 "warning: ignoring deprecated nobh option");
1259 break; 1240 break;
1260 case Opt_bh: 1241 case Opt_bh:
1261 clear_opt(sbi->s_mount_opt, NOBH); 1242 ext3_msg(sb, KERN_WARNING,
1243 "warning: ignoring deprecated bh option");
1262 break; 1244 break;
1263 default: 1245 default:
1264 ext3_msg(sb, KERN_ERR, 1246 ext3_msg(sb, KERN_ERR,
@@ -1319,9 +1301,9 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1319 ext3_msg(sb, KERN_WARNING, 1301 ext3_msg(sb, KERN_WARNING,
1320 "warning: mounting fs with errors, " 1302 "warning: mounting fs with errors, "
1321 "running e2fsck is recommended"); 1303 "running e2fsck is recommended");
1322 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1304 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1323 le16_to_cpu(es->s_mnt_count) >= 1305 le16_to_cpu(es->s_mnt_count) >=
1324 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1306 le16_to_cpu(es->s_max_mnt_count))
1325 ext3_msg(sb, KERN_WARNING, 1307 ext3_msg(sb, KERN_WARNING,
1326 "warning: maximal mount count reached, " 1308 "warning: maximal mount count reached, "
1327 "running e2fsck is recommended"); 1309 "running e2fsck is recommended");
@@ -1338,7 +1320,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1338 valid forever! :) */ 1320 valid forever! :) */
1339 es->s_state &= cpu_to_le16(~EXT3_VALID_FS); 1321 es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
1340#endif 1322#endif
1341 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1323 if (!le16_to_cpu(es->s_max_mnt_count))
1342 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); 1324 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
1343 le16_add_cpu(&es->s_mnt_count, 1); 1325 le16_add_cpu(&es->s_mnt_count, 1);
1344 es->s_mtime = cpu_to_le32(get_seconds()); 1326 es->s_mtime = cpu_to_le32(get_seconds());
@@ -1640,8 +1622,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1640 sbi->s_resgid = EXT3_DEF_RESGID; 1622 sbi->s_resgid = EXT3_DEF_RESGID;
1641 sbi->s_sb_block = sb_block; 1623 sbi->s_sb_block = sb_block;
1642 1624
1643 unlock_kernel();
1644
1645 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); 1625 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1646 if (!blocksize) { 1626 if (!blocksize) {
1647 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); 1627 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
@@ -1667,7 +1647,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1667 * Note: s_es must be initialized as soon as possible because 1647 * Note: s_es must be initialized as soon as possible because
1668 * some ext3 macro-instructions depend on its value 1648 * some ext3 macro-instructions depend on its value
1669 */ 1649 */
1670 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); 1650 es = (struct ext3_super_block *) (bh->b_data + offset);
1671 sbi->s_es = es; 1651 sbi->s_es = es;
1672 sb->s_magic = le16_to_cpu(es->s_magic); 1652 sb->s_magic = le16_to_cpu(es->s_magic);
1673 if (sb->s_magic != EXT3_SUPER_MAGIC) 1653 if (sb->s_magic != EXT3_SUPER_MAGIC)
@@ -1778,7 +1758,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1778 "error: can't read superblock on 2nd try"); 1758 "error: can't read superblock on 2nd try");
1779 goto failed_mount; 1759 goto failed_mount;
1780 } 1760 }
1781 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); 1761 es = (struct ext3_super_block *)(bh->b_data + offset);
1782 sbi->s_es = es; 1762 sbi->s_es = es;
1783 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { 1763 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1784 ext3_msg(sb, KERN_ERR, 1764 ext3_msg(sb, KERN_ERR,
@@ -1862,8 +1842,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1862 goto failed_mount; 1842 goto failed_mount;
1863 } 1843 }
1864 1844
1865 if (le32_to_cpu(es->s_blocks_count) > 1845 if (generic_check_addressable(sb->s_blocksize_bits,
1866 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1846 le32_to_cpu(es->s_blocks_count))) {
1867 ext3_msg(sb, KERN_ERR, 1847 ext3_msg(sb, KERN_ERR,
1868 "error: filesystem is too large to mount safely"); 1848 "error: filesystem is too large to mount safely");
1869 if (sizeof(sector_t) < 8) 1849 if (sizeof(sector_t) < 8)
@@ -1877,13 +1857,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1877 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - 1857 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
1878 le32_to_cpu(es->s_first_data_block) - 1) 1858 le32_to_cpu(es->s_first_data_block) - 1)
1879 / EXT3_BLOCKS_PER_GROUP(sb)) + 1; 1859 / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
1880 db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / 1860 db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
1881 EXT3_DESC_PER_BLOCK(sb);
1882 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), 1861 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1883 GFP_KERNEL); 1862 GFP_KERNEL);
1884 if (sbi->s_group_desc == NULL) { 1863 if (sbi->s_group_desc == NULL) {
1885 ext3_msg(sb, KERN_ERR, 1864 ext3_msg(sb, KERN_ERR,
1886 "error: not enough memory"); 1865 "error: not enough memory");
1866 ret = -ENOMEM;
1887 goto failed_mount; 1867 goto failed_mount;
1888 } 1868 }
1889 1869
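DIV_ROUND_UP(n, d) is defined in the kernel as ((n) + (d) - 1) / (d), exactly the open-coded expression it replaces here. A standalone check of the equivalence, using arbitrary sample values:

#include <assert.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned int groups = 1000, per_block = 32;

        assert(DIV_ROUND_UP(groups, per_block) ==
               (groups + per_block - 1) / per_block);
        assert(DIV_ROUND_UP(64, 32) == 2);      /* exact multiple */
        assert(DIV_ROUND_UP(65, 32) == 3);      /* rounds upward  */
        return 0;
}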
@@ -1971,6 +1951,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1971 } 1951 }
1972 if (err) { 1952 if (err) {
1973 ext3_msg(sb, KERN_ERR, "error: insufficient memory"); 1953 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1954 ret = err;
1974 goto failed_mount3; 1955 goto failed_mount3;
1975 } 1956 }
1976 1957
@@ -2001,14 +1982,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2001 break; 1982 break;
2002 } 1983 }
2003 1984
2004 if (test_opt(sb, NOBH)) {
2005 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
2006 ext3_msg(sb, KERN_WARNING,
2007 "warning: ignoring nobh option - "
2008 "it is supported only with writeback mode");
2009 clear_opt(sbi->s_mount_opt, NOBH);
2010 }
2011 }
2012 /* 1985 /*
2013 * The journal_load will have done any necessary log recovery, 1986 * The journal_load will have done any necessary log recovery,
2014 * so we can safely mount the rest of the filesystem now. 1987 * so we can safely mount the rest of the filesystem now.
@@ -2046,7 +2019,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2046 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2019 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
2047 "writeback"); 2020 "writeback");
2048 2021
2049 lock_kernel();
2050 return 0; 2022 return 0;
2051 2023
2052cantfind_ext3: 2024cantfind_ext3:
@@ -2076,7 +2048,6 @@ out_fail:
2076 sb->s_fs_info = NULL; 2048 sb->s_fs_info = NULL;
2077 kfree(sbi->s_blockgroup_lock); 2049 kfree(sbi->s_blockgroup_lock);
2078 kfree(sbi); 2050 kfree(sbi);
2079 lock_kernel();
2080 return ret; 2051 return ret;
2081} 2052}
2082 2053
@@ -2189,7 +2160,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2189 goto out_bdev; 2160 goto out_bdev;
2190 } 2161 }
2191 2162
2192 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); 2163 es = (struct ext3_super_block *) (bh->b_data + offset);
2193 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || 2164 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
2194 !(le32_to_cpu(es->s_feature_incompat) & 2165 !(le32_to_cpu(es->s_feature_incompat) &
2195 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { 2166 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -2382,6 +2353,21 @@ static int ext3_commit_super(struct super_block *sb,
2382 2353
2383 if (!sbh) 2354 if (!sbh)
2384 return error; 2355 return error;
2356
2357 if (buffer_write_io_error(sbh)) {
2358 /*
2359 * Oh, dear. A previous attempt to write the
2360 * superblock failed. This could happen because the
2361 * USB device was yanked out. Or it could happen to
2362 * be a transient write error and maybe the block will
2363 * be remapped. Nothing we can do but to retry the
2364 * write and hope for the best.
2365 */
2366 ext3_msg(sb, KERN_ERR, "previous I/O error to "
2367 "superblock detected");
2368 clear_buffer_write_io_error(sbh);
2369 set_buffer_uptodate(sbh);
2370 }
2385 /* 2371 /*
2386 * If the file system is mounted read-only, don't update the 2372 * If the file system is mounted read-only, don't update the
2387 * superblock write time. This avoids updating the superblock 2373 * superblock write time. This avoids updating the superblock
@@ -2398,8 +2384,15 @@ static int ext3_commit_super(struct super_block *sb,
2398 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); 2384 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2399 BUFFER_TRACE(sbh, "marking dirty"); 2385 BUFFER_TRACE(sbh, "marking dirty");
2400 mark_buffer_dirty(sbh); 2386 mark_buffer_dirty(sbh);
2401 if (sync) 2387 if (sync) {
2402 error = sync_dirty_buffer(sbh); 2388 error = sync_dirty_buffer(sbh);
2389 if (buffer_write_io_error(sbh)) {
2390 ext3_msg(sb, KERN_ERR, "I/O error while writing "
2391 "superblock");
2392 clear_buffer_write_io_error(sbh);
2393 set_buffer_uptodate(sbh);
2394 }
2395 }
2403 return error; 2396 return error;
2404} 2397}
2405 2398
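Both hunks apply the same recovery pattern: a failed superblock write leaves the buffer flagged with a write-IO error and no longer uptodate, which would make later write-outs be skipped or mis-reported, so the error bit is cleared and uptodate re-set before retrying. Condensed from the additions above:

        if (buffer_write_io_error(sbh)) {
                /* report it, then arm the buffer for another attempt */
                clear_buffer_write_io_error(sbh);
                set_buffer_uptodate(sbh);
        }
        mark_buffer_dirty(sbh);
        if (sync)
                error = sync_dirty_buffer(sbh);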
@@ -2559,8 +2552,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2559 int i; 2552 int i;
2560#endif 2553#endif
2561 2554
2562 lock_kernel();
2563
2564 /* Store the original options */ 2555 /* Store the original options */
2565 lock_super(sb); 2556 lock_super(sb);
2566 old_sb_flags = sb->s_flags; 2557 old_sb_flags = sb->s_flags;
@@ -2669,7 +2660,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2669 kfree(old_opts.s_qf_names[i]); 2660 kfree(old_opts.s_qf_names[i]);
2670#endif 2661#endif
2671 unlock_super(sb); 2662 unlock_super(sb);
2672 unlock_kernel();
2673 2663
2674 if (enable_quota) 2664 if (enable_quota)
2675 dquot_resume(sb, -1); 2665 dquot_resume(sb, -1);
@@ -2690,7 +2680,6 @@ restore_opts:
2690 } 2680 }
2691#endif 2681#endif
2692 unlock_super(sb); 2682 unlock_super(sb);
2693 unlock_kernel();
2694 return err; 2683 return err;
2695} 2684}
2696 2685
@@ -3031,16 +3020,16 @@ out:
3031 3020
3032#endif 3021#endif
3033 3022
3034static int ext3_get_sb(struct file_system_type *fs_type, 3023static struct dentry *ext3_mount(struct file_system_type *fs_type,
3035 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3024 int flags, const char *dev_name, void *data)
3036{ 3025{
3037 return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt); 3026 return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
3038} 3027}
3039 3028
3040static struct file_system_type ext3_fs_type = { 3029static struct file_system_type ext3_fs_type = {
3041 .owner = THIS_MODULE, 3030 .owner = THIS_MODULE,
3042 .name = "ext3", 3031 .name = "ext3",
3043 .get_sb = ext3_get_sb, 3032 .mount = ext3_mount,
3044 .kill_sb = kill_block_super, 3033 .kill_sb = kill_block_super,
3045 .fs_flags = FS_REQUIRES_DEV, 3034 .fs_flags = FS_REQUIRES_DEV,
3046}; 3035};
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 71fb8d65e54..e69dc6dfaa8 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -1139,7 +1139,7 @@ ext3_xattr_cache_insert(struct buffer_head *bh)
1139 ea_bdebug(bh, "out of memory"); 1139 ea_bdebug(bh, "out of memory");
1140 return; 1140 return;
1141 } 1141 }
1142 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); 1142 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1143 if (error) { 1143 if (error) {
1144 mb_cache_entry_free(ce); 1144 mb_cache_entry_free(ce);
1145 if (error == -EBUSY) { 1145 if (error == -EBUSY) {
@@ -1211,8 +1211,8 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
1211 return NULL; /* never share */ 1211 return NULL; /* never share */
1212 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1212 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1213again: 1213again:
1214 ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, 1214 ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
1215 inode->i_sb->s_bdev, hash); 1215 hash);
1216 while (ce) { 1216 while (ce) {
1217 struct buffer_head *bh; 1217 struct buffer_head *bh;
1218 1218
@@ -1237,7 +1237,7 @@ again:
1237 return bh; 1237 return bh;
1238 } 1238 }
1239 brelse(bh); 1239 brelse(bh);
1240 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); 1240 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
1241 } 1241 }
1242 return NULL; 1242 return NULL;
1243} 1243}
@@ -1313,9 +1313,7 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *header,
1313int __init 1313int __init
1314init_ext3_xattr(void) 1314init_ext3_xattr(void)
1315{ 1315{
1316 ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, 1316 ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
1317 sizeof(struct mb_cache_entry) +
1318 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1319 if (!ext3_xattr_cache) 1317 if (!ext3_xattr_cache)
1320 return -ENOMEM; 1318 return -ENOMEM;
1321 return 0; 1319 return 0;
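
After the mbcache simplification visible in the hunks above, a cache is created from just a name and a hash-bucket order, and entries are keyed directly by a (bdev, key) pair passed by value, with no per-index argument. A sketch of the resulting call pattern, using only the calls shown above plus mb_cache_entry_alloc(), which is assumed unchanged:

    static struct mb_cache *example_cache;

    static int example_cache_init(void)
    {
            /* name plus bucket order, as in init_ext3_xattr() above */
            example_cache = mb_cache_create("example_xattr", 6);
            return example_cache ? 0 : -ENOMEM;
    }

    static void example_cache_block(struct buffer_head *bh, __u32 hash)
    {
            struct mb_cache_entry *ce;

            ce = mb_cache_entry_alloc(example_cache, GFP_NOFS);
            if (!ce)
                    return;
            /* the key is now passed by value, not via an index array */
            if (mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash))
                    mb_cache_entry_free(ce);    /* e.g. -EBUSY: duplicate */
    }

Lookups follow the same shape, mb_cache_entry_find_first(cache, bdev, hash) then mb_cache_entry_find_next(ce, bdev, hash), exactly as in the ext3_xattr_cache_find() hunk above.
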
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5f..c947e36eda6 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_EXT4_FS) += ext4.o 5obj-$(CONFIG_EXT4_FS) += ext4.o
6 6
7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
10 10
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index feaf498feaa..5e2ed4504ea 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -204,6 +204,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
204 return error; 204 return error;
205 else { 205 else {
206 inode->i_mode = mode; 206 inode->i_mode = mode;
207 inode->i_ctime = ext4_current_time(inode);
207 ext4_mark_inode_dirty(handle, inode); 208 ext4_mark_inode_dirty(handle, inode);
208 if (error == 0) 209 if (error == 0)
209 acl = NULL; 210 acl = NULL;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 95b7594c76f..14c3af26c67 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
171 * less than the blocksize * 8 (which is the size 171 * less than the blocksize * 8 (which is the size
172 * of the bitmap), set the rest of the block bitmap to 1 172 * of the bitmap), set the rest of the block bitmap to 1
173 */ 173 */
174 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); 174 ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
175 bh->b_data);
175 } 176 }
176 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); 177 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
177} 178}
@@ -377,14 +378,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
377 ext4_grpblk_t bit; 378 ext4_grpblk_t bit;
378 unsigned int i; 379 unsigned int i;
379 struct ext4_group_desc *desc; 380 struct ext4_group_desc *desc;
380 struct ext4_super_block *es; 381 struct ext4_sb_info *sbi = EXT4_SB(sb);
381 struct ext4_sb_info *sbi;
382 int err = 0, ret, blk_free_count; 382 int err = 0, ret, blk_free_count;
383 ext4_grpblk_t blocks_freed; 383 ext4_grpblk_t blocks_freed;
384 struct ext4_group_info *grp; 384 struct ext4_group_info *grp;
385 385
386 sbi = EXT4_SB(sb);
387 es = sbi->s_es;
388 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 386 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
389 387
390 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 388 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -477,7 +475,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
477 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 475 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
478 if (!err) 476 if (!err)
479 err = ret; 477 err = ret;
480 sb->s_dirt = 1;
481 478
482error_return: 479error_return:
483 brelse(bitmap_bh); 480 brelse(bitmap_bh);
@@ -493,7 +490,7 @@ error_return:
493 * Check if filesystem has nblocks free & available for allocation. 490 * Check if filesystem has nblocks free & available for allocation.
494 * On success return 1, return 0 on failure. 491 * On success return 1, return 0 on failure.
495 */ 492 */
496int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) 493static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
497{ 494{
498 s64 free_blocks, dirty_blocks, root_blocks; 495 s64 free_blocks, dirty_blocks, root_blocks;
499 struct percpu_counter *fbc = &sbi->s_freeblocks_counter; 496 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 5b6973fbf1b..fac90f3fba8 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,16 +29,15 @@ struct ext4_system_zone {
29 29
30static struct kmem_cache *ext4_system_zone_cachep; 30static struct kmem_cache *ext4_system_zone_cachep;
31 31
32int __init init_ext4_system_zone(void) 32int __init ext4_init_system_zone(void)
33{ 33{
34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
35 SLAB_RECLAIM_ACCOUNT);
36 if (ext4_system_zone_cachep == NULL) 35 if (ext4_system_zone_cachep == NULL)
37 return -ENOMEM; 36 return -ENOMEM;
38 return 0; 37 return 0;
39} 38}
40 39
41void exit_ext4_system_zone(void) 40void ext4_exit_system_zone(void)
42{ 41{
43 kmem_cache_destroy(ext4_system_zone_cachep); 42 kmem_cache_destroy(ext4_system_zone_cachep);
44} 43}
@@ -229,16 +228,20 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
229 228
230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || 229 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
231 (start_blk + count < start_blk) || 230 (start_blk + count < start_blk) ||
232 (start_blk + count > ext4_blocks_count(sbi->s_es))) 231 (start_blk + count > ext4_blocks_count(sbi->s_es))) {
232 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
233 return 0; 233 return 0;
234 }
234 while (n) { 235 while (n) {
235 entry = rb_entry(n, struct ext4_system_zone, node); 236 entry = rb_entry(n, struct ext4_system_zone, node);
236 if (start_blk + count - 1 < entry->start_blk) 237 if (start_blk + count - 1 < entry->start_blk)
237 n = n->rb_left; 238 n = n->rb_left;
238 else if (start_blk >= (entry->start_blk + entry->count)) 239 else if (start_blk >= (entry->start_blk + entry->count))
239 n = n->rb_right; 240 n = n->rb_right;
240 else 241 else {
242 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
241 return 0; 243 return 0;
244 }
242 } 245 }
243 return 1; 246 return 1;
244} 247}
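
Two details in the rejection tests above deserve spelling out: the middle test, start_blk + count < start_blk, catches unsigned wraparound rather than being a typo, and the rb-tree walk treats each system zone as the closed interval [start_blk, start_blk + count - 1], descending left or right until an overlap falls out. A standalone sketch of the wraparound guard:

    /* e.g. start = ~0ULL - 15, count = 32: start + count wraps around
     * to 16, which is smaller than start, so the range is rejected */
    static int range_wraps(ext4_fsblk_t start, ext4_fsblk_t count)
    {
            return start + count < start;
    }
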
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ea5e6cb7e2a..ece76fb6a40 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
39 struct file *filp); 39 struct file *filp);
40 40
41const struct file_operations ext4_dir_operations = { 41const struct file_operations ext4_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = ext4_llseek,
43 .read = generic_read_dir, 43 .read = generic_read_dir,
44 .readdir = ext4_readdir, /* we take BKL. needed?*/ 44 .readdir = ext4_readdir, /* we take BKL. needed?*/
45 .unlocked_ioctl = ext4_ioctl, 45 .unlocked_ioctl = ext4_ioctl,
@@ -61,10 +61,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
61} 61}
62 62
63 63
64int ext4_check_dir_entry(const char *function, struct inode *dir, 64int __ext4_check_dir_entry(const char *function, unsigned int line,
65 struct ext4_dir_entry_2 *de, 65 struct inode *dir,
66 struct buffer_head *bh, 66 struct ext4_dir_entry_2 *de,
67 unsigned int offset) 67 struct buffer_head *bh,
68 unsigned int offset)
68{ 69{
69 const char *error_msg = NULL; 70 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len, 71 const int rlen = ext4_rec_len_from_disk(de->rec_len,
@@ -83,11 +84,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 84 error_msg = "inode out of bounds";
84 85
85 if (error_msg != NULL) 86 if (error_msg != NULL)
86 ext4_error_inode(function, dir, 87 ext4_error_inode(dir, function, line, bh->b_blocknr,
87 "bad entry in directory: %s - block=%llu" 88 "bad entry in directory: %s - "
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 89 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 error_msg, (unsigned long long) bh->b_blocknr, 90 error_msg, (unsigned) (offset%bh->b_size), offset,
90 (unsigned) (offset%bh->b_size), offset,
91 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
92 rlen, de->name_len); 92 rlen, de->name_len);
93 return error_msg == NULL ? 1 : 0; 93 return error_msg == NULL ? 1 : 0;
@@ -121,7 +121,8 @@ static int ext4_readdir(struct file *filp,
121 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
122 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
123 */ 123 */
124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX); 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
125 EXT4_INODE_INDEX);
125 } 126 }
126 stored = 0; 127 stored = 0;
127 offset = filp->f_pos & (sb->s_blocksize - 1); 128 offset = filp->f_pos & (sb->s_blocksize - 1);
@@ -193,7 +194,7 @@ revalidate:
193 while (!error && filp->f_pos < inode->i_size 194 while (!error && filp->f_pos < inode->i_size
194 && offset < sb->s_blocksize) { 195 && offset < sb->s_blocksize) {
195 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 196 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
196 if (!ext4_check_dir_entry("ext4_readdir", inode, de, 197 if (!ext4_check_dir_entry(inode, de,
197 bh, offset)) { 198 bh, offset)) {
198 /* 199 /*
199 * On error, skip the f_pos to the next block 200 * On error, skip the f_pos to the next block
@@ -343,7 +344,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
343 struct dir_private_info *info; 344 struct dir_private_info *info;
344 int len; 345 int len;
345 346
346 info = (struct dir_private_info *) dir_file->private_data; 347 info = dir_file->private_data;
347 p = &info->root.rb_node; 348 p = &info->root.rb_node;
348 349
349 /* Create and allocate the fname structure */ 350 /* Create and allocate the fname structure */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 19a4de57128..8b5dd6369f8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -57,10 +57,13 @@
57#endif 57#endif
58 58
59#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
60 ext4_error_inode(__func__, (inode), (fmt), ## a) 60 ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
61
62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
61 64
62#define EXT4_ERROR_FILE(file, fmt, a...) \ 65#define EXT4_ERROR_FILE(file, fmt, a...) \
63 ext4_error_file(__func__, (file), (fmt), ## a) 66 ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
64 67
65/* data type for block offset of block group */ 68/* data type for block offset of block group */
66typedef int ext4_grpblk_t; 69typedef int ext4_grpblk_t;
@@ -165,17 +168,42 @@ struct mpage_da_data {
165 int pages_written; 168 int pages_written;
166 int retval; 169 int retval;
167}; 170};
168#define EXT4_IO_UNWRITTEN 0x1 171
172/*
173 * Flags for ext4_io_end->flags
174 */
175#define EXT4_IO_END_UNWRITTEN 0x0001
176#define EXT4_IO_END_ERROR 0x0002
177
178struct ext4_io_page {
179 struct page *p_page;
180 int p_count;
181};
182
183#define MAX_IO_PAGES 128
184
169typedef struct ext4_io_end { 185typedef struct ext4_io_end {
170 struct list_head list; /* per-file finished AIO list */ 186 struct list_head list; /* per-file finished IO list */
171 struct inode *inode; /* file being written to */ 187 struct inode *inode; /* file being written to */
172 unsigned int flag; /* unwritten or not */ 188 unsigned int flag; /* unwritten or not */
173 struct page *page; /* page struct for buffer write */ 189 struct page *page; /* page struct for buffer write */
174 loff_t offset; /* offset in the file */ 190 loff_t offset; /* offset in the file */
175 ssize_t size; /* size of the extent */ 191 ssize_t size; /* size of the extent */
176 struct work_struct work; /* data work queue */ 192 struct work_struct work; /* data work queue */
193 struct kiocb *iocb; /* iocb struct for AIO */
194 int result; /* error value for AIO */
195 int num_io_pages;
196 struct ext4_io_page *pages[MAX_IO_PAGES];
177} ext4_io_end_t; 197} ext4_io_end_t;
178 198
199struct ext4_io_submit {
200 int io_op;
201 struct bio *io_bio;
202 ext4_io_end_t *io_end;
203 struct ext4_io_page *io_page;
204 sector_t io_next_block;
205};
206
179/* 207/*
180 * Special inode numbers 208 * Special inode numbers
181 */ 209 */
@@ -200,6 +228,7 @@ typedef struct ext4_io_end {
200#define EXT4_MIN_BLOCK_SIZE 1024 228#define EXT4_MIN_BLOCK_SIZE 1024
201#define EXT4_MAX_BLOCK_SIZE 65536 229#define EXT4_MAX_BLOCK_SIZE 65536
202#define EXT4_MIN_BLOCK_LOG_SIZE 10 230#define EXT4_MIN_BLOCK_LOG_SIZE 10
231#define EXT4_MAX_BLOCK_LOG_SIZE 16
203#ifdef __KERNEL__ 232#ifdef __KERNEL__
204# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) 233# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize)
205#else 234#else
@@ -460,7 +489,7 @@ struct ext4_new_group_data {
460}; 489};
461 490
462/* 491/*
463 * Flags used by ext4_get_blocks() 492 * Flags used by ext4_map_blocks()
464 */ 493 */
465 /* Allocate any needed blocks and/or convert an uninitialized 494 /* Allocate any needed blocks and/or convert an uninitialized
466 extent to be an initialized extent */ 495 extent to be an initialized extent */
@@ -873,7 +902,6 @@ struct ext4_inode_info {
873#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ 902#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
874#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ 903#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
875#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ 904#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
876#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
877#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 905#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
878#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 906#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
879#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 907#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
@@ -885,6 +913,7 @@ struct ext4_inode_info {
885#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 913#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
886#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 914#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
887#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 915#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
916#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
888 917
889#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 918#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
890#define set_opt(o, opt) o |= EXT4_MOUNT_##opt 919#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
@@ -982,7 +1011,7 @@ struct ext4_super_block {
982 __le32 s_last_orphan; /* start of list of inodes to delete */ 1011 __le32 s_last_orphan; /* start of list of inodes to delete */
983 __le32 s_hash_seed[4]; /* HTREE hash seed */ 1012 __le32 s_hash_seed[4]; /* HTREE hash seed */
984 __u8 s_def_hash_version; /* Default hash version to use */ 1013 __u8 s_def_hash_version; /* Default hash version to use */
985 __u8 s_reserved_char_pad; 1014 __u8 s_jnl_backup_type;
986 __le16 s_desc_size; /* size of group descriptor */ 1015 __le16 s_desc_size; /* size of group descriptor */
987/*100*/ __le32 s_default_mount_opts; 1016/*100*/ __le32 s_default_mount_opts;
988 __le32 s_first_meta_bg; /* First metablock block group */ 1017 __le32 s_first_meta_bg; /* First metablock block group */
@@ -1000,12 +1029,34 @@ struct ext4_super_block {
1000 __le64 s_mmp_block; /* Block for multi-mount protection */ 1029 __le64 s_mmp_block; /* Block for multi-mount protection */
1001 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 1030 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
1002 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 1031 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
1003 __u8 s_reserved_char_pad2; 1032 __u8 s_reserved_char_pad;
1004 __le16 s_reserved_pad; 1033 __le16 s_reserved_pad;
1005 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ 1034 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
1006 __u32 s_reserved[160]; /* Padding to the end of the block */ 1035 __le32 s_snapshot_inum; /* Inode number of active snapshot */
1036 __le32 s_snapshot_id; /* sequential ID of active snapshot */
1037 __le64 s_snapshot_r_blocks_count; /* reserved blocks for active
1038 snapshot's future use */
1039 __le32 s_snapshot_list; /* inode number of the head of the
1040 on-disk snapshot list */
1041#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
1042 __le32 s_error_count; /* number of fs errors */
1043 __le32 s_first_error_time; /* first time an error happened */
1044 __le32 s_first_error_ino; /* inode involved in first error */
1045 __le64 s_first_error_block; /* block involved of first error */
1046 __u8 s_first_error_func[32]; /* function where the error happened */
1047 __le32 s_first_error_line; /* line number where error happened */
1048 __le32 s_last_error_time; /* most recent time of an error */
1049 __le32 s_last_error_ino; /* inode involved in last error */
1050 __le32 s_last_error_line; /* line number where error happened */
1051 __le64 s_last_error_block; /* block involved of last error */
1052 __u8 s_last_error_func[32]; /* function where the error happened */
1053#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
1054 __u8 s_mount_opts[64];
1055 __le32 s_reserved[112]; /* Padding to the end of the block */
1007}; 1056};
1008 1057
1058#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
1059
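
EXT4_S_ERR_START and EXT4_S_ERR_END bracket the new error-tracking fields so the whole span can be handled as one region without naming each field; EXT4_S_ERR_LEN just above is its size. A sketch of how such an offsetof-delimited region can be copied between two superblock images (a hypothetical helper, not in this patch):

    static void copy_error_region(struct ext4_super_block *dst,
            const struct ext4_super_block *src)
    {
            memcpy((char *)dst + EXT4_S_ERR_START,
                   (const char *)src + EXT4_S_ERR_START,
                   EXT4_S_ERR_LEN);
    }
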
1009#ifdef __KERNEL__ 1060#ifdef __KERNEL__
1010 1061
1011/* 1062/*
@@ -1061,7 +1112,6 @@ struct ext4_sb_info {
1061 struct completion s_kobj_unregister; 1112 struct completion s_kobj_unregister;
1062 1113
1063 /* Journaling */ 1114 /* Journaling */
1064 struct inode *s_journal_inode;
1065 struct journal_s *s_journal; 1115 struct journal_s *s_journal;
1066 struct list_head s_orphan; 1116 struct list_head s_orphan;
1067 struct mutex s_orphan_lock; 1117 struct mutex s_orphan_lock;
@@ -1094,10 +1144,7 @@ struct ext4_sb_info {
1094 /* for buddy allocator */ 1144 /* for buddy allocator */
1095 struct ext4_group_info ***s_group_info; 1145 struct ext4_group_info ***s_group_info;
1096 struct inode *s_buddy_cache; 1146 struct inode *s_buddy_cache;
1097 long s_blocks_reserved;
1098 spinlock_t s_reserve_lock;
1099 spinlock_t s_md_lock; 1147 spinlock_t s_md_lock;
1100 tid_t s_last_transaction;
1101 unsigned short *s_mb_offsets; 1148 unsigned short *s_mb_offsets;
1102 unsigned int *s_mb_maxs; 1149 unsigned int *s_mb_maxs;
1103 1150
@@ -1115,7 +1162,6 @@ struct ext4_sb_info {
1115 unsigned long s_mb_last_start; 1162 unsigned long s_mb_last_start;
1116 1163
1117 /* stats for buddy allocator */ 1164 /* stats for buddy allocator */
1118 spinlock_t s_mb_pa_lock;
1119 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ 1165 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
1120 atomic_t s_bal_success; /* we found long enough chunks */ 1166 atomic_t s_bal_success; /* we found long enough chunks */
1121 atomic_t s_bal_allocated; /* in blocks */ 1167 atomic_t s_bal_allocated; /* in blocks */
@@ -1143,6 +1189,14 @@ struct ext4_sb_info {
1143 1189
1144 /* workqueue for dio unwritten */ 1190 /* workqueue for dio unwritten */
1145 struct workqueue_struct *dio_unwritten_wq; 1191 struct workqueue_struct *dio_unwritten_wq;
1192
1193 /* timer for periodic error stats printing */
1194 struct timer_list s_err_report;
1195
1196 /* Lazy inode table initialization info */
1197 struct ext4_li_request *s_li_request;
1198 /* Wait multiplier for lazy initialization thread */
1199 unsigned int s_li_wait_mult;
1146}; 1200};
1147 1201
1148static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1202static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1313,6 +1367,10 @@ EXT4_INODE_BIT_FNS(state, state_flags)
1313#define EXT4_DEFM_JMODE_DATA 0x0020 1367#define EXT4_DEFM_JMODE_DATA 0x0020
1314#define EXT4_DEFM_JMODE_ORDERED 0x0040 1368#define EXT4_DEFM_JMODE_ORDERED 0x0040
1315#define EXT4_DEFM_JMODE_WBACK 0x0060 1369#define EXT4_DEFM_JMODE_WBACK 0x0060
1370#define EXT4_DEFM_NOBARRIER 0x0100
1371#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
1372#define EXT4_DEFM_DISCARD 0x0400
1373#define EXT4_DEFM_NODELALLOC 0x0800
1316 1374
1317/* 1375/*
1318 * Default journal batch times 1376 * Default journal batch times
@@ -1379,6 +1437,43 @@ struct ext4_dir_entry_2 {
1379#define EXT4_MAX_REC_LEN ((1<<16)-1) 1437#define EXT4_MAX_REC_LEN ((1<<16)-1)
1380 1438
1381/* 1439/*
1440 * If we ever get support for fs block sizes > page_size, we'll need
1441 * to remove the #if statements in the next two functions...
1442 */
1443static inline unsigned int
1444ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
1445{
1446 unsigned len = le16_to_cpu(dlen);
1447
1448#if (PAGE_CACHE_SIZE >= 65536)
1449 if (len == EXT4_MAX_REC_LEN || len == 0)
1450 return blocksize;
1451 return (len & 65532) | ((len & 3) << 16);
1452#else
1453 return len;
1454#endif
1455}
1456
1457static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
1458{
1459 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
1460 BUG();
1461#if (PAGE_CACHE_SIZE >= 65536)
1462 if (len < 65536)
1463 return cpu_to_le16(len);
1464 if (len == blocksize) {
1465 if (blocksize == 65536)
1466 return cpu_to_le16(EXT4_MAX_REC_LEN);
1467 else
1468 return cpu_to_le16(0);
1469 }
1470 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
1471#else
1472 return cpu_to_le16(len);
1473#endif
1474}
1475
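
The pair of helpers above lets the 16-bit on-disk rec_len describe records in blocks larger than 64 KiB: since rec_len is always a multiple of 4, bits 16-17 of the real length are smuggled into the two low bits, a record filling exactly a 64 KiB block is stored as EXT4_MAX_REC_LEN, and 0 stands for a full block above 64 KiB. A worked round-trip under the PAGE_CACHE_SIZE >= 65536 branch:

    /* a record that fills a 64 KiB block exactly: */
    __le16 disk = ext4_rec_len_to_disk(65536, 65536);   /* EXT4_MAX_REC_LEN */
    unsigned len = ext4_rec_len_from_disk(disk, 65536); /* 65536 again */

    /* bit-packing for a hypothetical length above 64 KiB:
     * to_disk(0x11000)  = (0x11000 & 65532) | ((0x11000 >> 16) & 3) = 0x1001
     * from_disk(0x1001) = (0x1001 & 65532) | ((0x1001 & 3) << 16)  = 0x11000 */
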
1476/*
1382 * Hash Tree Directory indexing 1477 * Hash Tree Directory indexing
1383 * (c) Daniel Phillips, 2001 1478 * (c) Daniel Phillips, 2001
1384 */ 1479 */
@@ -1463,7 +1558,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
1463void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 1558void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
1464 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); 1559 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
1465 1560
1466extern struct proc_dir_entry *ext4_proc_root; 1561/*
1562 * Timeout and state flag for lazy initialization inode thread.
1563 */
1564#define EXT4_DEF_LI_WAIT_MULT 10
1565#define EXT4_DEF_LI_MAX_START_DELAY 5
1566#define EXT4_LAZYINIT_QUIT 0x0001
1567#define EXT4_LAZYINIT_RUNNING 0x0002
1568
1569/*
1570 * Lazy inode table initialization info
1571 */
1572struct ext4_lazy_init {
1573 unsigned long li_state;
1574
1575 wait_queue_head_t li_wait_daemon;
1576 wait_queue_head_t li_wait_task;
1577 struct timer_list li_timer;
1578 struct task_struct *li_task;
1579
1580 struct list_head li_request_list;
1581 struct mutex li_list_mtx;
1582};
1583
1584struct ext4_li_request {
1585 struct super_block *lr_super;
1586 struct ext4_sb_info *lr_sbi;
1587 ext4_group_t lr_next_group;
1588 struct list_head lr_request;
1589 unsigned long lr_next_sched;
1590 unsigned long lr_timeout;
1591};
1592
1593struct ext4_features {
1594 struct kobject f_kobj;
1595 struct completion f_kobj_unregister;
1596};
1467 1597
1468/* 1598/*
1469 * Function prototypes 1599 * Function prototypes
@@ -1491,7 +1621,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
1491extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 1621extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1492 ext4_fsblk_t goal, unsigned long *count, int *errp); 1622 ext4_fsblk_t goal, unsigned long *count, int *errp);
1493extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1623extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1494extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1495extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 1624extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1496 ext4_fsblk_t block, unsigned long count); 1625 ext4_fsblk_t block, unsigned long count);
1497extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1626extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1510,9 +1639,11 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1510 ext4_init_block_bitmap(sb, NULL, group, desc) 1639 ext4_init_block_bitmap(sb, NULL, group, desc)
1511 1640
1512/* dir.c */ 1641/* dir.c */
1513extern int ext4_check_dir_entry(const char *, struct inode *, 1642extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
1514 struct ext4_dir_entry_2 *, 1643 struct ext4_dir_entry_2 *,
1515 struct buffer_head *, unsigned int); 1644 struct buffer_head *, unsigned int);
1645#define ext4_check_dir_entry(dir, de, bh, offset) \
1646 __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
1516extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1647extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1517 __u32 minor_hash, 1648 __u32 minor_hash,
1518 struct ext4_dir_entry_2 *dirent); 1649 struct ext4_dir_entry_2 *dirent);
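
__ext4_check_dir_entry() and its wrapper macro above follow a pattern used throughout this patch (likewise for ext4_error, ext4_warning, ext4_abort and the journal wrappers further down): the public name becomes a macro that captures __func__ and __LINE__ at each call site and forwards them to a double-underscore helper, so error reports pinpoint the caller without editing every call. A generic sketch of the pattern (the foo names are illustrative):

    /* the helper receives the caller's identity explicitly ... */
    static int __foo_check_entry(const char *function, unsigned int line,
            struct inode *dir)
    {
            if (dir->i_nlink == 0) {
                    printk(KERN_ERR "bad entry in dir (from %s:%u)\n",
                           function, line);
                    return 0;
            }
            return 1;
    }

    /* ... while the old name becomes a macro, so existing call sites
     * automatically report their own position */
    #define foo_check_entry(dir) \
            __foo_check_entry(__func__, __LINE__, (dir))
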
@@ -1533,11 +1664,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1533extern unsigned long ext4_count_free_inodes(struct super_block *); 1664extern unsigned long ext4_count_free_inodes(struct super_block *);
1534extern unsigned long ext4_count_dirs(struct super_block *); 1665extern unsigned long ext4_count_dirs(struct super_block *);
1535extern void ext4_check_inodes_bitmap(struct super_block *); 1666extern void ext4_check_inodes_bitmap(struct super_block *);
1536extern unsigned ext4_init_inode_bitmap(struct super_block *sb, 1667extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
1537 struct buffer_head *bh, 1668extern int ext4_init_inode_table(struct super_block *sb,
1538 ext4_group_t group, 1669 ext4_group_t group, int barrier);
1539 struct ext4_group_desc *desc);
1540extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
1541 1670
1542/* mballoc.c */ 1671/* mballoc.c */
1543extern long ext4_mb_stats; 1672extern long ext4_mb_stats;
@@ -1548,16 +1677,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
1548 struct ext4_allocation_request *, int *); 1677 struct ext4_allocation_request *, int *);
1549extern int ext4_mb_reserve_blocks(struct super_block *, int); 1678extern int ext4_mb_reserve_blocks(struct super_block *, int);
1550extern void ext4_discard_preallocations(struct inode *); 1679extern void ext4_discard_preallocations(struct inode *);
1551extern int __init init_ext4_mballoc(void); 1680extern int __init ext4_init_mballoc(void);
1552extern void exit_ext4_mballoc(void); 1681extern void ext4_exit_mballoc(void);
1553extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1682extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1554 struct buffer_head *bh, ext4_fsblk_t block, 1683 struct buffer_head *bh, ext4_fsblk_t block,
1555 unsigned long count, int flags); 1684 unsigned long count, int flags);
1556extern int ext4_mb_add_groupinfo(struct super_block *sb, 1685extern int ext4_mb_add_groupinfo(struct super_block *sb,
1557 ext4_group_t i, struct ext4_group_desc *desc); 1686 ext4_group_t i, struct ext4_group_desc *desc);
1558extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1687extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
1559extern void ext4_mb_put_buddy_cache_lock(struct super_block *, 1688
1560 ext4_group_t, int);
1561/* inode.c */ 1689/* inode.c */
1562struct buffer_head *ext4_getblk(handle_t *, struct inode *, 1690struct buffer_head *ext4_getblk(handle_t *, struct inode *,
1563 ext4_lblk_t, int, int *); 1691 ext4_lblk_t, int, int *);
@@ -1571,7 +1699,8 @@ extern int ext4_write_inode(struct inode *, struct writeback_control *);
1571extern int ext4_setattr(struct dentry *, struct iattr *); 1699extern int ext4_setattr(struct dentry *, struct iattr *);
1572extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1700extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1573 struct kstat *stat); 1701 struct kstat *stat);
1574extern void ext4_delete_inode(struct inode *); 1702extern void ext4_evict_inode(struct inode *);
1703extern void ext4_clear_inode(struct inode *);
1575extern int ext4_sync_inode(handle_t *, struct inode *); 1704extern int ext4_sync_inode(handle_t *, struct inode *);
1576extern void ext4_dirty_inode(struct inode *); 1705extern void ext4_dirty_inode(struct inode *);
1577extern int ext4_change_inode_journal_flag(struct inode *, int); 1706extern int ext4_change_inode_journal_flag(struct inode *, int);
@@ -1584,13 +1713,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *);
1584extern int ext4_alloc_da_blocks(struct inode *inode); 1713extern int ext4_alloc_da_blocks(struct inode *inode);
1585extern void ext4_set_aops(struct inode *inode); 1714extern void ext4_set_aops(struct inode *inode);
1586extern int ext4_writepage_trans_blocks(struct inode *); 1715extern int ext4_writepage_trans_blocks(struct inode *);
1587extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1588extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 1716extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1589extern int ext4_block_truncate_page(handle_t *handle, 1717extern int ext4_block_truncate_page(handle_t *handle,
1590 struct address_space *mapping, loff_t from); 1718 struct address_space *mapping, loff_t from);
1591extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1719extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1592extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1720extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1593extern int flush_completed_IO(struct inode *inode);
1594extern void ext4_da_update_reserve_space(struct inode *inode, 1721extern void ext4_da_update_reserve_space(struct inode *inode,
1595 int used, int quota_claim); 1722 int used, int quota_claim);
1596/* ioctl.c */ 1723/* ioctl.c */
@@ -1601,8 +1728,6 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1601extern int ext4_ext_migrate(struct inode *); 1728extern int ext4_ext_migrate(struct inode *);
1602 1729
1603/* namei.c */ 1730/* namei.c */
1604extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
1605extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
1606extern int ext4_orphan_add(handle_t *, struct inode *); 1731extern int ext4_orphan_add(handle_t *, struct inode *);
1607extern int ext4_orphan_del(handle_t *, struct inode *); 1732extern int ext4_orphan_del(handle_t *, struct inode *);
1608extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 1733extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1616,25 +1741,38 @@ extern int ext4_group_extend(struct super_block *sb,
1616 ext4_fsblk_t n_blocks_count); 1741 ext4_fsblk_t n_blocks_count);
1617 1742
1618/* super.c */ 1743/* super.c */
1619extern void __ext4_error(struct super_block *, const char *, const char *, ...) 1744extern void __ext4_error(struct super_block *, const char *, unsigned int,
1620 __attribute__ ((format (printf, 3, 4))); 1745 const char *, ...)
1621#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) 1746 __attribute__ ((format (printf, 4, 5)));
1622extern void ext4_error_inode(const char *, struct inode *, const char *, ...) 1747#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
1623 __attribute__ ((format (printf, 3, 4))); 1748 __LINE__, ## message)
1624extern void ext4_error_file(const char *, struct file *, const char *, ...) 1749extern void ext4_error_inode(struct inode *, const char *, unsigned int,
1625 __attribute__ ((format (printf, 3, 4))); 1750 ext4_fsblk_t, const char *, ...)
1626extern void __ext4_std_error(struct super_block *, const char *, int); 1751 __attribute__ ((format (printf, 5, 6)));
1627extern void ext4_abort(struct super_block *, const char *, const char *, ...) 1752extern void ext4_error_file(struct file *, const char *, unsigned int,
1628 __attribute__ ((format (printf, 3, 4))); 1753 const char *, ...)
1629extern void __ext4_warning(struct super_block *, const char *, 1754 __attribute__ ((format (printf, 4, 5)));
1755extern void __ext4_std_error(struct super_block *, const char *,
1756 unsigned int, int);
1757extern void __ext4_abort(struct super_block *, const char *, unsigned int,
1758 const char *, ...)
1759 __attribute__ ((format (printf, 4, 5)));
1760#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
1761 __LINE__, ## message)
1762extern void __ext4_warning(struct super_block *, const char *, unsigned int,
1630 const char *, ...) 1763 const char *, ...)
1631 __attribute__ ((format (printf, 3, 4))); 1764 __attribute__ ((format (printf, 4, 5)));
1632#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) 1765#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
1766 __LINE__, ## message)
1633extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1767extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1634 __attribute__ ((format (printf, 3, 4))); 1768 __attribute__ ((format (printf, 3, 4)));
1635extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1769extern void __ext4_grp_locked_error(const char *, unsigned int, \
1636 const char *, const char *, ...) 1770 struct super_block *, ext4_group_t, \
1637 __attribute__ ((format (printf, 4, 5))); 1771 unsigned long, ext4_fsblk_t, \
1772 const char *, ...)
1773 __attribute__ ((format (printf, 7, 8)));
1774#define ext4_grp_locked_error(sb, grp, message...) \
1775 __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
1638extern void ext4_update_dynamic_rev(struct super_block *sb); 1776extern void ext4_update_dynamic_rev(struct super_block *sb);
1639extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1777extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1640 __u32 compat); 1778 __u32 compat);
@@ -1768,7 +1906,7 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1768#define ext4_std_error(sb, errno) \ 1906#define ext4_std_error(sb, errno) \
1769do { \ 1907do { \
1770 if ((errno)) \ 1908 if ((errno)) \
1771 __ext4_std_error((sb), __func__, (errno)); \ 1909 __ext4_std_error((sb), __func__, __LINE__, (errno)); \
1772} while (0) 1910} while (0)
1773 1911
1774#ifdef CONFIG_SMP 1912#ifdef CONFIG_SMP
@@ -1860,6 +1998,12 @@ static inline void ext4_unlock_group(struct super_block *sb,
1860 spin_unlock(ext4_group_lock_ptr(sb, group)); 1998 spin_unlock(ext4_group_lock_ptr(sb, group));
1861} 1999}
1862 2000
2001static inline void ext4_mark_super_dirty(struct super_block *sb)
2002{
2003 if (EXT4_SB(sb)->s_journal == NULL)
 2004 sb->s_dirt = 1;
2005}
2006
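
ext4_mark_super_dirty() isolates the no-journal case: with a journal present, superblock updates ride inside transactions (see ext4_handle_dirty_super() introduced later in this patch), so s_dirt only needs to be set when s_journal is NULL. A sketch of a typical call site replacing a bare sb->s_dirt = 1 (the surrounding counter update is illustrative):

    percpu_counter_add(&EXT4_SB(sb)->s_freeblocks_counter, blocks_freed);
    ext4_mark_super_dirty(sb);      /* no-op when the journal handles it */
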
1863/* 2007/*
1864 * Inodes and files operations 2008 * Inodes and files operations
1865 */ 2009 */
@@ -1870,6 +2014,7 @@ extern const struct file_operations ext4_dir_operations;
1870/* file.c */ 2014/* file.c */
1871extern const struct inode_operations ext4_file_inode_operations; 2015extern const struct inode_operations ext4_file_inode_operations;
1872extern const struct file_operations ext4_file_operations; 2016extern const struct file_operations ext4_file_operations;
2017extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
1873 2018
1874/* namei.c */ 2019/* namei.c */
1875extern const struct inode_operations ext4_dir_inode_operations; 2020extern const struct inode_operations ext4_dir_inode_operations;
@@ -1883,8 +2028,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1883/* block_validity */ 2028/* block_validity */
1884extern void ext4_release_system_zone(struct super_block *sb); 2029extern void ext4_release_system_zone(struct super_block *sb);
1885extern int ext4_setup_system_zone(struct super_block *sb); 2030extern int ext4_setup_system_zone(struct super_block *sb);
1886extern int __init init_ext4_system_zone(void); 2031extern int __init ext4_init_system_zone(void);
1887extern void exit_ext4_system_zone(void); 2032extern void ext4_exit_system_zone(void);
1888extern int ext4_data_block_valid(struct ext4_sb_info *sbi, 2033extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
1889 ext4_fsblk_t start_blk, 2034 ext4_fsblk_t start_blk,
1890 unsigned int count); 2035 unsigned int count);
@@ -1905,9 +2050,6 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1905 ssize_t len); 2050 ssize_t len);
1906extern int ext4_map_blocks(handle_t *handle, struct inode *inode, 2051extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1907 struct ext4_map_blocks *map, int flags); 2052 struct ext4_map_blocks *map, int flags);
1908extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1909 sector_t block, unsigned int max_blocks,
1910 struct buffer_head *bh, int flags);
1911extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2053extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1912 __u64 start, __u64 len); 2054 __u64 start, __u64 len);
1913/* move_extent.c */ 2055/* move_extent.c */
@@ -1915,6 +2057,17 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
1915 __u64 start_orig, __u64 start_donor, 2057 __u64 start_orig, __u64 start_donor,
1916 __u64 len, __u64 *moved_len); 2058 __u64 len, __u64 *moved_len);
1917 2059
2060/* page-io.c */
2061extern int __init ext4_init_pageio(void);
2062extern void ext4_exit_pageio(void);
2063extern void ext4_free_io_end(ext4_io_end_t *io);
2064extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2065extern int ext4_end_io_nolock(ext4_io_end_t *io);
2066extern void ext4_io_submit(struct ext4_io_submit *io);
2067extern int ext4_bio_write_page(struct ext4_io_submit *io,
2068 struct page *page,
2069 int len,
2070 struct writeback_control *wbc);
1918 2071
1919/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2072/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
1920enum ext4_state_bits { 2073enum ext4_state_bits {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb..28ce70fd9cd 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,11 +225,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); 225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226} 226}
227 227
228/*
229 * ext4_ext_pblock:
230 * combine low and high parts of physical block number into ext4_fsblk_t
231 */
232static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
233{
234 ext4_fsblk_t block;
235
236 block = le32_to_cpu(ex->ee_start_lo);
237 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
238 return block;
239}
240
241/*
242 * ext4_idx_pblock:
243 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
244 */
245static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
246{
247 ext4_fsblk_t block;
248
249 block = le32_to_cpu(ix->ei_leaf_lo);
250 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
251 return block;
252}
253
254/*
255 * ext4_ext_store_pblock:
256 * stores a large physical block number into an extent struct,
257 * breaking it into parts
258 */
259static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
260 ext4_fsblk_t pb)
261{
262 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
263 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
264 0xffff);
265}
266
267/*
268 * ext4_idx_store_pblock:
269 * stores a large physical block number into an index struct,
270 * breaking it into parts
271 */
272static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
273 ext4_fsblk_t pb)
274{
275 ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
276 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
277 0xffff);
278}
279
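
The four helpers above split a 48-bit physical block number across a 32-bit low field and a 16-bit high field; the (pb >> 31) >> 1 and << 31 << 1 double shifts sidestep an undefined shift-by-32 on configurations where ext4_fsblk_t is only 32 bits wide. A round-trip sketch:

    struct ext4_extent ex;
    ext4_fsblk_t pb = 0x123456789abcULL;    /* any 48-bit block number */
    ext4_fsblk_t back;

    ext4_ext_store_pblock(&ex, pb);
    /* ee_start_lo = cpu_to_le32(0x56789abc), ee_start_hi = cpu_to_le16(0x1234) */
    back = ext4_ext_pblock(&ex);            /* == 0x123456789abc again */
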
228extern int ext4_ext_calc_metadata_amount(struct inode *inode, 280extern int ext4_ext_calc_metadata_amount(struct inode *inode,
229 sector_t lblocks); 281 sector_t lblocks);
230extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
231extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
232extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
233extern int ext4_extent_tree_init(handle_t *, struct inode *); 282extern int ext4_extent_tree_init(handle_t *, struct inode *);
234extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, 283extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
235 int num, 284 int num,
@@ -237,19 +286,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
237extern int ext4_can_extents_be_merged(struct inode *inode, 286extern int ext4_can_extents_be_merged(struct inode *inode,
238 struct ext4_extent *ex1, 287 struct ext4_extent *ex1,
239 struct ext4_extent *ex2); 288 struct ext4_extent *ex2);
240extern int ext4_ext_try_to_merge(struct inode *inode,
241 struct ext4_ext_path *path,
242 struct ext4_extent *);
243extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
244extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); 289extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
245extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
246 ext_prepare_callback, void *);
247extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 290extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
248 struct ext4_ext_path *); 291 struct ext4_ext_path *);
249extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
250 ext4_lblk_t *, ext4_fsblk_t *);
251extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
252 ext4_lblk_t *, ext4_fsblk_t *);
253extern void ext4_ext_drop_refs(struct ext4_ext_path *); 292extern void ext4_ext_drop_refs(struct ext4_ext_path *);
254extern int ext4_ext_check_inode(struct inode *inode); 293extern int ext4_ext_check_inode(struct inode *inode);
255#endif /* _EXT4_EXTENTS */ 294#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 53d2764d71c..6e272ef6ba9 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,29 +6,29 @@
6 6
7#include <trace/events/ext4.h> 7#include <trace/events/ext4.h>
8 8
9int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 9int __ext4_journal_get_undo_access(const char *where, unsigned int line,
10 struct buffer_head *bh) 10 handle_t *handle, struct buffer_head *bh)
11{ 11{
12 int err = 0; 12 int err = 0;
13 13
14 if (ext4_handle_valid(handle)) { 14 if (ext4_handle_valid(handle)) {
15 err = jbd2_journal_get_undo_access(handle, bh); 15 err = jbd2_journal_get_undo_access(handle, bh);
16 if (err) 16 if (err)
17 ext4_journal_abort_handle(where, __func__, bh, 17 ext4_journal_abort_handle(where, line, __func__, bh,
18 handle, err); 18 handle, err);
19 } 19 }
20 return err; 20 return err;
21} 21}
22 22
23int __ext4_journal_get_write_access(const char *where, handle_t *handle, 23int __ext4_journal_get_write_access(const char *where, unsigned int line,
24 struct buffer_head *bh) 24 handle_t *handle, struct buffer_head *bh)
25{ 25{
26 int err = 0; 26 int err = 0;
27 27
28 if (ext4_handle_valid(handle)) { 28 if (ext4_handle_valid(handle)) {
29 err = jbd2_journal_get_write_access(handle, bh); 29 err = jbd2_journal_get_write_access(handle, bh);
30 if (err) 30 if (err)
31 ext4_journal_abort_handle(where, __func__, bh, 31 ext4_journal_abort_handle(where, line, __func__, bh,
32 handle, err); 32 handle, err);
33 } 33 }
34 return err; 34 return err;
@@ -46,9 +46,9 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
46 * If the handle isn't valid we're not journaling, but we still need to 46 * If the handle isn't valid we're not journaling, but we still need to
47 * call into ext4_journal_revoke() to put the buffer head. 47 * call into ext4_journal_revoke() to put the buffer head.
48 */ 48 */
49int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 49int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
50 struct inode *inode, struct buffer_head *bh, 50 int is_metadata, struct inode *inode,
51 ext4_fsblk_t blocknr) 51 struct buffer_head *bh, ext4_fsblk_t blocknr)
52{ 52{
53 int err; 53 int err;
54 54
@@ -79,8 +79,8 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
79 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 79 BUFFER_TRACE(bh, "call jbd2_journal_forget");
80 err = jbd2_journal_forget(handle, bh); 80 err = jbd2_journal_forget(handle, bh);
81 if (err) 81 if (err)
82 ext4_journal_abort_handle(where, __func__, bh, 82 ext4_journal_abort_handle(where, line, __func__,
83 handle, err); 83 bh, handle, err);
84 return err; 84 return err;
85 } 85 }
86 return 0; 86 return 0;
@@ -92,15 +92,16 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
92 BUFFER_TRACE(bh, "call jbd2_journal_revoke"); 92 BUFFER_TRACE(bh, "call jbd2_journal_revoke");
93 err = jbd2_journal_revoke(handle, blocknr, bh); 93 err = jbd2_journal_revoke(handle, blocknr, bh);
94 if (err) { 94 if (err) {
95 ext4_journal_abort_handle(where, __func__, bh, handle, err); 95 ext4_journal_abort_handle(where, line, __func__,
96 ext4_abort(inode->i_sb, __func__, 96 bh, handle, err);
97 __ext4_abort(inode->i_sb, where, line,
97 "error %d when attempting revoke", err); 98 "error %d when attempting revoke", err);
98 } 99 }
99 BUFFER_TRACE(bh, "exit"); 100 BUFFER_TRACE(bh, "exit");
100 return err; 101 return err;
101} 102}
102 103
103int __ext4_journal_get_create_access(const char *where, 104int __ext4_journal_get_create_access(const char *where, unsigned int line,
104 handle_t *handle, struct buffer_head *bh) 105 handle_t *handle, struct buffer_head *bh)
105{ 106{
106 int err = 0; 107 int err = 0;
@@ -108,22 +109,23 @@ int __ext4_journal_get_create_access(const char *where,
108 if (ext4_handle_valid(handle)) { 109 if (ext4_handle_valid(handle)) {
109 err = jbd2_journal_get_create_access(handle, bh); 110 err = jbd2_journal_get_create_access(handle, bh);
110 if (err) 111 if (err)
111 ext4_journal_abort_handle(where, __func__, bh, 112 ext4_journal_abort_handle(where, line, __func__,
112 handle, err); 113 bh, handle, err);
113 } 114 }
114 return err; 115 return err;
115} 116}
116 117
117int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, 118int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
118 struct inode *inode, struct buffer_head *bh) 119 handle_t *handle, struct inode *inode,
120 struct buffer_head *bh)
119{ 121{
120 int err = 0; 122 int err = 0;
121 123
122 if (ext4_handle_valid(handle)) { 124 if (ext4_handle_valid(handle)) {
123 err = jbd2_journal_dirty_metadata(handle, bh); 125 err = jbd2_journal_dirty_metadata(handle, bh);
124 if (err) 126 if (err)
125 ext4_journal_abort_handle(where, __func__, bh, 127 ext4_journal_abort_handle(where, line, __func__,
126 handle, err); 128 bh, handle, err);
127 } else { 129 } else {
128 if (inode) 130 if (inode)
129 mark_buffer_dirty_inode(bh, inode); 131 mark_buffer_dirty_inode(bh, inode);
@@ -132,14 +134,33 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
132 if (inode && inode_needs_sync(inode)) { 134 if (inode && inode_needs_sync(inode)) {
133 sync_dirty_buffer(bh); 135 sync_dirty_buffer(bh);
134 if (buffer_req(bh) && !buffer_uptodate(bh)) { 136 if (buffer_req(bh) && !buffer_uptodate(bh)) {
135 ext4_error(inode->i_sb, 137 struct ext4_super_block *es;
136 "IO error syncing inode, " 138
137 "inode=%lu, block=%llu", 139 es = EXT4_SB(inode->i_sb)->s_es;
138 inode->i_ino, 140 es->s_last_error_block =
139 (unsigned long long) bh->b_blocknr); 141 cpu_to_le64(bh->b_blocknr);
142 ext4_error_inode(inode, where, line,
143 bh->b_blocknr,
144 "IO error syncing itable block");
140 err = -EIO; 145 err = -EIO;
141 } 146 }
142 } 147 }
143 } 148 }
144 return err; 149 return err;
145} 150}
151
152int __ext4_handle_dirty_super(const char *where, unsigned int line,
153 handle_t *handle, struct super_block *sb)
154{
155 struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
156 int err = 0;
157
158 if (ext4_handle_valid(handle)) {
159 err = jbd2_journal_dirty_metadata(handle, bh);
160 if (err)
161 ext4_journal_abort_handle(where, line, __func__,
162 bh, handle, err);
163 } else
164 sb->s_dirt = 1;
165 return err;
166}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index dade0c02479..b0bd792c58c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,39 +122,47 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
122/* 122/*
123 * Wrapper functions with which ext4 calls into JBD. 123 * Wrapper functions with which ext4 calls into JBD.
124 */ 124 */
125void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, unsigned int line,
126 const char *err_fn,
126 struct buffer_head *bh, handle_t *handle, int err); 127 struct buffer_head *bh, handle_t *handle, int err);
127 128
128int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 129int __ext4_journal_get_undo_access(const char *where, unsigned int line,
129 struct buffer_head *bh); 130 handle_t *handle, struct buffer_head *bh);
130 131
131int __ext4_journal_get_write_access(const char *where, handle_t *handle, 132int __ext4_journal_get_write_access(const char *where, unsigned int line,
132 struct buffer_head *bh); 133 handle_t *handle, struct buffer_head *bh);
133 134
134int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 135int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
135 struct inode *inode, struct buffer_head *bh, 136 int is_metadata, struct inode *inode,
136 ext4_fsblk_t blocknr); 137 struct buffer_head *bh, ext4_fsblk_t blocknr);
137 138
138int __ext4_journal_get_create_access(const char *where, 139int __ext4_journal_get_create_access(const char *where, unsigned int line,
139 handle_t *handle, struct buffer_head *bh); 140 handle_t *handle, struct buffer_head *bh);
140 141
141int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, 142int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
142 struct inode *inode, struct buffer_head *bh); 143 handle_t *handle, struct inode *inode,
144 struct buffer_head *bh);
145
146int __ext4_handle_dirty_super(const char *where, unsigned int line,
147 handle_t *handle, struct super_block *sb);
143 148
144#define ext4_journal_get_undo_access(handle, bh) \ 149#define ext4_journal_get_undo_access(handle, bh) \
145 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 150 __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
146#define ext4_journal_get_write_access(handle, bh) \ 151#define ext4_journal_get_write_access(handle, bh) \
147 __ext4_journal_get_write_access(__func__, (handle), (bh)) 152 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 153#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
149 __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ 154 __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
150 (block_nr)) 155 (bh), (block_nr))
151#define ext4_journal_get_create_access(handle, bh) \ 156#define ext4_journal_get_create_access(handle, bh) \
152 __ext4_journal_get_create_access(__func__, (handle), (bh)) 157 __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
153#define ext4_handle_dirty_metadata(handle, inode, bh) \ 158#define ext4_handle_dirty_metadata(handle, inode, bh) \
154 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 159 __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
160 (bh))
161#define ext4_handle_dirty_super(handle, sb) \
162 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
155 163
156handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 164handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
157int __ext4_journal_stop(const char *where, handle_t *handle); 165int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
158 166
159#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) 167#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
160 168
@@ -207,7 +215,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
207} 215}
208 216
209#define ext4_journal_stop(handle) \ 217#define ext4_journal_stop(handle) \
210 __ext4_journal_stop(__func__, (handle)) 218 __ext4_journal_stop(__func__, __LINE__, (handle))
211 219
212static inline handle_t *ext4_journal_current_handle(void) 220static inline handle_t *ext4_journal_current_handle(void)
213{ 221{
@@ -308,17 +316,15 @@ static inline int ext4_should_writeback_data(struct inode *inode)
308 * This function controls whether or not we should try to go down the 316 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking 317 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based 318 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is 319 * files, and it doesn't work if data journaling is enabled, since the
312 * enabled, since the dioread_nolock code uses b_private to pass 320 * dioread_nolock code uses b_private to pass information back to the
313 * information back to the I/O completion handler, and this conflicts 321 * I/O completion handler, and this conflicts with the jbd's use of
314 * with the jbd's use of b_private. 322 * b_private.
315 */ 323 */
316static inline int ext4_should_dioread_nolock(struct inode *inode) 324static inline int ext4_should_dioread_nolock(struct inode *inode)
317{ 325{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) 326 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0; 327 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode)) 328 if (!S_ISREG(inode->i_mode))
323 return 0; 329 return 0;
324 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 330 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 377309c1af6..0554c48cb1f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,55 +44,6 @@
44#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
45#include "ext4_extents.h" 45#include "ext4_extents.h"
46 46
47
48/*
49 * ext_pblock:
50 * combine low and high parts of physical block number into ext4_fsblk_t
51 */
52ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
53{
54 ext4_fsblk_t block;
55
56 block = le32_to_cpu(ex->ee_start_lo);
57 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
58 return block;
59}
60
61/*
62 * idx_pblock:
63 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
64 */
65ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
66{
67 ext4_fsblk_t block;
68
69 block = le32_to_cpu(ix->ei_leaf_lo);
70 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
71 return block;
72}
73
74/*
75 * ext4_ext_store_pblock:
76 * stores a large physical block number into an extent struct,
77 * breaking it into parts
78 */
79void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
80{
81 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
82 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
83}
84
85/*
86 * ext4_idx_store_pblock:
87 * stores a large physical block number into an index struct,
88 * breaking it into parts
89 */
90static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91{
92 ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94}
95
96static int ext4_ext_truncate_extend_restart(handle_t *handle, 47static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode, 48 struct inode *inode,
98 int needed) 49 int needed)
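Context for the renames that follow: an ext4 extent stores a 48-bit physical block number split into a 32-bit low word (ee_start_lo) and a 16-bit high word (ee_start_hi), and the helpers deleted above (now provided as ext4_ext_pblock()/ext4_idx_pblock() elsewhere in the series) simply recombine the halves. A hedged userspace sketch of the same arithmetic, with the endianness conversions omitted:

#include <assert.h>
#include <stdint.h>

/* Combine a 16-bit high part and a 32-bit low part into one block number.
 * Shifting by 31 and then by 1 mirrors the kernel's way of shifting by 32
 * without tripping over shift-width rules on 32-bit types. */
static uint64_t demo_pblock(uint32_t lo, uint16_t hi)
{
        return (uint64_t)lo | (((uint64_t)hi << 31) << 1);
}

static void demo_store_pblock(uint64_t pb, uint32_t *lo, uint16_t *hi)
{
        *lo = (uint32_t)(pb & 0xffffffff);
        *hi = (uint16_t)(((pb >> 31) >> 1) & 0xffff);
}

int main(void)
{
        uint32_t lo;
        uint16_t hi;

        demo_store_pblock(0x123456789abULL, &lo, &hi);
        assert(demo_pblock(lo, hi) == 0x123456789abULL);
        return 0;
}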
@@ -169,7 +120,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
169 /* try to predict block placement */ 120 /* try to predict block placement */
170 ex = path[depth].p_ext; 121 ex = path[depth].p_ext;
171 if (ex) 122 if (ex)
172 return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); 123 return (ext4_ext_pblock(ex) +
124 (block - le32_to_cpu(ex->ee_block)));
173 125
174 /* it looks like index is empty; 126 /* it looks like index is empty;
175 * try to find starting block from index itself */ 127 * try to find starting block from index itself */
@@ -354,7 +306,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
354 306
355static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 307static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
356{ 308{
357 ext4_fsblk_t block = ext_pblock(ext); 309 ext4_fsblk_t block = ext4_ext_pblock(ext);
358 int len = ext4_ext_get_actual_len(ext); 310 int len = ext4_ext_get_actual_len(ext);
359 311
360 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); 312 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +315,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
363static int ext4_valid_extent_idx(struct inode *inode, 315static int ext4_valid_extent_idx(struct inode *inode,
364 struct ext4_extent_idx *ext_idx) 316 struct ext4_extent_idx *ext_idx)
365{ 317{
366 ext4_fsblk_t block = idx_pblock(ext_idx); 318 ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
367 319
368 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); 320 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
369} 321}
@@ -401,9 +353,9 @@ static int ext4_valid_extent_entries(struct inode *inode,
401 return 1; 353 return 1;
402} 354}
403 355
404static int __ext4_ext_check(const char *function, struct inode *inode, 356static int __ext4_ext_check(const char *function, unsigned int line,
405 struct ext4_extent_header *eh, 357 struct inode *inode, struct ext4_extent_header *eh,
406 int depth) 358 int depth)
407{ 359{
408 const char *error_msg; 360 const char *error_msg;
409 int max = 0; 361 int max = 0;
@@ -436,7 +388,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
436 return 0; 388 return 0;
437 389
438corrupted: 390corrupted:
439 ext4_error_inode(function, inode, 391 ext4_error_inode(inode, function, line, 0,
440 "bad header/extent: %s - magic %x, " 392 "bad header/extent: %s - magic %x, "
441 "entries %u, max %u(%u), depth %u(%u)", 393 "entries %u, max %u(%u), depth %u(%u)",
442 error_msg, le16_to_cpu(eh->eh_magic), 394 error_msg, le16_to_cpu(eh->eh_magic),
@@ -447,7 +399,7 @@ corrupted:
447} 399}
448 400
449#define ext4_ext_check(inode, eh, depth) \ 401#define ext4_ext_check(inode, eh, depth) \
450 __ext4_ext_check(__func__, inode, eh, depth) 402 __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
451 403
452int ext4_ext_check_inode(struct inode *inode) 404int ext4_ext_check_inode(struct inode *inode)
453{ 405{
@@ -463,13 +415,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
463 for (k = 0; k <= l; k++, path++) { 415 for (k = 0; k <= l; k++, path++) {
464 if (path->p_idx) { 416 if (path->p_idx) {
465 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 417 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
466 idx_pblock(path->p_idx)); 418 ext4_idx_pblock(path->p_idx));
467 } else if (path->p_ext) { 419 } else if (path->p_ext) {
468 ext_debug(" %d:[%d]%d:%llu ", 420 ext_debug(" %d:[%d]%d:%llu ",
469 le32_to_cpu(path->p_ext->ee_block), 421 le32_to_cpu(path->p_ext->ee_block),
470 ext4_ext_is_uninitialized(path->p_ext), 422 ext4_ext_is_uninitialized(path->p_ext),
471 ext4_ext_get_actual_len(path->p_ext), 423 ext4_ext_get_actual_len(path->p_ext),
472 ext_pblock(path->p_ext)); 424 ext4_ext_pblock(path->p_ext));
473 } else 425 } else
474 ext_debug(" []"); 426 ext_debug(" []");
475 } 427 }
@@ -494,7 +446,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
494 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 446 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
495 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), 447 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
496 ext4_ext_is_uninitialized(ex), 448 ext4_ext_is_uninitialized(ex),
497 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 449 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
498 } 450 }
499 ext_debug("\n"); 451 ext_debug("\n");
500} 452}
@@ -545,7 +497,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
545 497
546 path->p_idx = l - 1; 498 path->p_idx = l - 1;
547 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), 499 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
548 idx_pblock(path->p_idx)); 500 ext4_idx_pblock(path->p_idx));
549 501
550#ifdef CHECK_BINSEARCH 502#ifdef CHECK_BINSEARCH
551 { 503 {
@@ -614,7 +566,7 @@ ext4_ext_binsearch(struct inode *inode,
614 path->p_ext = l - 1; 566 path->p_ext = l - 1;
615 ext_debug(" -> %d:%llu:[%d]%d ", 567 ext_debug(" -> %d:%llu:[%d]%d ",
616 le32_to_cpu(path->p_ext->ee_block), 568 le32_to_cpu(path->p_ext->ee_block),
617 ext_pblock(path->p_ext), 569 ext4_ext_pblock(path->p_ext),
618 ext4_ext_is_uninitialized(path->p_ext), 570 ext4_ext_is_uninitialized(path->p_ext),
619 ext4_ext_get_actual_len(path->p_ext)); 571 ext4_ext_get_actual_len(path->p_ext));
620 572
@@ -682,7 +634,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
682 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 634 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
683 635
684 ext4_ext_binsearch_idx(inode, path + ppos, block); 636 ext4_ext_binsearch_idx(inode, path + ppos, block);
685 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 637 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
686 path[ppos].p_depth = i; 638 path[ppos].p_depth = i;
687 path[ppos].p_ext = NULL; 639 path[ppos].p_ext = NULL;
688 640
@@ -721,7 +673,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
721 ext4_ext_binsearch(inode, path + ppos, block); 673 ext4_ext_binsearch(inode, path + ppos, block);
722 /* if not an empty leaf */ 674 /* if not an empty leaf */
723 if (path[ppos].p_ext) 675 if (path[ppos].p_ext)
724 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 676 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
725 677
726 ext4_ext_show_path(inode, path); 678 ext4_ext_show_path(inode, path);
727 679
@@ -739,9 +691,9 @@ err:
739 * insert new index [@logical;@ptr] into the block at @curp; 691 * insert new index [@logical;@ptr] into the block at @curp;
740 * check where to insert: before @curp or after @curp 692 * check where to insert: before @curp or after @curp
741 */ 693 */
742int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 694static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
743 struct ext4_ext_path *curp, 695 struct ext4_ext_path *curp,
744 int logical, ext4_fsblk_t ptr) 696 int logical, ext4_fsblk_t ptr)
745{ 697{
746 struct ext4_extent_idx *ix; 698 struct ext4_extent_idx *ix;
747 int len, err; 699 int len, err;
@@ -917,7 +869,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
917 EXT_MAX_EXTENT(path[depth].p_hdr)) { 869 EXT_MAX_EXTENT(path[depth].p_hdr)) {
918 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", 870 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
919 le32_to_cpu(path[depth].p_ext->ee_block), 871 le32_to_cpu(path[depth].p_ext->ee_block),
920 ext_pblock(path[depth].p_ext), 872 ext4_ext_pblock(path[depth].p_ext),
921 ext4_ext_is_uninitialized(path[depth].p_ext), 873 ext4_ext_is_uninitialized(path[depth].p_ext),
922 ext4_ext_get_actual_len(path[depth].p_ext), 874 ext4_ext_get_actual_len(path[depth].p_ext),
923 newblock); 875 newblock);
@@ -1007,7 +959,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
1007 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 959 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
1008 ext_debug("%d: move %d:%llu in new index %llu\n", i, 960 ext_debug("%d: move %d:%llu in new index %llu\n", i,
1009 le32_to_cpu(path[i].p_idx->ei_block), 961 le32_to_cpu(path[i].p_idx->ei_block),
1010 idx_pblock(path[i].p_idx), 962 ext4_idx_pblock(path[i].p_idx),
1011 newblock); 963 newblock);
1012 /*memmove(++fidx, path[i].p_idx++, 964 /*memmove(++fidx, path[i].p_idx++,
1013 sizeof(struct ext4_extent_idx)); 965 sizeof(struct ext4_extent_idx));
@@ -1083,7 +1035,6 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1083{ 1035{
1084 struct ext4_ext_path *curp = path; 1036 struct ext4_ext_path *curp = path;
1085 struct ext4_extent_header *neh; 1037 struct ext4_extent_header *neh;
1086 struct ext4_extent_idx *fidx;
1087 struct buffer_head *bh; 1038 struct buffer_head *bh;
1088 ext4_fsblk_t newblock; 1039 ext4_fsblk_t newblock;
1089 int err = 0; 1040 int err = 0;
@@ -1144,10 +1095,10 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1144 ext4_idx_store_pblock(curp->p_idx, newblock); 1095 ext4_idx_store_pblock(curp->p_idx, newblock);
1145 1096
1146 neh = ext_inode_hdr(inode); 1097 neh = ext_inode_hdr(inode);
1147 fidx = EXT_FIRST_INDEX(neh);
1148 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1098 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1149 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1099 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1150 le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); 1100 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1101 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1151 1102
1152 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1103 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1153 err = ext4_ext_dirty(handle, inode, curp); 1104 err = ext4_ext_dirty(handle, inode, curp);
@@ -1233,9 +1184,9 @@ out:
1233 * returns 0 at @phys 1184 * returns 0 at @phys
1234 * return value contains 0 (success) or error code 1185 * return value contains 0 (success) or error code
1235 */ 1186 */
1236int 1187static int ext4_ext_search_left(struct inode *inode,
1237ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, 1188 struct ext4_ext_path *path,
1238 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1189 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1239{ 1190{
1240 struct ext4_extent_idx *ix; 1191 struct ext4_extent_idx *ix;
1241 struct ext4_extent *ex; 1192 struct ext4_extent *ex;
@@ -1287,7 +1238,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1287 } 1238 }
1288 1239
1289 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1240 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1290 *phys = ext_pblock(ex) + ee_len - 1; 1241 *phys = ext4_ext_pblock(ex) + ee_len - 1;
1291 return 0; 1242 return 0;
1292} 1243}
1293 1244
@@ -1298,9 +1249,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1298 * returns 0 at @phys 1249 * returns 0 at @phys
1299 * return value contains 0 (success) or error code 1250 * return value contains 0 (success) or error code
1300 */ 1251 */
1301int 1252static int ext4_ext_search_right(struct inode *inode,
1302ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, 1253 struct ext4_ext_path *path,
1303 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1254 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1304{ 1255{
1305 struct buffer_head *bh = NULL; 1256 struct buffer_head *bh = NULL;
1306 struct ext4_extent_header *eh; 1257 struct ext4_extent_header *eh;
@@ -1343,7 +1294,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1343 } 1294 }
1344 } 1295 }
1345 *logical = le32_to_cpu(ex->ee_block); 1296 *logical = le32_to_cpu(ex->ee_block);
1346 *phys = ext_pblock(ex); 1297 *phys = ext4_ext_pblock(ex);
1347 return 0; 1298 return 0;
1348 } 1299 }
1349 1300
@@ -1358,7 +1309,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1358 /* next allocated block in this leaf */ 1309 /* next allocated block in this leaf */
1359 ex++; 1310 ex++;
1360 *logical = le32_to_cpu(ex->ee_block); 1311 *logical = le32_to_cpu(ex->ee_block);
1361 *phys = ext_pblock(ex); 1312 *phys = ext4_ext_pblock(ex);
1362 return 0; 1313 return 0;
1363 } 1314 }
1364 1315
@@ -1377,7 +1328,7 @@ got_index:
1377 * follow it and find the closest allocated 1328 * follow it and find the closest allocated
1378 * block to the right */ 1329 * block to the right */
1379 ix++; 1330 ix++;
1380 block = idx_pblock(ix); 1331 block = ext4_idx_pblock(ix);
1381 while (++depth < path->p_depth) { 1332 while (++depth < path->p_depth) {
1382 bh = sb_bread(inode->i_sb, block); 1333 bh = sb_bread(inode->i_sb, block);
1383 if (bh == NULL) 1334 if (bh == NULL)
@@ -1389,7 +1340,7 @@ got_index:
1389 return -EIO; 1340 return -EIO;
1390 } 1341 }
1391 ix = EXT_FIRST_INDEX(eh); 1342 ix = EXT_FIRST_INDEX(eh);
1392 block = idx_pblock(ix); 1343 block = ext4_idx_pblock(ix);
1393 put_bh(bh); 1344 put_bh(bh);
1394 } 1345 }
1395 1346
@@ -1403,7 +1354,7 @@ got_index:
1403 } 1354 }
1404 ex = EXT_FIRST_EXTENT(eh); 1355 ex = EXT_FIRST_EXTENT(eh);
1405 *logical = le32_to_cpu(ex->ee_block); 1356 *logical = le32_to_cpu(ex->ee_block);
1406 *phys = ext_pblock(ex); 1357 *phys = ext4_ext_pblock(ex);
1407 put_bh(bh); 1358 put_bh(bh);
1408 return 0; 1359 return 0;
1409} 1360}
@@ -1574,7 +1525,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1574 return 0; 1525 return 0;
1575#endif 1526#endif
1576 1527
1577 if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) 1528 if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1578 return 1; 1529 return 1;
1579 return 0; 1530 return 0;
1580} 1531}
@@ -1586,9 +1537,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1586 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1537 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1587 * 1 if they got merged. 1538 * 1 if they got merged.
1588 */ 1539 */
1589int ext4_ext_try_to_merge(struct inode *inode, 1540static int ext4_ext_try_to_merge(struct inode *inode,
1590 struct ext4_ext_path *path, 1541 struct ext4_ext_path *path,
1591 struct ext4_extent *ex) 1542 struct ext4_extent *ex)
1592{ 1543{
1593 struct ext4_extent_header *eh; 1544 struct ext4_extent_header *eh;
1594 unsigned int depth, len; 1545 unsigned int depth, len;
@@ -1633,9 +1584,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
1633 * such that there will be no overlap, and then returns 1. 1584 * such that there will be no overlap, and then returns 1.
1634 * If there is no overlap found, it returns 0. 1585 * If there is no overlap found, it returns 0.
1635 */ 1586 */
1636unsigned int ext4_ext_check_overlap(struct inode *inode, 1587static unsigned int ext4_ext_check_overlap(struct inode *inode,
1637 struct ext4_extent *newext, 1588 struct ext4_extent *newext,
1638 struct ext4_ext_path *path) 1589 struct ext4_ext_path *path)
1639{ 1590{
1640 ext4_lblk_t b1, b2; 1591 ext4_lblk_t b1, b2;
1641 unsigned int depth, len1; 1592 unsigned int depth, len1;
@@ -1707,11 +1658,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1707 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) 1658 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1708 && ext4_can_extents_be_merged(inode, ex, newext)) { 1659 && ext4_can_extents_be_merged(inode, ex, newext)) {
1709 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1660 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1710 ext4_ext_is_uninitialized(newext), 1661 ext4_ext_is_uninitialized(newext),
1711 ext4_ext_get_actual_len(newext), 1662 ext4_ext_get_actual_len(newext),
1712 le32_to_cpu(ex->ee_block), 1663 le32_to_cpu(ex->ee_block),
1713 ext4_ext_is_uninitialized(ex), 1664 ext4_ext_is_uninitialized(ex),
1714 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 1665 ext4_ext_get_actual_len(ex),
1666 ext4_ext_pblock(ex));
1715 err = ext4_ext_get_access(handle, inode, path + depth); 1667 err = ext4_ext_get_access(handle, inode, path + depth);
1716 if (err) 1668 if (err)
1717 return err; 1669 return err;
@@ -1781,7 +1733,7 @@ has_space:
1781 /* there is no extent in this leaf, create first one */ 1733 /* there is no extent in this leaf, create first one */
1782 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", 1734 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1783 le32_to_cpu(newext->ee_block), 1735 le32_to_cpu(newext->ee_block),
1784 ext_pblock(newext), 1736 ext4_ext_pblock(newext),
1785 ext4_ext_is_uninitialized(newext), 1737 ext4_ext_is_uninitialized(newext),
1786 ext4_ext_get_actual_len(newext)); 1738 ext4_ext_get_actual_len(newext));
1787 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1739 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1795,7 +1747,7 @@ has_space:
1795 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " 1747 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1796 "move %d from 0x%p to 0x%p\n", 1748 "move %d from 0x%p to 0x%p\n",
1797 le32_to_cpu(newext->ee_block), 1749 le32_to_cpu(newext->ee_block),
1798 ext_pblock(newext), 1750 ext4_ext_pblock(newext),
1799 ext4_ext_is_uninitialized(newext), 1751 ext4_ext_is_uninitialized(newext),
1800 ext4_ext_get_actual_len(newext), 1752 ext4_ext_get_actual_len(newext),
1801 nearex, len, nearex + 1, nearex + 2); 1753 nearex, len, nearex + 1, nearex + 2);
@@ -1809,7 +1761,7 @@ has_space:
1809 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " 1761 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1810 "move %d from 0x%p to 0x%p\n", 1762 "move %d from 0x%p to 0x%p\n",
1811 le32_to_cpu(newext->ee_block), 1763 le32_to_cpu(newext->ee_block),
1812 ext_pblock(newext), 1764 ext4_ext_pblock(newext),
1813 ext4_ext_is_uninitialized(newext), 1765 ext4_ext_is_uninitialized(newext),
1814 ext4_ext_get_actual_len(newext), 1766 ext4_ext_get_actual_len(newext),
1815 nearex, len, nearex + 1, nearex + 2); 1767 nearex, len, nearex + 1, nearex + 2);
@@ -1820,7 +1772,7 @@ has_space:
1820 le16_add_cpu(&eh->eh_entries, 1); 1772 le16_add_cpu(&eh->eh_entries, 1);
1821 nearex = path[depth].p_ext; 1773 nearex = path[depth].p_ext;
1822 nearex->ee_block = newext->ee_block; 1774 nearex->ee_block = newext->ee_block;
1823 ext4_ext_store_pblock(nearex, ext_pblock(newext)); 1775 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1824 nearex->ee_len = newext->ee_len; 1776 nearex->ee_len = newext->ee_len;
1825 1777
1826merge: 1778merge:
@@ -1846,9 +1798,9 @@ cleanup:
1846 return err; 1798 return err;
1847} 1799}
1848 1800
1849int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, 1801static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1850 ext4_lblk_t num, ext_prepare_callback func, 1802 ext4_lblk_t num, ext_prepare_callback func,
1851 void *cbdata) 1803 void *cbdata)
1852{ 1804{
1853 struct ext4_ext_path *path = NULL; 1805 struct ext4_ext_path *path = NULL;
1854 struct ext4_ext_cache cbex; 1806 struct ext4_ext_cache cbex;
@@ -1924,7 +1876,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1924 } else { 1876 } else {
1925 cbex.ec_block = le32_to_cpu(ex->ee_block); 1877 cbex.ec_block = le32_to_cpu(ex->ee_block);
1926 cbex.ec_len = ext4_ext_get_actual_len(ex); 1878 cbex.ec_len = ext4_ext_get_actual_len(ex);
1927 cbex.ec_start = ext_pblock(ex); 1879 cbex.ec_start = ext4_ext_pblock(ex);
1928 cbex.ec_type = EXT4_EXT_CACHE_EXTENT; 1880 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1929 } 1881 }
1930 1882
@@ -2074,7 +2026,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2074 2026
2075 /* free index block */ 2027 /* free index block */
2076 path--; 2028 path--;
2077 leaf = idx_pblock(path->p_idx); 2029 leaf = ext4_idx_pblock(path->p_idx);
2078 if (unlikely(path->p_hdr->eh_entries == 0)) { 2030 if (unlikely(path->p_hdr->eh_entries == 0)) {
2079 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2031 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2080 return -EIO; 2032 return -EIO;
@@ -2182,7 +2134,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2182 ext4_fsblk_t start; 2134 ext4_fsblk_t start;
2183 2135
2184 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2136 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2185 start = ext_pblock(ex) + ee_len - num; 2137 start = ext4_ext_pblock(ex) + ee_len - num;
2186 ext_debug("free last %u blocks starting %llu\n", num, start); 2138 ext_debug("free last %u blocks starting %llu\n", num, start);
2187 ext4_free_blocks(handle, inode, 0, start, num, flags); 2139 ext4_free_blocks(handle, inode, 0, start, num, flags);
2188 } else if (from == le32_to_cpu(ex->ee_block) 2140 } else if (from == le32_to_cpu(ex->ee_block)
@@ -2311,7 +2263,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2311 goto out; 2263 goto out;
2312 2264
2313 ext_debug("new extent: %u:%u:%llu\n", block, num, 2265 ext_debug("new extent: %u:%u:%llu\n", block, num,
2314 ext_pblock(ex)); 2266 ext4_ext_pblock(ex));
2315 ex--; 2267 ex--;
2316 ex_ee_block = le32_to_cpu(ex->ee_block); 2268 ex_ee_block = le32_to_cpu(ex->ee_block);
2317 ex_ee_len = ext4_ext_get_actual_len(ex); 2269 ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2422,9 +2374,9 @@ again:
2422 struct buffer_head *bh; 2374 struct buffer_head *bh;
2423 /* go to the next level */ 2375 /* go to the next level */
2424 ext_debug("move to level %d (block %llu)\n", 2376 ext_debug("move to level %d (block %llu)\n",
2425 i + 1, idx_pblock(path[i].p_idx)); 2377 i + 1, ext4_idx_pblock(path[i].p_idx));
2426 memset(path + i + 1, 0, sizeof(*path)); 2378 memset(path + i + 1, 0, sizeof(*path));
2427 bh = sb_bread(sb, idx_pblock(path[i].p_idx)); 2379 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2428 if (!bh) { 2380 if (!bh) {
2429 /* should we reset i_size? */ 2381 /* should we reset i_size? */
2430 err = -EIO; 2382 err = -EIO;
@@ -2536,77 +2488,21 @@ void ext4_ext_release(struct super_block *sb)
2536#endif 2488#endif
2537} 2489}
2538 2490
2539static void bi_complete(struct bio *bio, int error)
2540{
2541 complete((struct completion *)bio->bi_private);
2542}
2543
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2491/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2492static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2493{
2494 ext4_fsblk_t ee_pblock;
2495 unsigned int ee_len;
2547 int ret; 2496 int ret;
2548 struct bio *bio;
2549 int blkbits, blocksize;
2550 sector_t ee_pblock;
2551 struct completion event;
2552 unsigned int ee_len, len, done, offset;
2553 2497
2554
2555 blkbits = inode->i_blkbits;
2556 blocksize = inode->i_sb->s_blocksize;
2557 ee_len = ext4_ext_get_actual_len(ex); 2498 ee_len = ext4_ext_get_actual_len(ex);
2558 ee_pblock = ext_pblock(ex); 2499 ee_pblock = ext4_ext_pblock(ex);
2559
2560 /* convert ee_pblock to 512 byte sectors */
2561 ee_pblock = ee_pblock << (blkbits - 9);
2562
2563 while (ee_len > 0) {
2564
2565 if (ee_len > BIO_MAX_PAGES)
2566 len = BIO_MAX_PAGES;
2567 else
2568 len = ee_len;
2569
2570 bio = bio_alloc(GFP_NOIO, len);
2571 if (!bio)
2572 return -ENOMEM;
2573
2574 bio->bi_sector = ee_pblock;
2575 bio->bi_bdev = inode->i_sb->s_bdev;
2576
2577 done = 0;
2578 offset = 0;
2579 while (done < len) {
2580 ret = bio_add_page(bio, ZERO_PAGE(0),
2581 blocksize, offset);
2582 if (ret != blocksize) {
2583 /*
2584 * We can't add any more pages because of
2585 * hardware limitations. Start a new bio.
2586 */
2587 break;
2588 }
2589 done++;
2590 offset += blocksize;
2591 if (offset >= PAGE_CACHE_SIZE)
2592 offset = 0;
2593 }
2594 2500
2595 init_completion(&event); 2501 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2596 bio->bi_private = &event; 2502 if (ret > 0)
2597 bio->bi_end_io = bi_complete; 2503 ret = 0;
2598 submit_bio(WRITE, bio);
2599 wait_for_completion(&event);
2600 2504
2601 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2505 return ret;
2602 bio_put(bio);
2603 return -EIO;
2604 }
2605 bio_put(bio);
2606 ee_len -= done;
2607 ee_pblock += done << (blkbits - 9);
2608 }
2609 return 0;
2610} 2506}
2611 2507
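The rewrite above drops roughly sixty lines of hand-rolled bio allocation, page stuffing, and completion waiting in favour of the block layer's sb_issue_zeroout(), which zeroes a run of filesystem blocks synchronously and converts block numbers to 512-byte sectors internally - which is why the old "<< (blkbits - 9)" conversion could be deleted as well. A hedged sketch of the resulting calling pattern (demo_zero_range is illustrative, not part of the patch):

#include <linux/blkdev.h>
#include <linux/fs.h>

/* Zero nr_blocks filesystem blocks starting at block, synchronously. */
static int demo_zero_range(struct super_block *sb, sector_t block,
                           sector_t nr_blocks)
{
        int ret = sb_issue_zeroout(sb, block, nr_blocks, GFP_NOFS);

        /* Mirror the patch: treat any positive return as success. */
        return ret > 0 ? 0 : ret;
}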
2612#define EXT4_EXT_ZERO_LEN 7 2508#define EXT4_EXT_ZERO_LEN 7
@@ -2652,12 +2548,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2652 ee_block = le32_to_cpu(ex->ee_block); 2548 ee_block = le32_to_cpu(ex->ee_block);
2653 ee_len = ext4_ext_get_actual_len(ex); 2549 ee_len = ext4_ext_get_actual_len(ex);
2654 allocated = ee_len - (map->m_lblk - ee_block); 2550 allocated = ee_len - (map->m_lblk - ee_block);
2655 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2551 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2656 2552
2657 ex2 = ex; 2553 ex2 = ex;
2658 orig_ex.ee_block = ex->ee_block; 2554 orig_ex.ee_block = ex->ee_block;
2659 orig_ex.ee_len = cpu_to_le16(ee_len); 2555 orig_ex.ee_len = cpu_to_le16(ee_len);
2660 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2556 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2661 2557
2662 /* 2558 /*
2663 * It is safe to convert extent to initialized via explicit 2559 * It is safe to convert extent to initialized via explicit
@@ -2676,7 +2572,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2676 /* update the extent length and mark as initialized */ 2572 /* update the extent length and mark as initialized */
2677 ex->ee_block = orig_ex.ee_block; 2573 ex->ee_block = orig_ex.ee_block;
2678 ex->ee_len = orig_ex.ee_len; 2574 ex->ee_len = orig_ex.ee_len;
2679 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2575 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2680 ext4_ext_dirty(handle, inode, path + depth); 2576 ext4_ext_dirty(handle, inode, path + depth);
2681 /* zeroed the full extent */ 2577 /* zeroed the full extent */
2682 return allocated; 2578 return allocated;
@@ -2711,7 +2607,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_block = orig_ex.ee_block; 2607 ex->ee_block = orig_ex.ee_block;
2712 ex->ee_len = cpu_to_le16(ee_len - allocated); 2608 ex->ee_len = cpu_to_le16(ee_len - allocated);
2713 ext4_ext_mark_uninitialized(ex); 2609 ext4_ext_mark_uninitialized(ex);
2714 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2610 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2715 ext4_ext_dirty(handle, inode, path + depth); 2611 ext4_ext_dirty(handle, inode, path + depth);
2716 2612
2717 ex3 = &newex; 2613 ex3 = &newex;
@@ -2726,7 +2622,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2726 goto fix_extent_len; 2622 goto fix_extent_len;
2727 ex->ee_block = orig_ex.ee_block; 2623 ex->ee_block = orig_ex.ee_block;
2728 ex->ee_len = orig_ex.ee_len; 2624 ex->ee_len = orig_ex.ee_len;
2729 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2625 ext4_ext_store_pblock(ex,
2626 ext4_ext_pblock(&orig_ex));
2730 ext4_ext_dirty(handle, inode, path + depth); 2627 ext4_ext_dirty(handle, inode, path + depth);
2731 /* blocks available from map->m_lblk */ 2628 /* blocks available from map->m_lblk */
2732 return allocated; 2629 return allocated;
@@ -2783,7 +2680,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 /* update the extent length and mark as initialized */ 2680 /* update the extent length and mark as initialized */
2784 ex->ee_block = orig_ex.ee_block; 2681 ex->ee_block = orig_ex.ee_block;
2785 ex->ee_len = orig_ex.ee_len; 2682 ex->ee_len = orig_ex.ee_len;
2786 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2683 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2787 ext4_ext_dirty(handle, inode, path + depth); 2684 ext4_ext_dirty(handle, inode, path + depth);
2788 /* zeroed the full extent */ 2685 /* zeroed the full extent */
2789 /* blocks available from map->m_lblk */ 2686 /* blocks available from map->m_lblk */
@@ -2834,7 +2731,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2834 /* update the extent length and mark as initialized */ 2731 /* update the extent length and mark as initialized */
2835 ex->ee_block = orig_ex.ee_block; 2732 ex->ee_block = orig_ex.ee_block;
2836 ex->ee_len = orig_ex.ee_len; 2733 ex->ee_len = orig_ex.ee_len;
2837 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2734 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2838 ext4_ext_dirty(handle, inode, path + depth); 2735 ext4_ext_dirty(handle, inode, path + depth);
2839 /* zero out the first half */ 2736 /* zero out the first half */
2840 /* blocks available from map->m_lblk */ 2737 /* blocks available from map->m_lblk */
@@ -2903,7 +2800,7 @@ insert:
2903 /* update the extent length and mark as initialized */ 2800 /* update the extent length and mark as initialized */
2904 ex->ee_block = orig_ex.ee_block; 2801 ex->ee_block = orig_ex.ee_block;
2905 ex->ee_len = orig_ex.ee_len; 2802 ex->ee_len = orig_ex.ee_len;
2906 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2803 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2907 ext4_ext_dirty(handle, inode, path + depth); 2804 ext4_ext_dirty(handle, inode, path + depth);
2908 /* zero out the first half */ 2805 /* zero out the first half */
2909 return allocated; 2806 return allocated;
@@ -2916,7 +2813,7 @@ out:
2916fix_extent_len: 2813fix_extent_len:
2917 ex->ee_block = orig_ex.ee_block; 2814 ex->ee_block = orig_ex.ee_block;
2918 ex->ee_len = orig_ex.ee_len; 2815 ex->ee_len = orig_ex.ee_len;
2919 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2816 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2920 ext4_ext_mark_uninitialized(ex); 2817 ext4_ext_mark_uninitialized(ex);
2921 ext4_ext_dirty(handle, inode, path + depth); 2818 ext4_ext_dirty(handle, inode, path + depth);
2922 return err; 2819 return err;
@@ -2937,7 +2834,7 @@ fix_extent_len:
2937 * One or more index blocks may be needed if the extent tree grows after 2834 * One or more index blocks may be needed if the extent tree grows after
2938 * the uninitialized extent is split. To prevent ENOSPC occurring at IO 2835 * the uninitialized extent is split. To prevent ENOSPC occurring at IO
2939 * completion, we need to split the uninitialized extent before DIO submits 2836 * completion, we need to split the uninitialized extent before DIO submits
2940 * the IO. The uninitilized extent called at this time will be split 2837 * the IO. The uninitialized extent called at this time will be split
2941 * into three uninitialized extents (at most). After IO completes, the part 2838 * into three uninitialized extents (at most). After IO completes, the part
2942 * being filled will be converted to initialized by the end_io callback function 2839 * being filled will be converted to initialized by the end_io callback function
2943 * via ext4_convert_unwritten_extents(). 2840 * via ext4_convert_unwritten_extents().
@@ -2954,7 +2851,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2954 struct ext4_extent *ex1 = NULL; 2851 struct ext4_extent *ex1 = NULL;
2955 struct ext4_extent *ex2 = NULL; 2852 struct ext4_extent *ex2 = NULL;
2956 struct ext4_extent *ex3 = NULL; 2853 struct ext4_extent *ex3 = NULL;
2957 struct ext4_extent_header *eh;
2958 ext4_lblk_t ee_block, eof_block; 2854 ext4_lblk_t ee_block, eof_block;
2959 unsigned int allocated, ee_len, depth; 2855 unsigned int allocated, ee_len, depth;
2960 ext4_fsblk_t newblock; 2856 ext4_fsblk_t newblock;
@@ -2971,17 +2867,16 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2971 eof_block = map->m_lblk + map->m_len; 2867 eof_block = map->m_lblk + map->m_len;
2972 2868
2973 depth = ext_depth(inode); 2869 depth = ext_depth(inode);
2974 eh = path[depth].p_hdr;
2975 ex = path[depth].p_ext; 2870 ex = path[depth].p_ext;
2976 ee_block = le32_to_cpu(ex->ee_block); 2871 ee_block = le32_to_cpu(ex->ee_block);
2977 ee_len = ext4_ext_get_actual_len(ex); 2872 ee_len = ext4_ext_get_actual_len(ex);
2978 allocated = ee_len - (map->m_lblk - ee_block); 2873 allocated = ee_len - (map->m_lblk - ee_block);
2979 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2874 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2980 2875
2981 ex2 = ex; 2876 ex2 = ex;
2982 orig_ex.ee_block = ex->ee_block; 2877 orig_ex.ee_block = ex->ee_block;
2983 orig_ex.ee_len = cpu_to_le16(ee_len); 2878 orig_ex.ee_len = cpu_to_le16(ee_len);
2984 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2879 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2985 2880
2986 /* 2881 /*
2987 * It is safe to convert extent to initialized via explicit 2882 * It is safe to convert extent to initialized via explicit
@@ -3030,7 +2925,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3030 /* update the extent length and mark as initialized */ 2925 /* update the extent length and mark as initialized */
3031 ex->ee_block = orig_ex.ee_block; 2926 ex->ee_block = orig_ex.ee_block;
3032 ex->ee_len = orig_ex.ee_len; 2927 ex->ee_len = orig_ex.ee_len;
3033 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2928 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3034 ext4_ext_dirty(handle, inode, path + depth); 2929 ext4_ext_dirty(handle, inode, path + depth);
3035 /* zeroed the full extent */ 2930 /* zeroed the full extent */
3036 /* blocks available from map->m_lblk */ 2931 /* blocks available from map->m_lblk */
@@ -3058,7 +2953,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3058 err = PTR_ERR(path); 2953 err = PTR_ERR(path);
3059 goto out; 2954 goto out;
3060 } 2955 }
3061 eh = path[depth].p_hdr;
3062 ex = path[depth].p_ext; 2956 ex = path[depth].p_ext;
3063 if (ex2 != &newex) 2957 if (ex2 != &newex)
3064 ex2 = ex; 2958 ex2 = ex;
@@ -3103,7 +2997,7 @@ insert:
3103 /* update the extent length and mark as initialized */ 2997 /* update the extent length and mark as initialized */
3104 ex->ee_block = orig_ex.ee_block; 2998 ex->ee_block = orig_ex.ee_block;
3105 ex->ee_len = orig_ex.ee_len; 2999 ex->ee_len = orig_ex.ee_len;
3106 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3000 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3107 ext4_ext_dirty(handle, inode, path + depth); 3001 ext4_ext_dirty(handle, inode, path + depth);
3108 /* zero out the first half */ 3002 /* zero out the first half */
3109 return allocated; 3003 return allocated;
@@ -3116,7 +3010,7 @@ out:
3116fix_extent_len: 3010fix_extent_len:
3117 ex->ee_block = orig_ex.ee_block; 3011 ex->ee_block = orig_ex.ee_block;
3118 ex->ee_len = orig_ex.ee_len; 3012 ex->ee_len = orig_ex.ee_len;
3119 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3013 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3120 ext4_ext_mark_uninitialized(ex); 3014 ext4_ext_mark_uninitialized(ex);
3121 ext4_ext_dirty(handle, inode, path + depth); 3015 ext4_ext_dirty(handle, inode, path + depth);
3122 return err; 3016 return err;
@@ -3184,6 +3078,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3184 unmap_underlying_metadata(bdev, block + i); 3078 unmap_underlying_metadata(bdev, block + i);
3185} 3079}
3186 3080
3081/*
3082 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3083 */
3084static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3085 struct ext4_map_blocks *map,
3086 struct ext4_ext_path *path,
3087 unsigned int len)
3088{
3089 int i, depth;
3090 struct ext4_extent_header *eh;
3091 struct ext4_extent *ex, *last_ex;
3092
3093 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3094 return 0;
3095
3096 depth = ext_depth(inode);
3097 eh = path[depth].p_hdr;
3098 ex = path[depth].p_ext;
3099
3100 if (unlikely(!eh->eh_entries)) {
3101 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3102 "EOFBLOCKS_FL set");
3103 return -EIO;
3104 }
3105 last_ex = EXT_LAST_EXTENT(eh);
3106 /*
3107 * We should clear the EOFBLOCKS_FL flag if we are writing the
3108 * last block in the last extent in the file. We test this by
3109 * first checking to see if the caller to
3110 * ext4_ext_get_blocks() was interested in the last block (or
3111 * a block beyond the last block) in the current extent. If
3112 * this turns out to be false, we can bail out from this
3113 * function immediately.
3114 */
3115 if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
3116 ext4_ext_get_actual_len(last_ex))
3117 return 0;
3118 /*
3119 * If the caller does appear to be planning to write at or
3120 * beyond the end of the current extent, we then test to see
3121 * if the current extent is the last extent in the file, by
3122 * checking to make sure it was reached via the rightmost node
3123 * at each level of the tree.
3124 */
3125 for (i = depth-1; i >= 0; i--)
3126 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3127 return 0;
3128 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3129 return ext4_mark_inode_dirty(handle, inode);
3130}
3131
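To make the early-out test in check_eofblocks_fl() concrete: if the file's last extent starts at logical block 100 with an actual length of 8, a write mapping blocks 50..53 stops short of block 107 and returns immediately with the flag untouched, while a write mapping 104..107 reaches the extent's end and falls through to the rightmost-path walk. A tiny illustrative predicate (not kernel code):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: the early-out test from check_eofblocks_fl().
 * A write that stops short of the last block of the last extent can
 * never require clearing EOFBLOCKS_FL. */
static bool demo_reaches_extent_end(uint32_t m_lblk, unsigned int len,
                                    uint32_t last_ee_block,
                                    unsigned int last_ee_len)
{
        return m_lblk + len >= last_ee_block + last_ee_len;
}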
3187static int 3132static int
3188ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3133ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3189 struct ext4_map_blocks *map, 3134 struct ext4_map_blocks *map,
@@ -3210,7 +3155,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * completed 3155 * completed
3211 */ 3156 */
3212 if (io) 3157 if (io)
3213 io->flag = EXT4_IO_UNWRITTEN; 3158 io->flag = EXT4_IO_END_UNWRITTEN;
3214 else 3159 else
3215 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3160 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3216 if (ext4_should_dioread_nolock(inode)) 3161 if (ext4_should_dioread_nolock(inode))
@@ -3221,8 +3166,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3221 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3166 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3222 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3167 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3223 path); 3168 path);
3224 if (ret >= 0) 3169 if (ret >= 0) {
3225 ext4_update_inode_fsync_trans(handle, inode, 1); 3170 ext4_update_inode_fsync_trans(handle, inode, 1);
3171 err = check_eofblocks_fl(handle, inode, map, path,
3172 map->m_len);
3173 } else
3174 err = ret;
3226 goto out2; 3175 goto out2;
3227 } 3176 }
3228 /* buffered IO case */ 3177 /* buffered IO case */
@@ -3248,8 +3197,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3248 3197
3249 /* buffered write, writepage time, convert*/ 3198 /* buffered write, writepage time, convert*/
3250 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3199 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3251 if (ret >= 0) 3200 if (ret >= 0) {
3252 ext4_update_inode_fsync_trans(handle, inode, 1); 3201 ext4_update_inode_fsync_trans(handle, inode, 1);
3202 err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
3203 if (err < 0)
3204 goto out2;
3205 }
3206
3253out: 3207out:
3254 if (ret <= 0) { 3208 if (ret <= 0) {
3255 err = ret; 3209 err = ret;
@@ -3296,6 +3250,7 @@ out2:
3296 } 3250 }
3297 return err ? err : allocated; 3251 return err ? err : allocated;
3298} 3252}
3253
3299/* 3254/*
3300 * Block allocation/map/preallocation routine for extents based files 3255 * Block allocation/map/preallocation routine for extents based files
3301 * 3256 *
@@ -3319,9 +3274,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3319{ 3274{
3320 struct ext4_ext_path *path = NULL; 3275 struct ext4_ext_path *path = NULL;
3321 struct ext4_extent_header *eh; 3276 struct ext4_extent_header *eh;
3322 struct ext4_extent newex, *ex, *last_ex; 3277 struct ext4_extent newex, *ex;
3323 ext4_fsblk_t newblock; 3278 ext4_fsblk_t newblock;
3324 int i, err = 0, depth, ret, cache_type; 3279 int err = 0, depth, ret, cache_type;
3325 unsigned int allocated = 0; 3280 unsigned int allocated = 0;
3326 struct ext4_allocation_request ar; 3281 struct ext4_allocation_request ar;
3327 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3282 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3345,7 +3300,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3345 /* block is already allocated */ 3300 /* block is already allocated */
3346 newblock = map->m_lblk 3301 newblock = map->m_lblk
3347 - le32_to_cpu(newex.ee_block) 3302 - le32_to_cpu(newex.ee_block)
3348 + ext_pblock(&newex); 3303 + ext4_ext_pblock(&newex);
3349 /* number of remaining blocks in the extent */ 3304 /* number of remaining blocks in the extent */
3350 allocated = ext4_ext_get_actual_len(&newex) - 3305 allocated = ext4_ext_get_actual_len(&newex) -
3351 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3306 (map->m_lblk - le32_to_cpu(newex.ee_block));
@@ -3383,7 +3338,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3383 ex = path[depth].p_ext; 3338 ex = path[depth].p_ext;
3384 if (ex) { 3339 if (ex) {
3385 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3340 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3386 ext4_fsblk_t ee_start = ext_pblock(ex); 3341 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3387 unsigned short ee_len; 3342 unsigned short ee_len;
3388 3343
3389 /* 3344 /*
@@ -3492,7 +3447,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3492 */ 3447 */
3493 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3448 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3494 if (io) 3449 if (io)
3495 io->flag = EXT4_IO_UNWRITTEN; 3450 io->flag = EXT4_IO_END_UNWRITTEN;
3496 else 3451 else
3497 ext4_set_inode_state(inode, 3452 ext4_set_inode_state(inode,
3498 EXT4_STATE_DIO_UNWRITTEN); 3453 EXT4_STATE_DIO_UNWRITTEN);
@@ -3501,44 +3456,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3501 map->m_flags |= EXT4_MAP_UNINIT; 3456 map->m_flags |= EXT4_MAP_UNINIT;
3502 } 3457 }
3503 3458
3504 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { 3459 err = check_eofblocks_fl(handle, inode, map, path, ar.len);
3505 if (unlikely(!eh->eh_entries)) { 3460 if (err)
3506 EXT4_ERROR_INODE(inode, 3461 goto out2;
3507 "eh->eh_entries == 0 and " 3462
3508 "EOFBLOCKS_FL set");
3509 err = -EIO;
3510 goto out2;
3511 }
3512 last_ex = EXT_LAST_EXTENT(eh);
3513 /*
3514 * If the current leaf block was reached by looking at
3515 * the last index block all the way down the tree, and
3516 * we are extending the inode beyond the last extent
3517 * in the current leaf block, then clear the
3518 * EOFBLOCKS_FL flag.
3519 */
3520 for (i = depth-1; i >= 0; i--) {
3521 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3522 break;
3523 }
3524 if ((i < 0) &&
3525 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3526 ext4_ext_get_actual_len(last_ex)))
3527 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3528 }
3529 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3463 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3530 if (err) { 3464 if (err) {
3531 /* free data blocks we just allocated */ 3465 /* free data blocks we just allocated */
3532 /* not a good idea to call discard here directly, 3466 /* not a good idea to call discard here directly,
3533 * but otherwise we'd need to call it every free() */ 3467 * but otherwise we'd need to call it every free() */
3534 ext4_discard_preallocations(inode); 3468 ext4_discard_preallocations(inode);
3535 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3469 ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
3536 ext4_ext_get_actual_len(&newex), 0); 3470 ext4_ext_get_actual_len(&newex), 0);
3537 goto out2; 3471 goto out2;
3538 } 3472 }
3539 3473
3540 /* previous routine could use block we allocated */ 3474 /* previous routine could use block we allocated */
3541 newblock = ext_pblock(&newex); 3475 newblock = ext4_ext_pblock(&newex);
3542 allocated = ext4_ext_get_actual_len(&newex); 3476 allocated = ext4_ext_get_actual_len(&newex);
3543 if (allocated > map->m_len) 3477 if (allocated > map->m_len)
3544 allocated = map->m_len; 3478 allocated = map->m_len;
@@ -3733,7 +3667,7 @@ retry:
3733 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3667 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3734 "returned error inode#%lu, block=%u, " 3668 "returned error inode#%lu, block=%u, "
3735 "max_blocks=%u", __func__, 3669 "max_blocks=%u", __func__,
3736 inode->i_ino, block, max_blocks); 3670 inode->i_ino, map.m_lblk, max_blocks);
3737#endif 3671#endif
3738 ext4_mark_inode_dirty(handle, inode); 3672 ext4_mark_inode_dirty(handle, inode);
3739 ret2 = ext4_journal_stop(handle); 3673 ret2 = ext4_journal_stop(handle);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5313ae4cda2..5a5c55ddcee 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -70,7 +70,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
73 if (pos > sbi->s_bitmap_maxbytes) 73 if ((pos > sbi->s_bitmap_maxbytes ||
74 (pos == sbi->s_bitmap_maxbytes && length > 0)))
74 return -EFBIG; 75 return -EFBIG;
75 76
76 if (pos + length > sbi->s_bitmap_maxbytes) { 77 if (pos + length > sbi->s_bitmap_maxbytes) {
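The widened check above closes an off-by-one: the old test let a write begin at exactly s_bitmap_maxbytes, even though any nonzero length from that position would extend the file past what a block-mapped inode can address. A small illustrative model of the patched test (names hypothetical):

#include <stdbool.h>
#include <stdio.h>

static bool demo_write_too_big(long long pos, unsigned long len,
                               long long maxbytes)
{
        return pos > maxbytes || (pos == maxbytes && len > 0);
}

int main(void)
{
        /* With maxbytes = 100: a 1-byte write at 100 is now rejected. */
        printf("%d %d\n", demo_write_too_big(100, 1, 100),  /* prints 1 */
                          demo_write_too_big(99, 1, 100));  /* prints 0 */
        return 0;
}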
@@ -123,14 +124,56 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
123 if (!IS_ERR(cp)) { 124 if (!IS_ERR(cp)) {
124 memcpy(sbi->s_es->s_last_mounted, cp, 125 memcpy(sbi->s_es->s_last_mounted, cp,
125 sizeof(sbi->s_es->s_last_mounted)); 126 sizeof(sbi->s_es->s_last_mounted));
126 sb->s_dirt = 1; 127 ext4_mark_super_dirty(sb);
127 } 128 }
128 } 129 }
129 return dquot_file_open(inode, filp); 130 return dquot_file_open(inode, filp);
130} 131}
131 132
133/*
 134 * ext4_llseek() is copied from generic_file_llseek() to handle both
 135 * block-mapped and extent-mapped maxbytes values. It should
 136 * otherwise be identical to generic_file_llseek().
137 */
138loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
139{
140 struct inode *inode = file->f_mapping->host;
141 loff_t maxbytes;
142
143 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
144 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
145 else
146 maxbytes = inode->i_sb->s_maxbytes;
147 mutex_lock(&inode->i_mutex);
148 switch (origin) {
149 case SEEK_END:
150 offset += inode->i_size;
151 break;
152 case SEEK_CUR:
153 if (offset == 0) {
154 mutex_unlock(&inode->i_mutex);
155 return file->f_pos;
156 }
157 offset += file->f_pos;
158 break;
159 }
160
161 if (offset < 0 || offset > maxbytes) {
162 mutex_unlock(&inode->i_mutex);
163 return -EINVAL;
164 }
165
166 if (offset != file->f_pos) {
167 file->f_pos = offset;
168 file->f_version = 0;
169 }
170 mutex_unlock(&inode->i_mutex);
171
172 return offset;
173}
174
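The reason ext4 cannot keep using generic_file_llseek() is the upper bound alone: the generic helper validates offsets against sb->s_maxbytes, which on ext4 reflects the extent-mapped limit, so a SEEK_SET or SEEK_END past s_bitmap_maxbytes on a block-mapped file would wrongly succeed. A reduced userspace model of the validation (the SEEK_CUR fast path and f_version handling are omitted; constants and names are illustrative):

/* whence: 0 = SEEK_SET, 1 = SEEK_CUR, 2 = SEEK_END */
static long long demo_llseek(long long f_pos, long long offset, int whence,
                             long long i_size, int extent_mapped,
                             long long bitmap_maxbytes, long long sb_maxbytes)
{
        long long maxbytes = extent_mapped ? sb_maxbytes : bitmap_maxbytes;

        if (whence == 2)
                offset += i_size;
        else if (whence == 1)
                offset += f_pos;

        if (offset < 0 || offset > maxbytes)
                return -22;     /* -EINVAL, as in the kernel function */
        return offset;
}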
132const struct file_operations ext4_file_operations = { 175const struct file_operations ext4_file_operations = {
133 .llseek = generic_file_llseek, 176 .llseek = ext4_llseek,
134 .read = do_sync_read, 177 .read = do_sync_read,
135 .write = do_sync_write, 178 .write = do_sync_write,
136 .aio_read = generic_file_aio_read, 179 .aio_read = generic_file_aio_read,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546..c1a7bc923cf 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
34 34
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37static void dump_completed_IO(struct inode * inode)
38{
39#ifdef EXT4_DEBUG
40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags;
43
44 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
45 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
46 return;
47 }
48
49 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
50 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
51 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
52 cur = &io->list;
53 before = cur->prev;
54 io0 = container_of(before, ext4_io_end_t, list);
55 after = cur->next;
56 io1 = container_of(after, ext4_io_end_t, list);
57
58 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
59 io, inode->i_ino, io0, io1);
60 }
61 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
62#endif
63}
64
65/*
66 * This function is called from ext4_sync_file().
67 *
68 * When IO is completed, the work to convert unwritten extents to
69 * written is queued on workqueue but may not get immediately
70 * scheduled. When fsync is called, we need to ensure the
71 * conversion is complete before fsync returns.
72 * The inode keeps track of a list of pending/completed IO that
73 * might needs to do the conversion. This function walks through
74 * the list and convert the related unwritten extents for completed IO
75 * to written.
76 * The function return the number of pending IOs on success.
77 */
78static int flush_completed_IO(struct inode *inode)
79{
80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode);
82 unsigned long flags;
83 int ret = 0;
84 int ret2 = 0;
85
86 if (list_empty(&ei->i_completed_io_list))
87 return ret;
88
89 dump_completed_IO(inode);
90 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
91 while (!list_empty(&ei->i_completed_io_list)){
92 io = list_entry(ei->i_completed_io_list.next,
93 ext4_io_end_t, list);
94 /*
95 * Calling ext4_end_io_nolock() to convert completed
96 * IO to written.
97 *
 98 * When ext4_sync_file() is called, run_queue() may already
 99 * be about to flush the work corresponding to this io structure.
 100 * It will be upset if it finds that the io structure related
 101 * to the work to be scheduled has been freed.
 102 *
 103 * Thus we need to keep the io structure valid here even after
 104 * the conversion has finished. The io structure has a flag to
 105 * avoid double conversion, from both fsync and the background
 106 * work queue.
107 */
108 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
109 ret = ext4_end_io_nolock(io);
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
111 if (ret < 0)
112 ret2 = ret;
113 else
114 list_del_init(&io->list);
115 }
116 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
117 return (ret2 < 0) ? ret2 : 0;
118}
119
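Note the lock choreography in flush_completed_IO() above: the spinlock is dropped around ext4_end_io_nolock() because the conversion may sleep (it starts a journal handle), then retaken before the next list operation, and an entry is unlinked only after a successful conversion. A hedged sketch of the bare pattern - this sketch bails out on error rather than retrying the same head entry in place, and all names are illustrative:

#include <linux/list.h>
#include <linux/spinlock.h>

struct demo_io {
        struct list_head list;
};

static int demo_flush(spinlock_t *lock, struct list_head *completed,
                      int (*convert)(struct demo_io *))
{
        struct demo_io *io;
        unsigned long flags;
        int ret = 0;

        spin_lock_irqsave(lock, flags);
        while (!list_empty(completed)) {
                io = list_entry(completed->next, struct demo_io, list);
                /* Drop the lock: the callback may sleep. */
                spin_unlock_irqrestore(lock, flags);
                ret = convert(io);
                spin_lock_irqsave(lock, flags);
                if (ret < 0)
                        break;  /* leave the entry for a later retry */
                list_del_init(&io->list);
        }
        spin_unlock_irqrestore(lock, flags);
        return ret;
}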
37/* 120/*
38 * If we're not journaling and this is a just-created file, we have to 121 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since 122 * sync our parent directory (if it was freshly created) since
@@ -128,10 +211,9 @@ int ext4_sync_file(struct file *file, int datasync)
128 (journal->j_fs_dev != journal->j_dev) && 211 (journal->j_fs_dev != journal->j_dev) &&
129 (journal->j_flags & JBD2_BARRIER)) 212 (journal->j_flags & JBD2_BARRIER))
130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 213 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
131 NULL, BLKDEV_IFL_WAIT); 214 NULL);
132 ret = jbd2_log_wait_commit(journal, commit_tid); 215 ret = jbd2_log_wait_commit(journal, commit_tid);
133 } else if (journal->j_flags & JBD2_BARRIER) 216 } else if (journal->j_flags & JBD2_BARRIER)
134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 217 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
135 BLKDEV_IFL_WAIT);
136 return ret; 218 return ret;
137} 219}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25c4b3173fd..1ce240a23eb 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
50 * need to use it within a single byte (to ensure we get endianness right). 50 * need to use it within a single byte (to ensure we get endianness right).
51 * We can use memset for the rest of the bitmap as there are no other users. 51 * We can use memset for the rest of the bitmap as there are no other users.
52 */ 52 */
53void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) 53void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
54{ 54{
55 int i; 55 int i;
56 56
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
65} 65}
66 66
67/* Initializes an uninitialized inode bitmap */ 67/* Initializes an uninitialized inode bitmap */
68unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, 68static unsigned ext4_init_inode_bitmap(struct super_block *sb,
69 ext4_group_t block_group, 69 struct buffer_head *bh,
70 struct ext4_group_desc *gdp) 70 ext4_group_t block_group,
71 struct ext4_group_desc *gdp)
71{ 72{
72 struct ext4_sb_info *sbi = EXT4_SB(sb); 73 struct ext4_sb_info *sbi = EXT4_SB(sb);
73 74
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
85 } 86 }
86 87
87 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 88 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
88 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 89 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
89 bh->b_data); 90 bh->b_data);
90 91
91 return EXT4_INODES_PER_GROUP(sb); 92 return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
107 desc = ext4_get_group_desc(sb, block_group, NULL); 108 desc = ext4_get_group_desc(sb, block_group, NULL);
108 if (!desc) 109 if (!desc)
109 return NULL; 110 return NULL;
111
110 bitmap_blk = ext4_inode_bitmap(sb, desc); 112 bitmap_blk = ext4_inode_bitmap(sb, desc);
111 bh = sb_getblk(sb, bitmap_blk); 113 bh = sb_getblk(sb, bitmap_blk);
112 if (unlikely(!bh)) { 114 if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 125 unlock_buffer(bh);
124 return bh; 126 return bh;
125 } 127 }
128
126 ext4_lock_group(sb, block_group); 129 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 130 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 131 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
133 return bh; 136 return bh;
134 } 137 }
135 ext4_unlock_group(sb, block_group); 138 ext4_unlock_group(sb, block_group);
139
136 if (buffer_uptodate(bh)) { 140 if (buffer_uptodate(bh)) {
137 /* 141 /*
138 * if not uninit if bh is uptodate, 142 * if not uninit if bh is uptodate,
@@ -222,7 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
222 is_directory = S_ISDIR(inode->i_mode); 226 is_directory = S_ISDIR(inode->i_mode);
223 227
224 /* Do this BEFORE marking the inode not in use or returning an error */ 228 /* Do this BEFORE marking the inode not in use or returning an error */
225 clear_inode(inode); 229 ext4_clear_inode(inode);
226 230
227 es = EXT4_SB(sb)->s_es; 231 es = EXT4_SB(sb)->s_es;
228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 232 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
@@ -279,7 +283,7 @@ out:
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 283 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal) 284 if (!fatal)
281 fatal = err; 285 fatal = err;
282 sb->s_dirt = 1; 286 ext4_mark_super_dirty(sb);
283 } else 287 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino); 288 ext4_error(sb, "bit already cleared for inode %lu", ino);
285 289
@@ -411,8 +415,8 @@ struct orlov_stats {
411 * for a particular block group or flex_bg. If flex_size is 1, then g 415 * for a particular block group or flex_bg. If flex_size is 1, then g
412 * is a block group number; otherwise it is flex_bg number. 416 * is a block group number; otherwise it is flex_bg number.
413 */ 417 */
414void get_orlov_stats(struct super_block *sb, ext4_group_t g, 418static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
415 int flex_size, struct orlov_stats *stats) 419 int flex_size, struct orlov_stats *stats)
416{ 420{
417 struct ext4_group_desc *desc; 421 struct ext4_group_desc *desc;
418 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; 422 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb,
712{ 716{
713 int free = 0, retval = 0, count; 717 int free = 0, retval = 0, count;
714 struct ext4_sb_info *sbi = EXT4_SB(sb); 718 struct ext4_sb_info *sbi = EXT4_SB(sb);
719 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
715 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 720 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
716 721
722 /*
723 * We have to be sure that new inode allocation does not race with
724 * inode table initialization, because otherwise we may end up
725 * allocating and writing new inode right before sb_issue_zeroout
726 * takes place and overwriting our new inode with zeroes. So we
727 * take alloc_sem to prevent it.
728 */
729 down_read(&grp->alloc_sem);
717 ext4_lock_group(sb, group); 730 ext4_lock_group(sb, group);
718 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 731 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
719 /* not a free inode */ 732 /* not a free inode */
@@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb,
724 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 737 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
725 ino > EXT4_INODES_PER_GROUP(sb)) { 738 ino > EXT4_INODES_PER_GROUP(sb)) {
726 ext4_unlock_group(sb, group); 739 ext4_unlock_group(sb, group);
740 up_read(&grp->alloc_sem);
727 ext4_error(sb, "reserved inode or inode > inodes count - " 741 ext4_error(sb, "reserved inode or inode > inodes count - "
728 "block_group = %u, inode=%lu", group, 742 "block_group = %u, inode=%lu", group,
729 ino + group * EXT4_INODES_PER_GROUP(sb)); 743 ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb,
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 786 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773err_ret: 787err_ret:
774 ext4_unlock_group(sb, group); 788 ext4_unlock_group(sb, group);
789 up_read(&grp->alloc_sem);
775 return retval; 790 return retval;
776} 791}
777 792
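
The comment above is the heart of this hunk: inode claims take grp->alloc_sem shared while the lazy zeroing pass (introduced below in ext4_init_inode_table) takes it exclusive. As a rough standalone analogue — plain pthreads rather than kernel rwsems, with an invented toy bitmap — the same shared/exclusive split looks like this:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t alloc_sem = PTHREAD_RWLOCK_INITIALIZER;
static int bitmap[64];			/* toy stand-in for the inode bitmap */

static int claim_inode(int ino)
{
	int claimed = 0;

	pthread_rwlock_rdlock(&alloc_sem);	/* down_read(&grp->alloc_sem) */
	if (!bitmap[ino]) {
		bitmap[ino] = 1;
		claimed = 1;
	}
	pthread_rwlock_unlock(&alloc_sem);	/* up_read() */
	return claimed;
}

static void zero_inode_table(void)
{
	pthread_rwlock_wrlock(&alloc_sem);	/* down_write(&grp->alloc_sem) */
	/* sb_issue_zeroout() would run here, with all claims blocked */
	pthread_rwlock_unlock(&alloc_sem);	/* up_write() */
}

int main(void)
{
	printf("claimed: %d\n", claim_inode(11));
	zero_inode_table();
	return 0;
}

Any number of claims can proceed concurrently; the zeroing pass waits for them all and blocks new ones for the duration of the zeroout.
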
@@ -965,7 +980,7 @@ got:
965 percpu_counter_dec(&sbi->s_freeinodes_counter); 980 percpu_counter_dec(&sbi->s_freeinodes_counter);
966 if (S_ISDIR(mode)) 981 if (S_ISDIR(mode))
967 percpu_counter_inc(&sbi->s_dirs_counter); 982 percpu_counter_inc(&sbi->s_dirs_counter);
968 sb->s_dirt = 1; 983 ext4_mark_super_dirty(sb);
969 984
970 if (sbi->s_log_groups_per_flex) { 985 if (sbi->s_log_groups_per_flex) {
971 flex_group = ext4_flex_group(sbi, group); 986 flex_group = ext4_flex_group(sbi, group);
@@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1205 } 1220 }
1206 return count; 1221 return count;
1207} 1222}
1223
 1224/*
 1225 * Zeroes the not-yet-zeroed inode table - just writes zeroes through the
 1226 * whole inode table. Must be called without any spinlock held. The only
 1227 * place it is called from on an active filesystem is the ext4lazyinit
 1228 * thread, so we do not need any special locks; however, we have to prevent
 1229 * inode allocation from the current group, so we take the alloc_sem lock
 1230 * to block ext4_claim_inode until we are finished.
 1231 */
1232extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1233 int barrier)
1234{
1235 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1236 struct ext4_sb_info *sbi = EXT4_SB(sb);
1237 struct ext4_group_desc *gdp = NULL;
1238 struct buffer_head *group_desc_bh;
1239 handle_t *handle;
1240 ext4_fsblk_t blk;
1241 int num, ret = 0, used_blks = 0;
1242
 1243 /* This should not happen, but check it just to be sure */
1244 if (sb->s_flags & MS_RDONLY) {
1245 ret = 1;
1246 goto out;
1247 }
1248
1249 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
1250 if (!gdp)
1251 goto out;
1252
1253 /*
1254 * We do not need to lock this, because we are the only one
1255 * handling this flag.
1256 */
1257 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
1258 goto out;
1259
1260 handle = ext4_journal_start_sb(sb, 1);
1261 if (IS_ERR(handle)) {
1262 ret = PTR_ERR(handle);
1263 goto out;
1264 }
1265
1266 down_write(&grp->alloc_sem);
1267 /*
1268 * If inode bitmap was already initialized there may be some
1269 * used inodes so we need to skip blocks with used inodes in
1270 * inode table.
1271 */
1272 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
1273 used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
1274 ext4_itable_unused_count(sb, gdp)),
1275 sbi->s_inodes_per_block);
1276
1277 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1278 ext4_error(sb, "Something is wrong with group %u\n"
1279 "Used itable blocks: %d"
1280 "itable unused count: %u\n",
1281 group, used_blks,
1282 ext4_itable_unused_count(sb, gdp));
1283 ret = 1;
1284 goto out;
1285 }
1286
1287 blk = ext4_inode_table(sb, gdp) + used_blks;
1288 num = sbi->s_itb_per_group - used_blks;
1289
1290 BUFFER_TRACE(group_desc_bh, "get_write_access");
1291 ret = ext4_journal_get_write_access(handle,
1292 group_desc_bh);
1293 if (ret)
1294 goto err_out;
1295
1296 /*
1297 * Skip zeroout if the inode table is full. But we set the ZEROED
1298 * flag anyway, because obviously, when it is full it does not need
1299 * further zeroing.
1300 */
1301 if (unlikely(num == 0))
1302 goto skip_zeroout;
1303
1304 ext4_debug("going to zero out inode table in group %d\n",
1305 group);
1306 ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
1307 if (ret < 0)
1308 goto err_out;
1309 if (barrier)
1310 blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
1311
1312skip_zeroout:
1313 ext4_lock_group(sb, group);
1314 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
1315 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
1316 ext4_unlock_group(sb, group);
1317
1318 BUFFER_TRACE(group_desc_bh,
1319 "call ext4_handle_dirty_metadata");
1320 ret = ext4_handle_dirty_metadata(handle, NULL,
1321 group_desc_bh);
1322
1323err_out:
1324 up_write(&grp->alloc_sem);
1325 ext4_journal_stop(handle);
1326out:
1327 return ret;
1328}
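
The used_blks computation above is just ceiling division over the group's inode count. A tiny standalone check, with sample numbers that are not read from any real superblock:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned inodes_per_group = 8192;
	unsigned itable_unused    = 8000;	/* from the group descriptor */
	unsigned inodes_per_block = 16;		/* 4k block / 256-byte inode */

	/* 192 inodes in use -> 12 itable blocks must be skipped */
	unsigned used_blks = DIV_ROUND_UP(inodes_per_group - itable_unused,
					  inodes_per_block);
	printf("used_blks = %u\n", used_blks);
	return 0;
}
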
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 42272d67955..19161647046 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -60,6 +60,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
60} 60}
61 61
62static void ext4_invalidatepage(struct page *page, unsigned long offset); 62static void ext4_invalidatepage(struct page *page, unsigned long offset);
63static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
64 struct buffer_head *bh_result, int create);
65static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
66static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
67static int __ext4_journalled_writepage(struct page *page, unsigned int len);
68static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
63 69
64/* 70/*
65 * Test whether an inode is a fast symlink. 71 * Test whether an inode is a fast symlink.
@@ -167,11 +173,16 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
167/* 173/*
168 * Called at the last iput() if i_nlink is zero. 174 * Called at the last iput() if i_nlink is zero.
169 */ 175 */
170void ext4_delete_inode(struct inode *inode) 176void ext4_evict_inode(struct inode *inode)
171{ 177{
172 handle_t *handle; 178 handle_t *handle;
173 int err; 179 int err;
174 180
181 if (inode->i_nlink) {
182 truncate_inode_pages(&inode->i_data, 0);
183 goto no_delete;
184 }
185
175 if (!is_bad_inode(inode)) 186 if (!is_bad_inode(inode))
176 dquot_initialize(inode); 187 dquot_initialize(inode);
177 188
@@ -221,6 +232,7 @@ void ext4_delete_inode(struct inode *inode)
221 "couldn't extend journal (err %d)", err); 232 "couldn't extend journal (err %d)", err);
222 stop_handle: 233 stop_handle:
223 ext4_journal_stop(handle); 234 ext4_journal_stop(handle);
235 ext4_orphan_del(NULL, inode);
224 goto no_delete; 236 goto no_delete;
225 } 237 }
226 } 238 }
@@ -245,13 +257,13 @@ void ext4_delete_inode(struct inode *inode)
245 */ 257 */
246 if (ext4_mark_inode_dirty(handle, inode)) 258 if (ext4_mark_inode_dirty(handle, inode))
247 /* If that failed, just do the required in-core inode clear. */ 259 /* If that failed, just do the required in-core inode clear. */
248 clear_inode(inode); 260 ext4_clear_inode(inode);
249 else 261 else
250 ext4_free_inode(handle, inode); 262 ext4_free_inode(handle, inode);
251 ext4_journal_stop(handle); 263 ext4_journal_stop(handle);
252 return; 264 return;
253no_delete: 265no_delete:
254 clear_inode(inode); /* We must guarantee clearing of inode... */ 266 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
255} 267}
256 268
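
The rename from ->delete_inode to ->evict_inode comes with a new early path: an inode that still has links only needs its page cache dropped, not an on-disk delete. Reduced to its control flow — the types and helpers below are placeholders, not the kernel's:

#include <stdio.h>

struct inode { int i_nlink; };

static void truncate_pages(struct inode *i) { (void)i; printf("truncate pages\n"); }
static void delete_on_disk(struct inode *i) { (void)i; printf("journalled delete\n"); }
static void clear_in_core(struct inode *i)  { (void)i; printf("clear in-core state\n"); }

static void evict_inode(struct inode *inode)
{
	if (inode->i_nlink) {
		/* still linked: drop cached pages, no on-disk delete */
		truncate_pages(inode);
		goto no_delete;
	}
	delete_on_disk(inode);	/* the ext4_free_inode() path in the patch */
	return;
no_delete:
	clear_in_core(inode);	/* ext4_clear_inode() in the patch */
}

int main(void)
{
	struct inode live = { 1 }, dead = { 0 };
	evict_inode(&live);
	evict_inode(&dead);
	return 0;
}
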
257typedef struct { 269typedef struct {
@@ -337,9 +349,11 @@ static int ext4_block_to_path(struct inode *inode,
337 return n; 349 return n;
338} 350}
339 351
340static int __ext4_check_blockref(const char *function, struct inode *inode, 352static int __ext4_check_blockref(const char *function, unsigned int line,
353 struct inode *inode,
341 __le32 *p, unsigned int max) 354 __le32 *p, unsigned int max)
342{ 355{
356 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
343 __le32 *bref = p; 357 __le32 *bref = p;
344 unsigned int blk; 358 unsigned int blk;
345 359
@@ -348,8 +362,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
348 if (blk && 362 if (blk &&
349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 363 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
350 blk, 1))) { 364 blk, 1))) {
351 ext4_error_inode(function, inode, 365 es->s_last_error_block = cpu_to_le64(blk);
352 "invalid block reference %u", blk); 366 ext4_error_inode(inode, function, line, blk,
367 "invalid block");
353 return -EIO; 368 return -EIO;
354 } 369 }
355 } 370 }
@@ -358,11 +373,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
358 373
359 374
360#define ext4_check_indirect_blockref(inode, bh) \ 375#define ext4_check_indirect_blockref(inode, bh) \
361 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ 376 __ext4_check_blockref(__func__, __LINE__, inode, \
377 (__le32 *)(bh)->b_data, \
362 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 378 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
363 379
364#define ext4_check_inode_blockref(inode) \ 380#define ext4_check_inode_blockref(inode) \
365 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ 381 __ext4_check_blockref(__func__, __LINE__, inode, \
382 EXT4_I(inode)->i_data, \
366 EXT4_NDIR_BLOCKS) 383 EXT4_NDIR_BLOCKS)
367 384
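
Threading __func__ and __LINE__ through a macro is what lets ext4_error_inode() report the caller's location rather than the checker's. The same idiom in a self-contained form (the names here are illustrative only):

#include <stdio.h>

static int check_ref(const char *func, unsigned int line, unsigned int blk)
{
	if (blk == 0) {		/* stand-in for ext4_data_block_valid() */
		fprintf(stderr, "%s:%u: invalid block %u\n", func, line, blk);
		return -1;
	}
	return 0;
}

/* the macro captures the call site; the helper just reports it */
#define CHECK_REF(blk)	check_ref(__func__, __LINE__, (blk))

int main(void)
{
	CHECK_REF(42);	/* fine */
	CHECK_REF(0);	/* reports main():<line of this call> */
	return 0;
}
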
368/** 385/**
@@ -744,6 +761,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
744 * parent to disk. 761 * parent to disk.
745 */ 762 */
746 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 763 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
764 if (unlikely(!bh)) {
765 err = -EIO;
766 goto failed;
767 }
768
747 branch[n].bh = bh; 769 branch[n].bh = bh;
748 lock_buffer(bh); 770 lock_buffer(bh);
749 BUFFER_TRACE(bh, "call get_create_access"); 771 BUFFER_TRACE(bh, "call get_create_access");
@@ -1128,20 +1150,24 @@ void ext4_da_update_reserve_space(struct inode *inode,
1128 ext4_discard_preallocations(inode); 1150 ext4_discard_preallocations(inode);
1129} 1151}
1130 1152
1131static int check_block_validity(struct inode *inode, const char *func, 1153static int __check_block_validity(struct inode *inode, const char *func,
1154 unsigned int line,
1132 struct ext4_map_blocks *map) 1155 struct ext4_map_blocks *map)
1133{ 1156{
1134 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, 1157 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1135 map->m_len)) { 1158 map->m_len)) {
1136 ext4_error_inode(func, inode, 1159 ext4_error_inode(inode, func, line, map->m_pblk,
1137 "lblock %lu mapped to illegal pblock %llu " 1160 "lblock %lu mapped to illegal pblock "
1138 "(length %d)", (unsigned long) map->m_lblk, 1161 "(length %d)", (unsigned long) map->m_lblk,
1139 map->m_pblk, map->m_len); 1162 map->m_len);
1140 return -EIO; 1163 return -EIO;
1141 } 1164 }
1142 return 0; 1165 return 0;
1143} 1166}
1144 1167
1168#define check_block_validity(inode, map) \
1169 __check_block_validity((inode), __func__, __LINE__, (map))
1170
1145/* 1171/*
1146 * Return the number of contiguous dirty pages in a given inode 1172 * Return the number of contiguous dirty pages in a given inode
1147 * starting at page frame idx. 1173 * starting at page frame idx.
@@ -1192,8 +1218,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1192 break; 1218 break;
1193 idx++; 1219 idx++;
1194 num++; 1220 num++;
1195 if (num >= max_pages) 1221 if (num >= max_pages) {
1222 done = 1;
1196 break; 1223 break;
1224 }
1197 } 1225 }
1198 pagevec_release(&pvec); 1226 pagevec_release(&pvec);
1199 } 1227 }
@@ -1244,7 +1272,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1244 up_read((&EXT4_I(inode)->i_data_sem)); 1272 up_read((&EXT4_I(inode)->i_data_sem));
1245 1273
1246 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1274 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1247 int ret = check_block_validity(inode, __func__, map); 1275 int ret = check_block_validity(inode, map);
1248 if (ret != 0) 1276 if (ret != 0)
1249 return ret; 1277 return ret;
1250 } 1278 }
@@ -1324,9 +1352,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1324 1352
1325 up_write((&EXT4_I(inode)->i_data_sem)); 1353 up_write((&EXT4_I(inode)->i_data_sem));
1326 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1354 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1327 int ret = check_block_validity(inode, 1355 int ret = check_block_validity(inode, map);
1328 "ext4_map_blocks_after_alloc",
1329 map);
1330 if (ret != 0) 1356 if (ret != 0)
1331 return ret; 1357 return ret;
1332 } 1358 }
@@ -1519,9 +1545,25 @@ static int walk_page_buffers(handle_t *handle,
1519static int do_journal_get_write_access(handle_t *handle, 1545static int do_journal_get_write_access(handle_t *handle,
1520 struct buffer_head *bh) 1546 struct buffer_head *bh)
1521{ 1547{
1548 int dirty = buffer_dirty(bh);
1549 int ret;
1550
1522 if (!buffer_mapped(bh) || buffer_freed(bh)) 1551 if (!buffer_mapped(bh) || buffer_freed(bh))
1523 return 0; 1552 return 0;
1524 return ext4_journal_get_write_access(handle, bh); 1553 /*
1554 * __block_write_begin() could have dirtied some buffers. Clean
1555 * the dirty bit as jbd2_journal_get_write_access() could complain
1556 * otherwise about fs integrity issues. Setting of the dirty bit
1557 * by __block_write_begin() isn't a real problem here as we clear
1558 * the bit before releasing a page lock and thus writeback cannot
1559 * ever write the buffer.
1560 */
1561 if (dirty)
1562 clear_buffer_dirty(bh);
1563 ret = ext4_journal_get_write_access(handle, bh);
1564 if (!ret && dirty)
1565 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1566 return ret;
1525} 1567}
1526 1568
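
The new do_journal_get_write_access() snapshots the dirty bit, clears it before asking the journal for write access, and then re-dirties the buffer through the journal's own interface. A stubbed analogue of that flow — the buffer and journal helpers below are placeholders:

#include <stdio.h>

struct buf { int dirty; };

static int journal_get_write_access(struct buf *b) { (void)b; return 0; }
static int journal_dirty_metadata(struct buf *b)   { b->dirty = 1; return 0; }

static int write_access(struct buf *bh)
{
	int dirty = bh->dirty;
	int ret;

	/*
	 * Clean the bit first so the journal layer does not complain,
	 * then restore dirtiness through the journal's own interface.
	 */
	if (dirty)
		bh->dirty = 0;
	ret = journal_get_write_access(bh);
	if (!ret && dirty)
		ret = journal_dirty_metadata(bh);
	return ret;
}

int main(void)
{
	struct buf b = { 1 };
	printf("ret=%d dirty=%d\n", write_access(&b), b.dirty);
	return 0;
}
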
1527/* 1569/*
@@ -1578,11 +1620,9 @@ retry:
1578 *pagep = page; 1620 *pagep = page;
1579 1621
1580 if (ext4_should_dioread_nolock(inode)) 1622 if (ext4_should_dioread_nolock(inode))
1581 ret = block_write_begin(file, mapping, pos, len, flags, pagep, 1623 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
1582 fsdata, ext4_get_block_write);
1583 else 1624 else
1584 ret = block_write_begin(file, mapping, pos, len, flags, pagep, 1625 ret = __block_write_begin(page, pos, len, ext4_get_block);
1585 fsdata, ext4_get_block);
1586 1626
1587 if (!ret && ext4_should_journal_data(inode)) { 1627 if (!ret && ext4_should_journal_data(inode)) {
1588 ret = walk_page_buffers(handle, page_buffers(page), 1628 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1593,7 +1633,7 @@ retry:
1593 unlock_page(page); 1633 unlock_page(page);
1594 page_cache_release(page); 1634 page_cache_release(page);
1595 /* 1635 /*
1596 * block_write_begin may have instantiated a few blocks 1636 * __block_write_begin may have instantiated a few blocks
1597 * outside i_size. Trim these off again. Don't need 1637 * outside i_size. Trim these off again. Don't need
1598 * i_size_read because we hold i_mutex. 1638 * i_size_read because we hold i_mutex.
1599 * 1639 *
@@ -1968,16 +2008,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1968 * 2008 *
1969 * As pages are already locked by write_cache_pages(), we can't use it 2009 * As pages are already locked by write_cache_pages(), we can't use it
1970 */ 2010 */
1971static int mpage_da_submit_io(struct mpage_da_data *mpd) 2011static int mpage_da_submit_io(struct mpage_da_data *mpd,
2012 struct ext4_map_blocks *map)
1972{ 2013{
1973 long pages_skipped;
1974 struct pagevec pvec; 2014 struct pagevec pvec;
1975 unsigned long index, end; 2015 unsigned long index, end;
1976 int ret = 0, err, nr_pages, i; 2016 int ret = 0, err, nr_pages, i;
1977 struct inode *inode = mpd->inode; 2017 struct inode *inode = mpd->inode;
1978 struct address_space *mapping = inode->i_mapping; 2018 struct address_space *mapping = inode->i_mapping;
2019 loff_t size = i_size_read(inode);
2020 unsigned int len, block_start;
2021 struct buffer_head *bh, *page_bufs = NULL;
2022 int journal_data = ext4_should_journal_data(inode);
2023 sector_t pblock = 0, cur_logical = 0;
2024 struct ext4_io_submit io_submit;
1979 2025
1980 BUG_ON(mpd->next_page <= mpd->first_page); 2026 BUG_ON(mpd->next_page <= mpd->first_page);
2027 memset(&io_submit, 0, sizeof(io_submit));
1981 /* 2028 /*
1982 * We need to start from the first_page to the next_page - 1 2029 * We need to start from the first_page to the next_page - 1
1983 * to make sure we also write the mapped dirty buffer_heads. 2030 * to make sure we also write the mapped dirty buffer_heads.
@@ -1993,122 +2040,108 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1993 if (nr_pages == 0) 2040 if (nr_pages == 0)
1994 break; 2041 break;
1995 for (i = 0; i < nr_pages; i++) { 2042 for (i = 0; i < nr_pages; i++) {
2043 int commit_write = 0, redirty_page = 0;
1996 struct page *page = pvec.pages[i]; 2044 struct page *page = pvec.pages[i];
1997 2045
1998 index = page->index; 2046 index = page->index;
1999 if (index > end) 2047 if (index > end)
2000 break; 2048 break;
2049
2050 if (index == size >> PAGE_CACHE_SHIFT)
2051 len = size & ~PAGE_CACHE_MASK;
2052 else
2053 len = PAGE_CACHE_SIZE;
2054 if (map) {
2055 cur_logical = index << (PAGE_CACHE_SHIFT -
2056 inode->i_blkbits);
2057 pblock = map->m_pblk + (cur_logical -
2058 map->m_lblk);
2059 }
2001 index++; 2060 index++;
2002 2061
2003 BUG_ON(!PageLocked(page)); 2062 BUG_ON(!PageLocked(page));
2004 BUG_ON(PageWriteback(page)); 2063 BUG_ON(PageWriteback(page));
2005 2064
2006 pages_skipped = mpd->wbc->pages_skipped;
2007 err = mapping->a_ops->writepage(page, mpd->wbc);
2008 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
2009 /*
2010 * have successfully written the page
2011 * without skipping the same
2012 */
2013 mpd->pages_written++;
2014 /* 2065 /*
2015 * In error case, we have to continue because 2066 * If the page does not have buffers (for
2016 * remaining pages are still locked 2067 * whatever reason), try to create them using
2017 * XXX: unlock and re-dirty them? 2068 * __block_write_begin. If this fails,
2069 * redirty the page and move on.
2018 */ 2070 */
2019 if (ret == 0) 2071 if (!page_has_buffers(page)) {
2020 ret = err; 2072 if (__block_write_begin(page, 0, len,
2021 } 2073 noalloc_get_block_write)) {
2022 pagevec_release(&pvec); 2074 redirty_page:
2023 } 2075 redirty_page_for_writepage(mpd->wbc,
2024 return ret; 2076 page);
2025} 2077 unlock_page(page);
2026 2078 continue;
2027/* 2079 }
2028 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2080 commit_write = 1;
2029 * 2081 }
2030 * the function goes through all passed space and put actual disk
2031 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2032 */
2033static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2034 struct ext4_map_blocks *map)
2035{
2036 struct inode *inode = mpd->inode;
2037 struct address_space *mapping = inode->i_mapping;
2038 int blocks = map->m_len;
2039 sector_t pblock = map->m_pblk, cur_logical;
2040 struct buffer_head *head, *bh;
2041 pgoff_t index, end;
2042 struct pagevec pvec;
2043 int nr_pages, i;
2044
2045 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2046 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2047 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2048
2049 pagevec_init(&pvec, 0);
2050
2051 while (index <= end) {
2052 /* XXX: optimize tail */
2053 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2054 if (nr_pages == 0)
2055 break;
2056 for (i = 0; i < nr_pages; i++) {
2057 struct page *page = pvec.pages[i];
2058
2059 index = page->index;
2060 if (index > end)
2061 break;
2062 index++;
2063
2064 BUG_ON(!PageLocked(page));
2065 BUG_ON(PageWriteback(page));
2066 BUG_ON(!page_has_buffers(page));
2067
2068 bh = page_buffers(page);
2069 head = bh;
2070
2071 /* skip blocks out of the range */
2072 do {
2073 if (cur_logical >= map->m_lblk)
2074 break;
2075 cur_logical++;
2076 } while ((bh = bh->b_this_page) != head);
2077 2082
2083 bh = page_bufs = page_buffers(page);
2084 block_start = 0;
2078 do { 2085 do {
2079 if (cur_logical >= map->m_lblk + blocks) 2086 if (!bh)
2080 break; 2087 goto redirty_page;
2081 2088 if (map && (cur_logical >= map->m_lblk) &&
2082 if (buffer_delay(bh) || buffer_unwritten(bh)) { 2089 (cur_logical <= (map->m_lblk +
2083 2090 (map->m_len - 1)))) {
2084 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2085
2086 if (buffer_delay(bh)) { 2091 if (buffer_delay(bh)) {
2087 clear_buffer_delay(bh); 2092 clear_buffer_delay(bh);
2088 bh->b_blocknr = pblock; 2093 bh->b_blocknr = pblock;
2089 } else {
2090 /*
2091 * unwritten already should have
2092 * blocknr assigned. Verify that
2093 */
2094 clear_buffer_unwritten(bh);
2095 BUG_ON(bh->b_blocknr != pblock);
2096 } 2094 }
2095 if (buffer_unwritten(bh) ||
2096 buffer_mapped(bh))
2097 BUG_ON(bh->b_blocknr != pblock);
2098 if (map->m_flags & EXT4_MAP_UNINIT)
2099 set_buffer_uninit(bh);
2100 clear_buffer_unwritten(bh);
2101 }
2097 2102
2098 } else if (buffer_mapped(bh)) 2103 /* redirty page if block allocation undone */
2099 BUG_ON(bh->b_blocknr != pblock); 2104 if (buffer_delay(bh) || buffer_unwritten(bh))
2100 2105 redirty_page = 1;
2101 if (map->m_flags & EXT4_MAP_UNINIT) 2106 bh = bh->b_this_page;
2102 set_buffer_uninit(bh); 2107 block_start += bh->b_size;
2103 cur_logical++; 2108 cur_logical++;
2104 pblock++; 2109 pblock++;
2105 } while ((bh = bh->b_this_page) != head); 2110 } while (bh != page_bufs);
2111
2112 if (redirty_page)
2113 goto redirty_page;
2114
2115 if (commit_write)
2116 /* mark the buffer_heads as dirty & uptodate */
2117 block_commit_write(page, 0, len);
2118
2119 /*
2120 * Delalloc doesn't support data journalling,
2121 * but eventually maybe we'll lift this
2122 * restriction.
2123 */
2124 if (unlikely(journal_data && PageChecked(page)))
2125 err = __ext4_journalled_writepage(page, len);
2126 else
2127 err = ext4_bio_write_page(&io_submit, page,
2128 len, mpd->wbc);
2129
2130 if (!err)
2131 mpd->pages_written++;
2132 /*
2133 * In error case, we have to continue because
2134 * remaining pages are still locked
2135 */
2136 if (ret == 0)
2137 ret = err;
2106 } 2138 }
2107 pagevec_release(&pvec); 2139 pagevec_release(&pvec);
2108 } 2140 }
2141 ext4_io_submit(&io_submit);
2142 return ret;
2109} 2143}
2110 2144
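
With submission folded into one pass, each page derives its physical block from the single map the caller produced: the page's logical block, minus the extent's starting logical block, indexes into the mapped physical run. The arithmetic in isolation, with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long m_lblk = 100, m_pblk = 50000, m_len = 64;
	unsigned long long cur_logical = 110;	/* first block of this page */

	if (cur_logical >= m_lblk && cur_logical <= m_lblk + (m_len - 1)) {
		unsigned long long pblock = m_pblk + (cur_logical - m_lblk);
		printf("pblock = %llu\n", pblock);	/* 50010 */
	}
	return 0;
}
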
2111
2112static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2145static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2113 sector_t logical, long blk_cnt) 2146 sector_t logical, long blk_cnt)
2114{ 2147{
@@ -2160,41 +2193,38 @@ static void ext4_print_free_blocks(struct inode *inode)
2160} 2193}
2161 2194
2162/* 2195/*
2163 * mpage_da_map_blocks - go through given space 2196 * mpage_da_map_and_submit - go through given space, map them
2197 * if necessary, and then submit them for I/O
2164 * 2198 *
2165 * @mpd - bh describing space 2199 * @mpd - bh describing space
2166 * 2200 *
2167 * The function skips space we know is already mapped to disk blocks. 2201 * The function skips space we know is already mapped to disk blocks.
2168 * 2202 *
2169 */ 2203 */
2170static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2204static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2171{ 2205{
2172 int err, blks, get_blocks_flags; 2206 int err, blks, get_blocks_flags;
2173 struct ext4_map_blocks map; 2207 struct ext4_map_blocks map, *mapp = NULL;
2174 sector_t next = mpd->b_blocknr; 2208 sector_t next = mpd->b_blocknr;
2175 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2209 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2176 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2210 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2177 handle_t *handle = NULL; 2211 handle_t *handle = NULL;
2178 2212
2179 /* 2213 /*
2180 * We consider only non-mapped and non-allocated blocks 2214 * If the blocks are mapped already, or we couldn't accumulate
2181 */ 2215 * any blocks, then proceed immediately to the submission stage.
2182 if ((mpd->b_state & (1 << BH_Mapped)) &&
2183 !(mpd->b_state & (1 << BH_Delay)) &&
2184 !(mpd->b_state & (1 << BH_Unwritten)))
2185 return 0;
2186
2187 /*
2188 * If we didn't accumulate anything to write simply return
2189 */ 2216 */
2190 if (!mpd->b_size) 2217 if ((mpd->b_size == 0) ||
2191 return 0; 2218 ((mpd->b_state & (1 << BH_Mapped)) &&
2219 !(mpd->b_state & (1 << BH_Delay)) &&
2220 !(mpd->b_state & (1 << BH_Unwritten))))
2221 goto submit_io;
2192 2222
2193 handle = ext4_journal_current_handle(); 2223 handle = ext4_journal_current_handle();
2194 BUG_ON(!handle); 2224 BUG_ON(!handle);
2195 2225
2196 /* 2226 /*
2197 * Call ext4_get_blocks() to allocate any delayed allocation 2227 * Call ext4_map_blocks() to allocate any delayed allocation
2198 * blocks, or to convert an uninitialized extent to be 2228 * blocks, or to convert an uninitialized extent to be
2199 * initialized (in the case where we have written into 2229 * initialized (in the case where we have written into
2200 * one or more preallocated blocks). 2230 * one or more preallocated blocks).
@@ -2203,7 +2233,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2203 * indicate that we are on the delayed allocation path. This 2233 * indicate that we are on the delayed allocation path. This
2204 * affects functions in many different parts of the allocation 2234 * affects functions in many different parts of the allocation
2205 * call path. This flag exists primarily because we don't 2235 * call path. This flag exists primarily because we don't
2206 * want to change *many* call functions, so ext4_get_blocks() 2236 * want to change *many* call functions, so ext4_map_blocks()
2207 * will set the magic i_delalloc_reserved_flag once the 2237 * will set the magic i_delalloc_reserved_flag once the
2208 * inode's allocation semaphore is taken. 2238 * inode's allocation semaphore is taken.
2209 * 2239 *
@@ -2221,19 +2251,22 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2221 2251
2222 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 2252 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2223 if (blks < 0) { 2253 if (blks < 0) {
2254 struct super_block *sb = mpd->inode->i_sb;
2255
2224 err = blks; 2256 err = blks;
2225 /* 2257 /*
 2226 * If get block returns with error we simply 2258 * If get block returns EAGAIN or ENOSPC and there
 2227 * return. Later writepage will redirty the page and 2259 * appear to be free blocks, we will call
2228 * writepages will find the dirty page again 2260 * ext4_writepage() for all of the pages which will
2261 * just redirty the pages.
2229 */ 2262 */
2230 if (err == -EAGAIN) 2263 if (err == -EAGAIN)
2231 return 0; 2264 goto submit_io;
2232 2265
2233 if (err == -ENOSPC && 2266 if (err == -ENOSPC &&
2234 ext4_count_free_blocks(mpd->inode->i_sb)) { 2267 ext4_count_free_blocks(sb)) {
2235 mpd->retval = err; 2268 mpd->retval = err;
2236 return 0; 2269 goto submit_io;
2237 } 2270 }
2238 2271
2239 /* 2272 /*
@@ -2243,24 +2276,26 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2243 * writepage and writepages will again try to write 2276 * writepage and writepages will again try to write
2244 * the same. 2277 * the same.
2245 */ 2278 */
2246 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2279 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2247 "delayed block allocation failed for inode %lu at " 2280 ext4_msg(sb, KERN_CRIT,
2248 "logical offset %llu with max blocks %zd with " 2281 "delayed block allocation failed for inode %lu "
2249 "error %d", mpd->inode->i_ino, 2282 "at logical offset %llu with max blocks %zd "
2250 (unsigned long long) next, 2283 "with error %d", mpd->inode->i_ino,
2251 mpd->b_size >> mpd->inode->i_blkbits, err); 2284 (unsigned long long) next,
2252 printk(KERN_CRIT "This should not happen!! " 2285 mpd->b_size >> mpd->inode->i_blkbits, err);
2253 "Data will be lost\n"); 2286 ext4_msg(sb, KERN_CRIT,
2254 if (err == -ENOSPC) { 2287 "This should not happen!! Data will be lost\n");
2255 ext4_print_free_blocks(mpd->inode); 2288 if (err == -ENOSPC)
2289 ext4_print_free_blocks(mpd->inode);
2256 } 2290 }
2257 /* invalidate all the pages */ 2291 /* invalidate all the pages */
2258 ext4_da_block_invalidatepages(mpd, next, 2292 ext4_da_block_invalidatepages(mpd, next,
2259 mpd->b_size >> mpd->inode->i_blkbits); 2293 mpd->b_size >> mpd->inode->i_blkbits);
2260 return err; 2294 return;
2261 } 2295 }
2262 BUG_ON(blks == 0); 2296 BUG_ON(blks == 0);
2263 2297
2298 mapp = &map;
2264 if (map.m_flags & EXT4_MAP_NEW) { 2299 if (map.m_flags & EXT4_MAP_NEW) {
2265 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2300 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2266 int i; 2301 int i;
@@ -2269,18 +2304,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2269 unmap_underlying_metadata(bdev, map.m_pblk + i); 2304 unmap_underlying_metadata(bdev, map.m_pblk + i);
2270 } 2305 }
2271 2306
2272 /*
2273 * If blocks are delayed marked, we need to
2274 * put actual blocknr and drop delayed bit
2275 */
2276 if ((mpd->b_state & (1 << BH_Delay)) ||
2277 (mpd->b_state & (1 << BH_Unwritten)))
2278 mpage_put_bnr_to_bhs(mpd, &map);
2279
2280 if (ext4_should_order_data(mpd->inode)) { 2307 if (ext4_should_order_data(mpd->inode)) {
2281 err = ext4_jbd2_file_inode(handle, mpd->inode); 2308 err = ext4_jbd2_file_inode(handle, mpd->inode);
2282 if (err) 2309 if (err)
2283 return err; 2310 /* This only happens if the journal is aborted */
2311 return;
2284 } 2312 }
2285 2313
2286 /* 2314 /*
@@ -2291,10 +2319,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2291 disksize = i_size_read(mpd->inode); 2319 disksize = i_size_read(mpd->inode);
2292 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2320 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2293 ext4_update_i_disksize(mpd->inode, disksize); 2321 ext4_update_i_disksize(mpd->inode, disksize);
2294 return ext4_mark_inode_dirty(handle, mpd->inode); 2322 err = ext4_mark_inode_dirty(handle, mpd->inode);
2323 if (err)
2324 ext4_error(mpd->inode->i_sb,
2325 "Failed to mark inode %lu dirty",
2326 mpd->inode->i_ino);
2295 } 2327 }
2296 2328
2297 return 0; 2329submit_io:
2330 mpage_da_submit_io(mpd, mapp);
2331 mpd->io_done = 1;
2298} 2332}
2299 2333
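
Callers previously paired mpage_da_map_blocks() with mpage_da_submit_io() and set io_done themselves; the new mpage_da_map_and_submit() folds all of that behind one call, with goto submit_io covering the early-out cases. Its control-flow shape, with stub helpers standing in for the kernel code:

#include <stdio.h>

struct mpd { int b_size; int already_mapped; int io_done; };

static int  map_blocks(struct mpd *m) { (void)m; return 0; }
static void submit(struct mpd *m)     { (void)m; printf("submit I/O\n"); }

static void map_and_submit(struct mpd *mpd)
{
	if (mpd->b_size == 0 || mpd->already_mapped)
		goto submit_io;		/* nothing to map, still submit */

	if (map_blocks(mpd) < 0)
		return;			/* hard error: pages invalidated */

submit_io:
	submit(mpd);
	mpd->io_done = 1;		/* exactly once, on every success path */
}

int main(void)
{
	struct mpd m = { 8, 0, 0 };
	map_and_submit(&m);
	return 0;
}
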
2300#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2334#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2320,7 +2354,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2320 * XXX Don't go larger than mballoc is willing to allocate 2354 * XXX Don't go larger than mballoc is willing to allocate
2321 * This is a stopgap solution. We eventually need to fold 2355 * This is a stopgap solution. We eventually need to fold
2322 * mpage_da_submit_io() into this function and then call 2356 * mpage_da_submit_io() into this function and then call
2323 * ext4_get_blocks() multiple times in a loop 2357 * ext4_map_blocks() multiple times in a loop
2324 */ 2358 */
2325 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) 2359 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2326 goto flush_it; 2360 goto flush_it;
@@ -2371,9 +2405,7 @@ flush_it:
2371 * We couldn't merge the block to our extent, so we 2405 * We couldn't merge the block to our extent, so we
2372 * need to flush current extent and start new one 2406 * need to flush current extent and start new one
2373 */ 2407 */
2374 if (mpage_da_map_blocks(mpd) == 0) 2408 mpage_da_map_and_submit(mpd);
2375 mpage_da_submit_io(mpd);
2376 mpd->io_done = 1;
2377 return; 2409 return;
2378} 2410}
2379 2411
@@ -2392,9 +2424,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2392 * The function finds extents of pages and scan them for all blocks. 2424 * The function finds extents of pages and scan them for all blocks.
2393 */ 2425 */
2394static int __mpage_da_writepage(struct page *page, 2426static int __mpage_da_writepage(struct page *page,
2395 struct writeback_control *wbc, void *data) 2427 struct writeback_control *wbc,
2428 struct mpage_da_data *mpd)
2396{ 2429{
2397 struct mpage_da_data *mpd = data;
2398 struct inode *inode = mpd->inode; 2430 struct inode *inode = mpd->inode;
2399 struct buffer_head *bh, *head; 2431 struct buffer_head *bh, *head;
2400 sector_t logical; 2432 sector_t logical;
@@ -2405,15 +2437,13 @@ static int __mpage_da_writepage(struct page *page,
2405 if (mpd->next_page != page->index) { 2437 if (mpd->next_page != page->index) {
2406 /* 2438 /*
2407 * Nope, we can't. So, we map non-allocated blocks 2439 * Nope, we can't. So, we map non-allocated blocks
2408 * and start IO on them using writepage() 2440 * and start IO on them
2409 */ 2441 */
2410 if (mpd->next_page != mpd->first_page) { 2442 if (mpd->next_page != mpd->first_page) {
2411 if (mpage_da_map_blocks(mpd) == 0) 2443 mpage_da_map_and_submit(mpd);
2412 mpage_da_submit_io(mpd);
2413 /* 2444 /*
2414 * skip rest of the page in the page_vec 2445 * skip rest of the page in the page_vec
2415 */ 2446 */
2416 mpd->io_done = 1;
2417 redirty_page_for_writepage(wbc, page); 2447 redirty_page_for_writepage(wbc, page);
2418 unlock_page(page); 2448 unlock_page(page);
2419 return MPAGE_DA_EXTENT_TAIL; 2449 return MPAGE_DA_EXTENT_TAIL;
@@ -2520,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2520 if (buffer_delay(bh)) 2550 if (buffer_delay(bh))
2521 return 0; /* Not sure this could or should happen */ 2551 return 0; /* Not sure this could or should happen */
2522 /* 2552 /*
2523 * XXX: __block_prepare_write() unmaps passed block, 2553 * XXX: __block_write_begin() unmaps passed block, is it OK?
2524 * is it OK?
2525 */ 2554 */
2526 ret = ext4_da_reserve_space(inode, iblock); 2555 ret = ext4_da_reserve_space(inode, iblock);
2527 if (ret) 2556 if (ret)
@@ -2553,18 +2582,16 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2553/* 2582/*
 2554 * This function is used as a standard get_block_t callback function 2583
2555 * when there is no desire to allocate any blocks. It is used as a 2584 * when there is no desire to allocate any blocks. It is used as a
2556 * callback function for block_prepare_write(), nobh_writepage(), and 2585 * callback function for block_write_begin() and block_write_full_page().
2557 * block_write_full_page(). These functions should only try to map a 2586 * These functions should only try to map a single block at a time.
2558 * single block at a time.
2559 * 2587 *
2560 * Since this function doesn't do block allocations even if the caller 2588 * Since this function doesn't do block allocations even if the caller
2561 * requests it by passing in create=1, it is critically important that 2589 * requests it by passing in create=1, it is critically important that
2562 * any caller checks to make sure that any buffer heads are returned 2590 * any caller checks to make sure that any buffer heads are returned
2563 * by this function are either all already mapped or marked for 2591 * by this function are either all already mapped or marked for
2564 * delayed allocation before calling nobh_writepage() or 2592 * delayed allocation before calling block_write_full_page(). Otherwise,
 2565 * block_write_full_page(). Otherwise, b_blocknr could be left 2593 b_blocknr could be left uninitialized, and the page write functions will
 2566 * uninitialized, and the page write functions will be taken by 2594 be taken by surprise.
2567 * surprise.
2568 */ 2595 */
2569static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2596static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2570 struct buffer_head *bh_result, int create) 2597 struct buffer_head *bh_result, int create)
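
A sketch of the contract the comment above describes — a get_block-style callback that only ever maps blocks that already exist, never allocating even when create is set. Everything here (the toy lookup() especially) is invented for illustration:

#include <stdio.h>

struct bh { unsigned long long blocknr; int mapped; };

/* toy mapping: pretend blocks below 1000 already exist on disk */
static long long lookup(unsigned long long lblk)
{
	return lblk < 1000 ? 4096 + (long long)lblk : -1;
}

static int noalloc_get_block(unsigned long long iblock,
			     struct bh *bh_result, int create)
{
	long long pblk = lookup(iblock);	/* never allocates */

	(void)create;		/* deliberately ignored, as described above */
	if (pblk < 0)
		return 0;	/* hole: leave bh_result unmapped */
	bh_result->blocknr = (unsigned long long)pblk;
	bh_result->mapped = 1;
	return 0;
}

int main(void)
{
	struct bh bh = { 0, 0 };
	noalloc_get_block(7, &bh, 1);
	printf("mapped=%d blocknr=%llu\n", bh.mapped, bh.blocknr);
	return 0;
}
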
@@ -2595,6 +2622,7 @@ static int __ext4_journalled_writepage(struct page *page,
2595 int ret = 0; 2622 int ret = 0;
2596 int err; 2623 int err;
2597 2624
2625 ClearPageChecked(page);
2598 page_bufs = page_buffers(page); 2626 page_bufs = page_buffers(page);
2599 BUG_ON(!page_bufs); 2627 BUG_ON(!page_bufs);
2600 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2628 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2672,7 +2700,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2672static int ext4_writepage(struct page *page, 2700static int ext4_writepage(struct page *page,
2673 struct writeback_control *wbc) 2701 struct writeback_control *wbc)
2674{ 2702{
2675 int ret = 0; 2703 int ret = 0, commit_write = 0;
2676 loff_t size; 2704 loff_t size;
2677 unsigned int len; 2705 unsigned int len;
2678 struct buffer_head *page_bufs = NULL; 2706 struct buffer_head *page_bufs = NULL;
@@ -2685,73 +2713,44 @@ static int ext4_writepage(struct page *page,
2685 else 2713 else
2686 len = PAGE_CACHE_SIZE; 2714 len = PAGE_CACHE_SIZE;
2687 2715
2688 if (page_has_buffers(page)) { 2716 /*
2689 page_bufs = page_buffers(page); 2717 * If the page does not have buffers (for whatever reason),
2690 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2718 * try to create them using __block_write_begin. If this
2691 ext4_bh_delay_or_unwritten)) { 2719 * fails, redirty the page and move on.
2692 /* 2720 */
2693 * We don't want to do block allocation 2721 if (!page_has_buffers(page)) {
2694 * So redirty the page and return 2722 if (__block_write_begin(page, 0, len,
2695 * We may reach here when we do a journal commit 2723 noalloc_get_block_write)) {
2696 * via journal_submit_inode_data_buffers. 2724 redirty_page:
2697 * If we don't have mapping block we just ignore
2698 * them. We can also reach here via shrink_page_list
2699 */
2700 redirty_page_for_writepage(wbc, page); 2725 redirty_page_for_writepage(wbc, page);
2701 unlock_page(page); 2726 unlock_page(page);
2702 return 0; 2727 return 0;
2703 } 2728 }
2704 } else { 2729 commit_write = 1;
2730 }
2731 page_bufs = page_buffers(page);
2732 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2733 ext4_bh_delay_or_unwritten)) {
2705 /* 2734 /*
2706 * The test for page_has_buffers() is subtle: 2735 * We don't want to do block allocation, so redirty
2707 * We know the page is dirty but it lost buffers. That means 2736 * the page and return. We may reach here when we do
2708 * that at some moment in time after write_begin()/write_end() 2737 * a journal commit via journal_submit_inode_data_buffers.
2709 * has been called all buffers have been clean and thus they 2738 * We can also reach here via shrink_page_list
2710 * must have been written at least once. So they are all
2711 * mapped and we can happily proceed with mapping them
2712 * and writing the page.
2713 *
2714 * Try to initialize the buffer_heads and check whether
2715 * all are mapped and non delay. We don't want to
2716 * do block allocation here.
2717 */ 2739 */
2718 ret = block_prepare_write(page, 0, len, 2740 goto redirty_page;
2719 noalloc_get_block_write); 2741 }
2720 if (!ret) { 2742 if (commit_write)
2721 page_bufs = page_buffers(page);
2722 /* check whether all are mapped and non delay */
2723 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2724 ext4_bh_delay_or_unwritten)) {
2725 redirty_page_for_writepage(wbc, page);
2726 unlock_page(page);
2727 return 0;
2728 }
2729 } else {
2730 /*
2731 * We can't do block allocation here
2732 * so just redity the page and unlock
2733 * and return
2734 */
2735 redirty_page_for_writepage(wbc, page);
2736 unlock_page(page);
2737 return 0;
2738 }
2739 /* now mark the buffer_heads as dirty and uptodate */ 2743 /* now mark the buffer_heads as dirty and uptodate */
2740 block_commit_write(page, 0, len); 2744 block_commit_write(page, 0, len);
2741 }
2742 2745
2743 if (PageChecked(page) && ext4_should_journal_data(inode)) { 2746 if (PageChecked(page) && ext4_should_journal_data(inode))
2744 /* 2747 /*
2745 * It's mmapped pagecache. Add buffers and journal it. There 2748 * It's mmapped pagecache. Add buffers and journal it. There
2746 * doesn't seem much point in redirtying the page here. 2749 * doesn't seem much point in redirtying the page here.
2747 */ 2750 */
2748 ClearPageChecked(page);
2749 return __ext4_journalled_writepage(page, len); 2751 return __ext4_journalled_writepage(page, len);
2750 }
2751 2752
2752 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2753 if (buffer_uninit(page_bufs)) {
2753 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2754 else if (page_bufs && buffer_uninit(page_bufs)) {
2755 ext4_set_bh_endio(page_bufs, inode); 2754 ext4_set_bh_endio(page_bufs, inode);
2756 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2755 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2757 wbc, ext4_end_io_buffer_write); 2756 wbc, ext4_end_io_buffer_write);
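
The rewritten ext4_writepage() replaces the old three-way branch with one pattern: if the page lost its buffers, try to recreate them with __block_write_begin() and redirty on failure; commit once if buffers were created. The shape of that flow, with placeholder helpers:

#include <stdio.h>

struct page { int has_buffers; int dirty; };

static int  create_buffers(struct page *p) { p->has_buffers = 1; return 0; }
static void redirty(struct page *p)        { p->dirty = 1; }
static void commit_write(struct page *p)   { (void)p; printf("commit\n"); }

static int writepage(struct page *page)
{
	int commit = 0;

	if (!page->has_buffers) {
		if (create_buffers(page)) {	/* __block_write_begin() */
			redirty(page);		/* cannot map: retry later */
			return 0;
		}
		commit = 1;
	}
	if (commit)
		commit_write(page);	/* mark buffers dirty & uptodate */
	return 0;
}

int main(void)
{
	struct page p = { 0, 0 };
	return writepage(&p);
}
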
@@ -2798,25 +2797,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2798 */ 2797 */
2799static int write_cache_pages_da(struct address_space *mapping, 2798static int write_cache_pages_da(struct address_space *mapping,
2800 struct writeback_control *wbc, 2799 struct writeback_control *wbc,
2801 struct mpage_da_data *mpd) 2800 struct mpage_da_data *mpd,
2801 pgoff_t *done_index)
2802{ 2802{
2803 int ret = 0; 2803 int ret = 0;
2804 int done = 0; 2804 int done = 0;
2805 struct pagevec pvec; 2805 struct pagevec pvec;
2806 int nr_pages; 2806 unsigned nr_pages;
2807 pgoff_t index; 2807 pgoff_t index;
2808 pgoff_t end; /* Inclusive */ 2808 pgoff_t end; /* Inclusive */
2809 long nr_to_write = wbc->nr_to_write; 2809 long nr_to_write = wbc->nr_to_write;
2810 int tag;
2810 2811
2811 pagevec_init(&pvec, 0); 2812 pagevec_init(&pvec, 0);
2812 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2813 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2813 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2814 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2814 2815
2816 if (wbc->sync_mode == WB_SYNC_ALL)
2817 tag = PAGECACHE_TAG_TOWRITE;
2818 else
2819 tag = PAGECACHE_TAG_DIRTY;
2820
2821 *done_index = index;
2815 while (!done && (index <= end)) { 2822 while (!done && (index <= end)) {
2816 int i; 2823 int i;
2817 2824
2818 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2825 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2819 PAGECACHE_TAG_DIRTY,
2820 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2826 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2821 if (nr_pages == 0) 2827 if (nr_pages == 0)
2822 break; 2828 break;
@@ -2836,6 +2842,8 @@ static int write_cache_pages_da(struct address_space *mapping,
2836 break; 2842 break;
2837 } 2843 }
2838 2844
2845 *done_index = page->index + 1;
2846
2839 lock_page(page); 2847 lock_page(page);
2840 2848
2841 /* 2849 /*
@@ -2921,6 +2929,8 @@ static int ext4_da_writepages(struct address_space *mapping,
2921 long desired_nr_to_write, nr_to_writebump = 0; 2929 long desired_nr_to_write, nr_to_writebump = 0;
2922 loff_t range_start = wbc->range_start; 2930 loff_t range_start = wbc->range_start;
2923 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2931 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2932 pgoff_t done_index = 0;
2933 pgoff_t end;
2924 2934
2925 trace_ext4_da_writepages(inode, wbc); 2935 trace_ext4_da_writepages(inode, wbc);
2926 2936
@@ -2956,8 +2966,11 @@ static int ext4_da_writepages(struct address_space *mapping,
2956 wbc->range_start = index << PAGE_CACHE_SHIFT; 2966 wbc->range_start = index << PAGE_CACHE_SHIFT;
2957 wbc->range_end = LLONG_MAX; 2967 wbc->range_end = LLONG_MAX;
2958 wbc->range_cyclic = 0; 2968 wbc->range_cyclic = 0;
2959 } else 2969 end = -1;
2970 } else {
2960 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2971 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2972 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2973 }
2961 2974
2962 /* 2975 /*
2963 * This works around two forms of stupidity. The first is in 2976 * This works around two forms of stupidity. The first is in
@@ -2976,9 +2989,12 @@ static int ext4_da_writepages(struct address_space *mapping,
2976 * sbi->max_writeback_mb_bump whichever is smaller. 2989 * sbi->max_writeback_mb_bump whichever is smaller.
2977 */ 2990 */
2978 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 2991 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2979 if (!range_cyclic && range_whole) 2992 if (!range_cyclic && range_whole) {
2980 desired_nr_to_write = wbc->nr_to_write * 8; 2993 if (wbc->nr_to_write == LONG_MAX)
2981 else 2994 desired_nr_to_write = wbc->nr_to_write;
2995 else
2996 desired_nr_to_write = wbc->nr_to_write * 8;
2997 } else
2982 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 2998 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2983 max_pages); 2999 max_pages);
2984 if (desired_nr_to_write > max_pages) 3000 if (desired_nr_to_write > max_pages)
@@ -2995,6 +3011,9 @@ static int ext4_da_writepages(struct address_space *mapping,
2995 pages_skipped = wbc->pages_skipped; 3011 pages_skipped = wbc->pages_skipped;
2996 3012
2997retry: 3013retry:
3014 if (wbc->sync_mode == WB_SYNC_ALL)
3015 tag_pages_for_writeback(mapping, index, end);
3016
2998 while (!ret && wbc->nr_to_write > 0) { 3017 while (!ret && wbc->nr_to_write > 0) {
2999 3018
3000 /* 3019 /*
@@ -3033,16 +3052,14 @@ retry:
3033 mpd.io_done = 0; 3052 mpd.io_done = 0;
3034 mpd.pages_written = 0; 3053 mpd.pages_written = 0;
3035 mpd.retval = 0; 3054 mpd.retval = 0;
3036 ret = write_cache_pages_da(mapping, wbc, &mpd); 3055 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3037 /* 3056 /*
3038 * If we have a contiguous extent of pages and we 3057 * If we have a contiguous extent of pages and we
3039 * haven't done the I/O yet, map the blocks and submit 3058 * haven't done the I/O yet, map the blocks and submit
3040 * them for I/O. 3059 * them for I/O.
3041 */ 3060 */
3042 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3061 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3043 if (mpage_da_map_blocks(&mpd) == 0) 3062 mpage_da_map_and_submit(&mpd);
3044 mpage_da_submit_io(&mpd);
3045 mpd.io_done = 1;
3046 ret = MPAGE_DA_EXTENT_TAIL; 3063 ret = MPAGE_DA_EXTENT_TAIL;
3047 } 3064 }
3048 trace_ext4_da_write_pages(inode, &mpd); 3065 trace_ext4_da_write_pages(inode, &mpd);
@@ -3089,14 +3106,13 @@ retry:
3089 __func__, wbc->nr_to_write, ret); 3106 __func__, wbc->nr_to_write, ret);
3090 3107
3091 /* Update index */ 3108 /* Update index */
3092 index += pages_written;
3093 wbc->range_cyclic = range_cyclic; 3109 wbc->range_cyclic = range_cyclic;
3094 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3110 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3095 /* 3111 /*
3096 * set the writeback_index so that range_cyclic 3112 * set the writeback_index so that range_cyclic
3097 * mode will write it back later 3113 * mode will write it back later
3098 */ 3114 */
3099 mapping->writeback_index = index; 3115 mapping->writeback_index = done_index;
3100 3116
3101out_writepages: 3117out_writepages:
3102 wbc->nr_to_write -= nr_to_writebump; 3118 wbc->nr_to_write -= nr_to_writebump;
@@ -3146,13 +3162,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3146 int ret, retries = 0; 3162 int ret, retries = 0;
3147 struct page *page; 3163 struct page *page;
3148 pgoff_t index; 3164 pgoff_t index;
3149 unsigned from, to;
3150 struct inode *inode = mapping->host; 3165 struct inode *inode = mapping->host;
3151 handle_t *handle; 3166 handle_t *handle;
3152 3167
3153 index = pos >> PAGE_CACHE_SHIFT; 3168 index = pos >> PAGE_CACHE_SHIFT;
3154 from = pos & (PAGE_CACHE_SIZE - 1);
3155 to = from + len;
3156 3169
3157 if (ext4_nonda_switch(inode->i_sb)) { 3170 if (ext4_nonda_switch(inode->i_sb)) {
3158 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 3171 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -3185,8 +3198,7 @@ retry:
3185 } 3198 }
3186 *pagep = page; 3199 *pagep = page;
3187 3200
3188 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 3201 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
3189 ext4_da_get_block_prep);
3190 if (ret < 0) { 3202 if (ret < 0) {
3191 unlock_page(page); 3203 unlock_page(page);
3192 ext4_journal_stop(handle); 3204 ext4_journal_stop(handle);
@@ -3435,15 +3447,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3435 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3447 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3436} 3448}
3437 3449
3438static void ext4_free_io_end(ext4_io_end_t *io)
3439{
3440 BUG_ON(!io);
3441 if (io->page)
3442 put_page(io->page);
3443 iput(io->inode);
3444 kfree(io);
3445}
3446
3447static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3450static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3448{ 3451{
3449 struct buffer_head *head, *bh; 3452 struct buffer_head *head, *bh;
@@ -3545,15 +3548,24 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3545 3548
3546retry: 3549retry:
3547 if (rw == READ && ext4_should_dioread_nolock(inode)) 3550 if (rw == READ && ext4_should_dioread_nolock(inode))
3548 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 3551 ret = __blockdev_direct_IO(rw, iocb, inode,
3549 inode->i_sb->s_bdev, iov, 3552 inode->i_sb->s_bdev, iov,
3550 offset, nr_segs, 3553 offset, nr_segs,
3551 ext4_get_block, NULL); 3554 ext4_get_block, NULL, NULL, 0);
3552 else 3555 else {
3553 ret = blockdev_direct_IO(rw, iocb, inode, 3556 ret = blockdev_direct_IO(rw, iocb, inode,
3554 inode->i_sb->s_bdev, iov, 3557 inode->i_sb->s_bdev, iov,
3555 offset, nr_segs, 3558 offset, nr_segs,
3556 ext4_get_block, NULL); 3559 ext4_get_block, NULL);
3560
3561 if (unlikely((rw & WRITE) && ret < 0)) {
3562 loff_t isize = i_size_read(inode);
3563 loff_t end = offset + iov_length(iov, nr_segs);
3564
3565 if (end > isize)
3566 vmtruncate(inode, isize);
3567 }
3568 }
3557 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3569 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3558 goto retry; 3570 goto retry;
3559 3571
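
A failed direct write may still have instantiated blocks beyond i_size, so the new error path computes the attempted end offset and truncates back when it passed i_size. The arithmetic on its own, with sample values:

#include <stdio.h>

int main(void)
{
	long long isize  = 8192;	/* i_size_read(inode) */
	long long offset = 4096;	/* write start */
	long long len    = 16384;	/* iov_length(iov, nr_segs) */
	long long end    = offset + len;

	if (end > isize)		/* vmtruncate(inode, isize) here */
		printf("truncate back to %lld\n", isize);
	return 0;
}
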
@@ -3611,171 +3623,9 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3611 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3623 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3612} 3624}
3613 3625
3614static void dump_completed_IO(struct inode * inode)
3615{
3616#ifdef EXT4_DEBUG
3617 struct list_head *cur, *before, *after;
3618 ext4_io_end_t *io, *io0, *io1;
3619 unsigned long flags;
3620
3621 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3622 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3623 return;
3624 }
3625
3626 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3627 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3628 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3629 cur = &io->list;
3630 before = cur->prev;
3631 io0 = container_of(before, ext4_io_end_t, list);
3632 after = cur->next;
3633 io1 = container_of(after, ext4_io_end_t, list);
3634
3635 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3636 io, inode->i_ino, io0, io1);
3637 }
3638 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3639#endif
3640}
3641
3642/*
3643 * check a range of space and convert unwritten extents to written.
3644 */
3645static int ext4_end_io_nolock(ext4_io_end_t *io)
3646{
3647 struct inode *inode = io->inode;
3648 loff_t offset = io->offset;
3649 ssize_t size = io->size;
3650 int ret = 0;
3651
3652 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3653 "list->prev 0x%p\n",
3654 io, inode->i_ino, io->list.next, io->list.prev);
3655
3656 if (list_empty(&io->list))
3657 return ret;
3658
3659 if (io->flag != EXT4_IO_UNWRITTEN)
3660 return ret;
3661
3662 ret = ext4_convert_unwritten_extents(inode, offset, size);
3663 if (ret < 0) {
3664 printk(KERN_EMERG "%s: failed to convert unwritten"
3665 "extents to written extents, error is %d"
3666 " io is still on inode %lu aio dio list\n",
3667 __func__, ret, inode->i_ino);
3668 return ret;
3669 }
3670
3671 /* clear the DIO AIO unwritten flag */
3672 io->flag = 0;
3673 return ret;
3674}
3675
3676/*
3677 * work on completed aio dio IO, to convert unwritten extents to extents
3678 */
3679static void ext4_end_io_work(struct work_struct *work)
3680{
3681 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3682 struct inode *inode = io->inode;
3683 struct ext4_inode_info *ei = EXT4_I(inode);
3684 unsigned long flags;
3685 int ret;
3686
3687 mutex_lock(&inode->i_mutex);
3688 ret = ext4_end_io_nolock(io);
3689 if (ret < 0) {
3690 mutex_unlock(&inode->i_mutex);
3691 return;
3692 }
3693
3694 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3695 if (!list_empty(&io->list))
3696 list_del_init(&io->list);
3697 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3698 mutex_unlock(&inode->i_mutex);
3699 ext4_free_io_end(io);
3700}
3701
3702/*
3703 * This function is called from ext4_sync_file().
3704 *
3705 * When IO is completed, the work to convert unwritten extents to
3706 * written is queued on workqueue but may not get immediately
3707 * scheduled. When fsync is called, we need to ensure the
3708 * conversion is complete before fsync returns.
3709 * The inode keeps track of a list of pending/completed IO that
3710 * might need to do the conversion. This function walks through
3711 * the list and converts the related unwritten extents for completed IO
3712 * to written.
3713 * The function returns the number of pending IOs on success.
3714 */
3715int flush_completed_IO(struct inode *inode)
3716{
3717 ext4_io_end_t *io;
3718 struct ext4_inode_info *ei = EXT4_I(inode);
3719 unsigned long flags;
3720 int ret = 0;
3721 int ret2 = 0;
3722
3723 if (list_empty(&ei->i_completed_io_list))
3724 return ret;
3725
3726 dump_completed_IO(inode);
3727 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3728 while (!list_empty(&ei->i_completed_io_list)){
3729 io = list_entry(ei->i_completed_io_list.next,
3730 ext4_io_end_t, list);
3731 /*
3732 * Calling ext4_end_io_nolock() to convert completed
3733 * IO to written.
3734 *
3735 * When ext4_sync_file() is called, run_queue() may already be
3736 * about to flush the work corresponding to this io structure.
3737 * It will be upset if it finds that the io structure related
3738 * to the work to be scheduled has been freed.
3739 *
3740 * Thus we need to keep the io structure still valid here after
3741 * conversion finished. The io structure has a flag to
3742 * avoid double converting from both fsync and background work
3743 * queue work.
3744 */
3745 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3746 ret = ext4_end_io_nolock(io);
3747 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3748 if (ret < 0)
3749 ret2 = ret;
3750 else
3751 list_del_init(&io->list);
3752 }
3753 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3754 return (ret2 < 0) ? ret2 : 0;
3755}
3756
3757static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3758{
3759 ext4_io_end_t *io = NULL;
3760
3761 io = kmalloc(sizeof(*io), flags);
3762
3763 if (io) {
3764 igrab(inode);
3765 io->inode = inode;
3766 io->flag = 0;
3767 io->offset = 0;
3768 io->size = 0;
3769 io->page = NULL;
3770 INIT_WORK(&io->work, ext4_end_io_work);
3771 INIT_LIST_HEAD(&io->list);
3772 }
3773
3774 return io;
3775}
3776
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-			    ssize_t size, void *private)
+			    ssize_t size, void *private, int ret,
+			    bool is_async)
 {
 	ext4_io_end_t *io_end = iocb->private;
 	struct workqueue_struct *wq;
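The removed flush_completed_IO() above walks the per-inode completed-IO list while dropping i_completed_io_lock around each conversion call, since ext4_end_io_nolock() may block, and only unlinks an entry once the conversion succeeds. A rough userspace sketch of that unlock-around-work pattern, with a pthread mutex standing in for the kernel spinlock (all names are illustrative, and the loop is simplified to stop at the first error):

    #include <pthread.h>
    #include <stdio.h>

    struct io_end {
            struct io_end *next;
            int converted;
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct io_end *completed_list;

    /* stand-in for ext4_end_io_nolock(); may block, so call it unlocked */
    static int convert_one(struct io_end *io)
    {
            io->converted = 1;
            return 0;
    }

    static int flush_completed(void)
    {
            int ret2 = 0;

            pthread_mutex_lock(&list_lock);
            while (completed_list) {
                    struct io_end *io = completed_list;
                    int ret;

                    /* drop the lock around the potentially blocking call */
                    pthread_mutex_unlock(&list_lock);
                    ret = convert_one(io);
                    pthread_mutex_lock(&list_lock);

                    if (ret < 0) {
                            ret2 = ret;             /* remember, give up */
                            break;
                    }
                    completed_list = io->next;      /* unlink on success */
            }
            pthread_mutex_unlock(&list_lock);
            return ret2;
    }

    int main(void)
    {
            struct io_end a = { NULL, 0 }, b = { &a, 0 };

            completed_list = &b;
            return flush_completed() ? 1 : 0;
    }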
@@ -3784,7 +3634,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	/* if not async direct IO or dio with 0 bytes write, just return */
 	if (!io_end || !size)
-		return;
+		goto out;
 
 	ext_debug("ext4_end_io_dio(): io_end 0x%p"
 		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
@@ -3792,25 +3642,31 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 		  size);
 
 	/* if not aio dio with unwritten extents, just free io and return */
-	if (io_end->flag != EXT4_IO_UNWRITTEN){
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
 		ext4_free_io_end(io_end);
 		iocb->private = NULL;
+out:
+		if (is_async)
+			aio_complete(iocb, ret, 0);
 		return;
 	}
 
 	io_end->offset = offset;
 	io_end->size = size;
-	io_end->flag = EXT4_IO_UNWRITTEN;
+	if (is_async) {
+		io_end->iocb = iocb;
+		io_end->result = ret;
+	}
 	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
-	/* queue the work to convert unwritten extents to written */
-	queue_work(wq, &io_end->work);
-
 	/* Add the io_end to per-inode completed aio dio list*/
 	ei = EXT4_I(io_end->inode);
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 	list_add_tail(&io_end->list, &ei->i_completed_io_list);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
 	iocb->private = NULL;
 }
 
@@ -3831,7 +3687,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 		goto out;
 	}
 
-	io_end->flag = EXT4_IO_UNWRITTEN;
+	io_end->flag = EXT4_IO_END_UNWRITTEN;
 	inode = io_end->inode;
 
 	/* Add the io_end to per-inode completed io list*/
@@ -3937,7 +3793,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 			return -ENOMEM;
 		/*
 		 * we save the io structure for current async
-		 * direct IO, so that later ext4_get_blocks()
+		 * direct IO, so that later ext4_map_blocks()
 		 * could flag the io structure whether there
 		 * is a unwritten extents needs to be converted
 		 * when IO is completed.
@@ -4128,17 +3984,6 @@ int ext4_block_truncate_page(handle_t *handle,
 	length = blocksize - (offset & (blocksize - 1));
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
-	/*
-	 * For "nobh" option, we can only work if we don't need to
-	 * read-in the page - otherwise we create buffers to do the IO.
-	 */
-	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
-	    ext4_should_writeback_data(inode) && PageUptodate(page)) {
-		zero_user(page, offset, length);
-		set_page_dirty(page);
-		goto unlock;
-	}
-
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
 
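The two context lines kept at the top of this hunk compute how many bytes remain from the truncation offset to the end of its block, and which logical block the page index maps to. A standalone check of that arithmetic (4 KiB pages and 1 KiB blocks assumed for the example):

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_shift = 12;          /* 4 KiB pages */
            unsigned long blocksize_bits = 10;      /* 1 KiB blocks */
            unsigned long blocksize = 1UL << blocksize_bits;

            unsigned long offset = 2500;            /* offset within the page */
            unsigned long index = 7;                /* page index in the file */

            /* bytes from 'offset' to the end of its block */
            unsigned long length = blocksize - (offset & (blocksize - 1));
            /* first file-system block covered by this page */
            unsigned long iblock = index << (page_shift - blocksize_bits);

            printf("length = %lu, iblock = %lu\n", length, iblock);
            /* prints: length = 572, iblock = 28 */
            return 0;
    }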
@@ -4488,9 +4333,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			 * (should be rare).
 			 */
 			if (!bh) {
-				EXT4_ERROR_INODE(inode,
-						 "Read failure block=%llu",
-						 (unsigned long long) nr);
+				EXT4_ERROR_INODE_BLOCK(inode, nr,
+						       "Read failure");
 				continue;
 			}
 
@@ -4502,27 +4346,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 					   depth);
 
 			/*
-			 * We've probably journalled the indirect block several
-			 * times during the truncate. But it's no longer
-			 * needed and we now drop it from the transaction via
-			 * jbd2_journal_revoke().
-			 *
-			 * That's easy if it's exclusively part of this
-			 * transaction. But if it's part of the committing
-			 * transaction then jbd2_journal_forget() will simply
-			 * brelse() it. That means that if the underlying
-			 * block is reallocated in ext4_get_block(),
-			 * unmap_underlying_metadata() will find this block
-			 * and will try to get rid of it. damn, damn.
-			 *
-			 * If this block has already been committed to the
-			 * journal, a revoke record will be written. And
-			 * revoke records must be emitted *before* clearing
-			 * this block's bit in the bitmaps.
-			 */
-			ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
-
-			/*
 			 * Everything below this this pointer has been
 			 * released. Now let this top-of-subtree go.
 			 *
@@ -4546,8 +4369,20 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 					    blocks_for_truncate(inode));
 			}
 
+			/*
+			 * The forget flag here is critical because if
+			 * we are journaling (and not doing data
+			 * journaling), we have to make sure a revoke
+			 * record is written to prevent the journal
+			 * replay from overwriting the (former)
+			 * indirect block if it gets reallocated as a
+			 * data block. This must happen in the same
+			 * transaction where the data blocks are
+			 * actually freed.
+			 */
 			ext4_free_blocks(handle, inode, 0, nr, 1,
-					 EXT4_FREE_BLOCKS_METADATA);
+					 EXT4_FREE_BLOCKS_METADATA|
+					 EXT4_FREE_BLOCKS_FORGET);
 
 			if (parent_bh) {
 				/*
@@ -4805,8 +4640,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 	bh = sb_getblk(sb, block);
 	if (!bh) {
-		EXT4_ERROR_INODE(inode, "unable to read inode block - "
-				 "block %llu", block);
+		EXT4_ERROR_INODE_BLOCK(inode, block,
+				       "unable to read itable block");
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -4904,8 +4739,8 @@ make_io:
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			EXT4_ERROR_INODE(inode, "unable to read inode "
-					 "block %llu", block);
+			EXT4_ERROR_INODE_BLOCK(inode, block,
+					       "unable to read itable block");
 			brelse(bh);
 			return -EIO;
 		}
@@ -4976,7 +4811,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
 		/* we are using combined 48 bit field */
 		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
 					le32_to_cpu(raw_inode->i_blocks_lo);
-		if (ei->i_flags & EXT4_HUGE_FILE_FL) {
+		if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
 			/* i_blocks represent file system block size */
 			return i_blocks << (inode->i_blkbits - 9);
 		} else {
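The surrounding code splices a 16-bit high word onto the 32-bit low word to form the combined 48-bit block count; for huge files the count is kept in filesystem blocks rather than 512-byte units, hence the extra shift. A standalone sketch of that arithmetic (the values are made up for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint16_t i_blocks_high = 0x0001;
            uint32_t i_blocks_lo = 0x80000000u;
            int huge_file = 1;
            unsigned int blkbits = 12;      /* 4 KiB filesystem blocks */

            /* splice the 16-bit high word onto the 32-bit low word */
            uint64_t i_blocks = ((uint64_t)i_blocks_high << 32) | i_blocks_lo;

            if (huge_file) {
                    /* count is in fs blocks; convert to 512-byte units */
                    i_blocks <<= (blkbits - 9);
            }
            printf("i_blocks = %llu\n", (unsigned long long)i_blocks);
            return 0;
    }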
@@ -5072,7 +4907,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		transaction_t *transaction;
 		tid_t tid;
 
-		spin_lock(&journal->j_state_lock);
+		read_lock(&journal->j_state_lock);
 		if (journal->j_running_transaction)
 			transaction = journal->j_running_transaction;
 		else
@@ -5081,7 +4916,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			tid = transaction->t_tid;
 		else
 			tid = journal->j_commit_sequence;
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		ei->i_sync_tid = tid;
 		ei->i_datasync_tid = tid;
 	}
@@ -5126,7 +4961,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			   ei->i_file_acl);
 		ret = -EIO;
 		goto bad_inode;
-	} else if (ei->i_flags & EXT4_EXTENTS_FL) {
+	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		    (S_ISLNK(inode->i_mode) &&
 		     !ext4_inode_is_fast_symlink(inode)))
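Several hunks here replace open-coded `ei->i_flags & FLAG` mask tests with ext4_test_inode_flag(), which takes the flag as a bit number. The two styles are equivalent whenever the bit number matches the mask, as this small demo shows (the values are illustrative, though they mirror the usual EXT4_EXTENTS_FL encoding):

    #include <stdio.h>

    #define DEMO_EXTENTS_FL    0x00080000   /* mask-style flag */
    #define DEMO_INODE_EXTENTS 19           /* same flag as a bit number */

    static int test_flag(unsigned long flags, int bit)
    {
            return (flags >> bit) & 1;
    }

    int main(void)
    {
            unsigned long i_flags = DEMO_EXTENTS_FL;

            /* open-coded mask test vs. accessor-style bit test */
            printf("mask test: %d\n", !!(i_flags & DEMO_EXTENTS_FL));
            printf("bit test:  %d\n", test_flag(i_flags, DEMO_INODE_EXTENTS));
            return 0;
    }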
@@ -5406,9 +5241,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		sync_dirty_buffer(iloc.bh);
 	if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-		EXT4_ERROR_INODE(inode,
-			"IO error syncing inode (block=%llu)",
-			(unsigned long long) iloc.bh->b_blocknr);
+		EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
+				       "IO error syncing inode");
 		err = -EIO;
 	}
 	brelse(iloc.bh);
@@ -5444,6 +5278,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	int error, rc = 0;
+	int orphan = 0;
 	const unsigned int ia_valid = attr->ia_valid;
 
 	error = inode_change_ok(inode, attr);
@@ -5483,10 +5318,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
-			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
-				error = -EFBIG;
-				goto err_out;
-			}
+			if (attr->ia_size > sbi->s_bitmap_maxbytes)
+				return -EFBIG;
 		}
 	}
 
@@ -5501,8 +5334,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-
-		error = ext4_orphan_add(handle, inode);
+		if (ext4_handle_valid(handle)) {
+			error = ext4_orphan_add(handle, inode);
+			orphan = 1;
+		}
 		EXT4_I(inode)->i_disksize = attr->ia_size;
 		rc = ext4_mark_inode_dirty(handle, inode);
 		if (!error)
@@ -5520,6 +5355,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				goto err_out;
 			}
 			ext4_orphan_del(handle, inode);
+			orphan = 0;
 			ext4_journal_stop(handle);
 			goto err_out;
 		}
@@ -5529,12 +5365,20 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		ext4_truncate(inode);
 	}
 
-	rc = inode_setattr(inode, attr);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode))
+		rc = vmtruncate(inode, attr->ia_size);
 
-	/* If inode_setattr's call to ext4_truncate failed to get a
-	 * transaction handle at all, we need to clean up the in-core
-	 * orphan list manually. */
-	if (inode->i_nlink)
+	if (!rc) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
+	}
+
+	/*
+	 * If the call to ext4_truncate failed to get a transaction handle at
+	 * all, we need to clean up the in-core orphan list manually.
+	 */
+	if (orphan && inode->i_nlink)
 		ext4_orphan_del(NULL, inode);
 
 	if (!rc && (ia_valid & ATTR_MODE))
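The new `orphan` variable is a cleanup-flag idiom: remember whether ext4_orphan_add() actually ran, and undo it on the failure path only in that case (important now that the add is conditional on ext4_handle_valid()). A compact standalone sketch of the idiom, with made-up helper names:

    #include <stdio.h>

    static int add_to_orphan_list(void)
    {
            puts("added to orphan list");
            return 0;
    }

    static void del_from_orphan_list(void)
    {
            puts("removed from orphan list");
    }

    static int set_size(int journaled, int fail_truncate)
    {
            int orphan = 0;
            int rc;

            if (journaled && add_to_orphan_list() == 0)
                    orphan = 1;             /* remember the side effect */

            rc = fail_truncate ? -1 : 0;    /* stand-in for the truncate */

            /* undo the side effect only if it actually happened */
            if (orphan && rc < 0)
                    del_from_orphan_list();
            return rc;
    }

    int main(void)
    {
            set_size(1, 1); /* journaled + failing truncate: cleanup runs */
            set_size(0, 1); /* not journaled: nothing to clean up */
            return 0;
    }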
@@ -5617,7 +5461,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
 	int gdpblocks;
@@ -5688,7 +5532,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
  * Calculate the journal credits for a chunk of data modification.
  *
  * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
+ * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
  *
  * journal buffers for data blocks are not included here, as DIO
  * and fallocate do no need to journal data buffers.
@@ -5754,7 +5598,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
 {
 	struct ext4_inode *raw_inode;
 	struct ext4_xattr_ibody_header *header;
-	struct ext4_xattr_entry *entry;
 
 	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
 		return 0;
@@ -5762,7 +5605,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
 	raw_inode = ext4_raw_inode(&iloc);
 
 	header = IHDR(inode, raw_inode);
-	entry = IFIRST(header);
 
 	/* No extended attributes present */
 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 12b3bc026a6..c58eba34724 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -338,6 +338,14 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_ext_cachep;
+
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size. There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+#define NR_GRPINFO_CACHES \
+	(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
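With the usual ext4 limits of EXT4_MIN_BLOCK_LOG_SIZE = 10 (1 KiB blocks) and EXT4_MAX_BLOCK_LOG_SIZE = 16 (64 KiB blocks) — assumed here; check ext4.h for the tree in question — this gives seven caches, indexed by s_blocksize_bits minus the minimum. A quick standalone check of the index math:

    #include <stdio.h>

    #define MIN_BLOCK_LOG_SIZE 10   /* 1 KiB blocks (assumed, as in ext4.h) */
    #define MAX_BLOCK_LOG_SIZE 16   /* 64 KiB blocks */
    #define NR_GRPINFO_CACHES (MAX_BLOCK_LOG_SIZE - MIN_BLOCK_LOG_SIZE + 1)

    int main(void)
    {
            printf("%d caches\n", NR_GRPINFO_CACHES);       /* 7 */
            for (int bits = MIN_BLOCK_LOG_SIZE; bits <= MAX_BLOCK_LOG_SIZE; bits++)
                    printf("blocksize %6d -> cache index %d\n",
                           1 << bits, bits - MIN_BLOCK_LOG_SIZE);
            return 0;
    }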
@@ -446,10 +454,11 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 			blocknr += first + i;
 			ext4_grp_locked_error(sb, e4b->bd_group,
-					__func__, "double-free of inode"
-					" %lu's block %llu(bit %u in group %u)",
-					inode ? inode->i_ino : 0, blocknr,
-					first + i, e4b->bd_group);
+					      inode ? inode->i_ino : 0,
+					      blocknr,
+					      "freeing block already freed "
+					      "(bit %u)",
+					      first + i);
 		}
 		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
 	}
@@ -712,9 +721,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 	grp->bb_fragments = fragments;
 
 	if (free != grp->bb_free) {
-		ext4_grp_locked_error(sb, group, __func__,
-			"EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
-			group, free, grp->bb_free);
+		ext4_grp_locked_error(sb, group, 0, 0,
+				      "%u blocks in bitmap, %u in gd",
+				      free, grp->bb_free);
 		/*
 		 * If we intent to continue, we consider group descritor
 		 * corrupt and update bb_free using bitmap value
@@ -938,6 +947,85 @@ out:
 }
 
 /*
+ * lock the group_info alloc_sem of all the groups
+ * belonging to the same buddy cache page. This
+ * make sure other parallel operation on the buddy
+ * cache doesn't happen whild holding the buddy cache
+ * lock
+ */
+static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
+					ext4_group_t group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	int groups_per_page;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+
+	groups_per_page = blocks_per_page >> 1;
+	if (groups_per_page == 0)
+		groups_per_page = 1;
+	/* read all groups the page covers into the cache */
+	for (i = 0; i < groups_per_page; i++) {
+
+		if ((first_group + i) >= ngroups)
+			break;
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* take all groups write allocation
+		 * semaphore. This make sure there is
+		 * no block allocation going on in any
+		 * of that groups
+		 */
+		down_write_nested(&grp->alloc_sem, i);
+	}
+	return i;
+}
+
+static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+					 ext4_group_t group, int locked_group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+	/* release locks on all the groups */
+	for (i = 0; i < locked_group; i++) {
+
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* take all groups write allocation
+		 * semaphore. This make sure there is
+		 * no block allocation going on in any
+		 * of that groups
+		 */
+		up_write(&grp->alloc_sem);
+	}
+
+}
+
+/*
  * Locking note: This routine calls ext4_mb_init_cache(), which takes the
  * block group lock of all groups for this page; do not hold the BG lock when
  * calling this routine!
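The helpers moved in above depend on the buddy cache layout: two blocks per group (block bitmap plus buddy data), so the first group sharing a page follows from the group number and blocks-per-page alone. A standalone check of that mapping, assuming 4 KiB pages and 1 KiB blocks:

    #include <stdio.h>

    int main(void)
    {
            int page_size = 4096;
            int blocksize = 1024;
            int blocks_per_page = page_size / blocksize;    /* 4 */
            int groups_per_page = blocks_per_page >> 1;     /* 2 per page */

            for (int group = 0; group < 6; group++) {
                    int block = group * 2;                  /* bitmap+buddy */
                    int pnum = block / blocks_per_page;     /* page index */
                    int first_group = pnum * blocks_per_page / 2;

                    printf("group %d -> page %d, first group on page %d "
                           "(%d groups/page)\n",
                           group, pnum, first_group, groups_per_page);
            }
            return 0;
    }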
@@ -1296,10 +1384,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 			blocknr += block;
 			ext4_grp_locked_error(sb, e4b->bd_group,
-					__func__, "double-free of inode"
-					" %lu's block %llu(bit %u in group %u)",
-					inode ? inode->i_ino : 0, blocknr, block,
-					e4b->bd_group);
+					      inode ? inode->i_ino : 0,
+					      blocknr,
+					      "freeing already freed block "
+					      "(bit %u)", block);
 		}
 		mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
 		e4b->bd_info->bb_counters[order]++;
@@ -1788,8 +1876,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 			 * free blocks even though group info says we
 			 * we have free blocks
 			 */
-			ext4_grp_locked_error(sb, e4b->bd_group,
-					__func__, "%d free blocks as per "
+			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+					"%d free blocks as per "
 					"group info. But bitmap says 0",
 					free);
 			break;
@@ -1798,8 +1886,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 		mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
 		BUG_ON(ex.fe_len <= 0);
 		if (free < ex.fe_len) {
-			ext4_grp_locked_error(sb, e4b->bd_group,
-					__func__, "%d free blocks as per "
+			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+					"%d free blocks as per "
 					"group info. But got %d blocks",
 					free, ex.fe_len);
 			/*
@@ -1821,8 +1909,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 
 /*
  * This is a special case for storages like raid5
- * we try to find stripe-aligned chunks for stripe-size requests
- * XXX should do so at least for multiples of stripe size as well
+ * we try to find stripe-aligned chunks for stripe-size-multiple requests
  */
 static noinline_for_stack
 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
@@ -1915,91 +2002,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
-/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen whild holding the buddy cache
- * lock
- */
-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
-{
-	int i;
-	int block, pnum;
-	int blocks_per_page;
-	int groups_per_page;
-	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-
-	groups_per_page = blocks_per_page >> 1;
-	if (groups_per_page == 0)
-		groups_per_page = 1;
-	/* read all groups the page covers into the cache */
-	for (i = 0; i < groups_per_page; i++) {
-
-		if ((first_group + i) >= ngroups)
-			break;
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		down_write_nested(&grp->alloc_sem, i);
-	}
-	return i;
-}
-
-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-					ext4_group_t group, int locked_group)
-{
-	int i;
-	int block, pnum;
-	int blocks_per_page;
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-	/* release locks on all the groups */
-	for (i = 0; i < locked_group; i++) {
-
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		up_write(&grp->alloc_sem);
-	}
-
-}
-
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
 	ext4_group_t ngroups, group, i;
 	int cr;
 	int err = 0;
-	int bsbits;
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	struct ext4_buddy e4b;
@@ -2041,8 +2049,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		ac->ac_2order = i - 1;
 	}
 
-	bsbits = ac->ac_sb->s_blocksize_bits;
-
 	/* if stream allocation is enabled, use global goal */
 	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
 		/* TBD: may be hot point */
@@ -2094,8 +2100,8 @@ repeat:
 		ac->ac_groups_scanned++;
 		if (cr == 0)
 			ext4_mb_simple_scan_group(ac, &e4b);
-		else if (cr == 1 &&
-				ac->ac_g_ex.fe_len == sbi->s_stripe)
+		else if (cr == 1 && sbi->s_stripe &&
+				!(ac->ac_g_ex.fe_len % sbi->s_stripe))
 			ext4_mb_scan_aligned(ac, &e4b);
 		else
 			ext4_mb_complex_scan_group(ac, &e4b);
@@ -2221,7 +2227,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
 
 	rc = seq_open(file, &ext4_mb_seq_groups_ops);
 	if (rc == 0) {
-		struct seq_file *m = (struct seq_file *)file->private_data;
+		struct seq_file *m = file->private_data;
 		m->private = sb;
 	}
 	return rc;
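The cast dropped in this hunk is redundant: in C a `void *` such as file->private_data converts implicitly to any object pointer type. A two-line illustration (the struct name is made up):

    #include <stdio.h>

    struct seq_file_demo { int private; };

    int main(void)
    {
            struct seq_file_demo s = { 42 };
            void *private_data = &s;

            /* no cast needed: void * converts implicitly in C (unlike C++) */
            struct seq_file_demo *m = private_data;
            printf("%d\n", m->private);
            return 0;
    }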
@@ -2236,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
 	.release	= seq_release,
 };
 
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+
+	BUG_ON(!cachep);
+	return cachep;
+}
 
 /* Create and initialize ext4_group_info data for the given group. */
 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			  struct ext4_group_desc *desc)
 {
-	int i, len;
+	int i;
 	int metalen = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_info **meta_group_info;
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
 	/*
 	 * First check if this group is the first of a reserved block.
@@ -2264,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			meta_group_info;
 	}
 
-	/*
-	 * calculate needed size. if change bb_counters size,
-	 * don't forget about ext4_mb_generate_buddy()
-	 */
-	len = offsetof(typeof(**meta_group_info),
-		       bb_counters[sb->s_blocksize_bits + 2]);
-
 	meta_group_info =
 		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
 	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 
-	meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+	meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
 	if (meta_group_info[i] == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
 		goto exit_group_info;
 	}
+	memset(meta_group_info[i], 0, kmem_cache_size(cachep));
 	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
 		&(meta_group_info[i]->bb_state));
 
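The removed `len` computation (which reappears below when the per-blocksize slab cache is created) sizes a structure ending in a variable-length bb_counters[] array by taking offsetof() at one past the last element needed. A standalone sketch of that sizing trick (GCC and clang accept a runtime index inside offsetof()):

    #include <stdio.h>
    #include <stddef.h>
    #include <stdlib.h>

    struct group_info_demo {
            unsigned long bb_state;
            unsigned int bb_counters[];     /* flexible array member */
    };

    int main(void)
    {
            int blocksize_bits = 12;        /* 4 KiB blocks */
            /* room for counters [0 .. blocksize_bits + 1] */
            size_t len = offsetof(struct group_info_demo,
                                  bb_counters[blocksize_bits + 2]);

            struct group_info_demo *g = calloc(1, len);
            if (!g)
                    return 1;
            printf("allocated %zu bytes for %d counters\n",
                   len, blocksize_bits + 2);
            free(g);
            return 0;
    }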
@@ -2334,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	int num_meta_group_infos_max;
 	int array_size;
 	struct ext4_group_desc *desc;
+	struct kmem_cache *cachep;
 
 	/* This is the number of blocks used by GDT */
 	num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2376,6 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		printk(KERN_ERR "EXT4-fs: can't get new inode\n");
 		goto err_freesgi;
 	}
+	sbi->s_buddy_cache->i_ino = get_next_ino();
 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
 	for (i = 0; i < ngroups; i++) {
 		desc = ext4_get_group_desc(sb, i, NULL);
@@ -2391,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	return 0;
 
 err_freebuddy:
+	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 	while (i-- > 0)
-		kfree(ext4_get_group_info(sb, i));
+		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
 	i = num_meta_group_infos;
 	while (i-- > 0)
 		kfree(sbi->s_group_info[i]);
@@ -2409,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	unsigned offset;
 	unsigned max;
 	int ret;
+	int cache_index;
+	struct kmem_cache *cachep;
+	char *namep = NULL;
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
 
 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_offsets == NULL) {
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
-		kfree(sbi->s_mb_offsets);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	cachep = ext4_groupinfo_caches[cache_index];
+	if (!cachep) {
+		char name[32];
+		int len = offsetof(struct ext4_group_info,
+				   bb_counters[sb->s_blocksize_bits + 2]);
+
+		sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+		namep = kstrdup(name, GFP_KERNEL);
+		if (!namep) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/* Need to free the kmem_cache_name() when we
+		 * destroy the slab */
+		cachep = kmem_cache_create(namep, len, 0,
+					   SLAB_RECLAIM_ACCOUNT, NULL);
+		if (!cachep) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ext4_groupinfo_caches[cache_index] = cachep;
 	}
 
 	/* order 0 is regular bitmap */
@@ -2442,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	/* init file for buddy data */
 	ret = ext4_mb_init_backend(sb);
 	if (ret != 0) {
-		kfree(sbi->s_mb_offsets);
-		kfree(sbi->s_mb_maxs);
-		return ret;
+		goto out;
 	}
 
 	spin_lock_init(&sbi->s_md_lock);
@@ -2459,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
 	if (sbi->s_locality_groups == NULL) {
-		kfree(sbi->s_mb_offsets);
-		kfree(sbi->s_mb_maxs);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 	for_each_possible_cpu(i) {
 		struct ext4_locality_group *lg;
@@ -2478,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	if (sbi->s_journal)
 		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-	return 0;
+out:
+	if (ret) {
+		kfree(sbi->s_mb_offsets);
+		kfree(sbi->s_mb_maxs);
+		kfree(namep);
+	}
+	return ret;
 }
 
 /* need to called with the ext4 group lock held */
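This hunk converts ext4_mb_init() to a single-exit style: each failure sets ret and jumps to one `out:` label that releases everything allocated so far. A compact standalone sketch of the idiom (the struct and names are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct mb_demo {
            char *offsets;
            char *maxs;
            char *name;
    };

    static int mb_init_demo(struct mb_demo *c)
    {
            int ret = 0;

            c->offsets = malloc(64);
            if (!c->offsets) {
                    ret = -1;
                    goto out;
            }
            c->maxs = malloc(64);
            if (!c->maxs) {
                    ret = -1;
                    goto out;
            }
            c->name = strdup("demo_groupinfo_12");
            if (!c->name) {
                    ret = -1;
                    goto out;
            }
    out:
            if (ret) {              /* one cleanup point on failure */
                    free(c->offsets);
                    free(c->maxs);
                    free(c->name);
            }
            return ret;
    }

    int main(void)
    {
            struct mb_demo c = { 0 };

            if (mb_init_demo(&c))
                    return 1;
            puts("initialized");
            free(c.offsets);
            free(c.maxs);
            free(c.name);
            return 0;
    }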
@@ -2506,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb)
 	int num_meta_group_infos;
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
 	if (sbi->s_group_info) {
 		for (i = 0; i < ngroups; i++) {
@@ -2516,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb)
 			ext4_lock_group(sb, i);
 			ext4_mb_cleanup_pa(grinfo);
 			ext4_unlock_group(sb, i);
-			kfree(grinfo);
+			kmem_cache_free(cachep, grinfo);
 		}
 		num_meta_group_infos = (ngroups +
 				EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2560,6 +2605,23 @@ int ext4_mb_release(struct super_block *sb)
 	return 0;
 }
 
+static inline int ext4_issue_discard(struct super_block *sb,
+		ext4_group_t block_group, ext4_grpblk_t block, int count)
+{
+	int ret;
+	ext4_fsblk_t discard_block;
+
+	discard_block = block + ext4_group_first_block_no(sb, block_group);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long) discard_block, count);
+	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+	if (ret == -EOPNOTSUPP) {
+		ext4_warning(sb, "discard not supported, disabling");
+		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+	}
+	return ret;
+}
+
 /*
  * This function is called by the jbd2 layer once the commit has finished,
  * so we know we can free the blocks that were released with that commit.
@@ -2579,22 +2641,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
 			 entry->count, entry->group, entry);
 
-		if (test_opt(sb, DISCARD)) {
-			int ret;
-			ext4_fsblk_t discard_block;
-
-			discard_block = entry->start_blk +
-				ext4_group_first_block_no(sb, entry->group);
-			trace_ext4_discard_blocks(sb,
-					(unsigned long long)discard_block,
-					entry->count);
-			ret = sb_issue_discard(sb, discard_block, entry->count);
-			if (ret == EOPNOTSUPP) {
-				ext4_warning(sb,
-					"discard not supported, disabling");
-				clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-			}
-		}
+		if (test_opt(sb, DISCARD))
+			ext4_issue_discard(sb, entry->group,
+					   entry->start_blk, entry->count);
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
 		/* we expect to find existing buddy because it's pinned */
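Factoring the discard into ext4_issue_discard() also switches the comparison to the negative-errno convention (-EOPNOTSUPP rather than the old positive EOPNOTSUPP test), matching what sb_issue_discard() returns. A minimal userspace sketch of that probe-and-disable pattern (names are illustrative):

    #include <errno.h>
    #include <stdio.h>

    static int discard_enabled = 1;

    /* stand-in for a device that lacks discard support */
    static int issue_discard(unsigned long long block, int count)
    {
            (void)block; (void)count;
            return -EOPNOTSUPP;     /* kernel-style negative errno */
    }

    static void free_blocks(unsigned long long block, int count)
    {
            if (discard_enabled) {
                    int ret = issue_discard(block, count);

                    if (ret == -EOPNOTSUPP) {       /* note: negative */
                            fprintf(stderr,
                                    "discard not supported, disabling\n");
                            discard_enabled = 0;
                    }
            }
    }

    int main(void)
    {
            free_blocks(1024, 8);   /* first call disables the feature */
            free_blocks(2048, 8);   /* later calls skip the probe */
            return 0;
    }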
@@ -2658,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void)
 
 #endif
 
-int __init init_ext4_mballoc(void)
+int __init ext4_init_mballoc(void)
 {
-	ext4_pspace_cachep =
-		kmem_cache_create("ext4_prealloc_space",
-				  sizeof(struct ext4_prealloc_space),
-				  0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+					SLAB_RECLAIM_ACCOUNT);
 	if (ext4_pspace_cachep == NULL)
 		return -ENOMEM;
 
-	ext4_ac_cachep =
-		kmem_cache_create("ext4_alloc_context",
-				  sizeof(struct ext4_allocation_context),
-				  0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+				    SLAB_RECLAIM_ACCOUNT);
 	if (ext4_ac_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		return -ENOMEM;
 	}
 
-	ext4_free_ext_cachep =
-		kmem_cache_create("ext4_free_block_extents",
-				  sizeof(struct ext4_free_data),
-				  0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
+					  SLAB_RECLAIM_ACCOUNT);
 	if (ext4_free_ext_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		kmem_cache_destroy(ext4_ac_cachep);
@@ -2689,8 +2732,9 @@ int __init init_ext4_mballoc(void)
 	return 0;
 }
 
-void exit_ext4_mballoc(void)
+void ext4_exit_mballoc(void)
 {
+	int i;
 	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
@@ -2699,12 +2743,21 @@ void exit_ext4_mballoc(void)
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_ext_cachep);
+
+	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+		struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+		if (cachep) {
+			char *name = (char *)kmem_cache_name(cachep);
+			kmem_cache_destroy(cachep);
+			kfree(name);
+		}
+	}
 	ext4_remove_debugfs_entry();
 }
 
 
 /*
- * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps
+ * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
  * Returns 0 if success or error code
  */
 static noinline_for_stack int
@@ -2712,7 +2765,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 				handle_t *handle, unsigned int reserv_blks)
 {
 	struct buffer_head *bitmap_bh = NULL;
-	struct ext4_super_block *es;
 	struct ext4_group_desc *gdp;
 	struct buffer_head *gdp_bh;
 	struct ext4_sb_info *sbi;
@@ -2725,8 +2777,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
-	es = sbi->s_es;
 
-
 	err = -EIO;
 	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
@@ -2812,7 +2862,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
 
 out_err:
-	sb->s_dirt = 1;
+	ext4_mark_super_dirty(sb);
 	brelse(bitmap_bh);
 	return err;
 }
@@ -2850,7 +2900,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	int bsbits, max;
 	ext4_lblk_t end;
 	loff_t size, orig_size, start_off;
-	ext4_lblk_t start, orig_start;
+	ext4_lblk_t start;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_prealloc_space *pa;
 
@@ -2881,6 +2931,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	size = size << bsbits;
 	if (size < i_size_read(ac->ac_inode))
 		size = i_size_read(ac->ac_inode);
+	orig_size = size;
 
 	/* max size of free chunks */
 	max = 2 << bsbits;
@@ -2922,8 +2973,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 		start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
 		size	  = ac->ac_o_ex.fe_len << bsbits;
 	}
-	orig_size = size = size >> bsbits;
-	orig_start = start = start_off >> bsbits;
+	size = size >> bsbits;
+	start = start_off >> bsbits;
 
 	/* don't cover already allocated blocks in selected range */
 	if (ar->pleft && start <= ar->lleft) {
@@ -3537,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  */
 static noinline_for_stack int
 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
-			struct ext4_prealloc_space *pa,
-			struct ext4_allocation_context *ac)
+			struct ext4_prealloc_space *pa)
 {
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3547,7 +3597,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 	unsigned long long grp_blk_start;
-	sector_t start;
 	int err = 0;
 	int free = 0;
 
@@ -3557,32 +3606,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	end = bit + pa->pa_len;
 
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = pa->pa_inode;
-	}
-
 	while (bit < end) {
 		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
 		if (bit >= end)
 			break;
 		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
-		start = ext4_group_first_block_no(sb, group) + bit;
 		mb_debug(1, "    free preallocated %u/%u in group %u\n",
-			 (unsigned) start, (unsigned) next - bit,
-			 (unsigned) group);
+			 (unsigned) ext4_group_first_block_no(sb, group) + bit,
+			 (unsigned) next - bit, (unsigned) group);
 		free += next - bit;
 
-		if (ac) {
-			ac->ac_b_ex.fe_group = group;
-			ac->ac_b_ex.fe_start = bit;
-			ac->ac_b_ex.fe_len = next - bit;
-			ac->ac_b_ex.fe_logical = 0;
-			trace_ext4_mballoc_discard(ac);
-		}
-
-		trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
-					       next - bit);
+		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+		trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
+					       grp_blk_start + bit, next - bit);
 		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
 		bit = next + 1;
 	}
@@ -3591,8 +3627,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			pa, (unsigned long) pa->pa_lstart,
 			(unsigned long) pa->pa_pstart,
 			(unsigned long) pa->pa_len);
-		ext4_grp_locked_error(sb, group,
-					__func__, "free %u, pa_free %u",
+		ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
 					free, pa->pa_free);
 		/*
 		 * pa is already deleted so we use the value obtained
@@ -3606,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 
 static noinline_for_stack int
 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
-				struct ext4_prealloc_space *pa,
-				struct ext4_allocation_context *ac)
+				struct ext4_prealloc_space *pa)
 {
 	struct super_block *sb = e4b->bd_sb;
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 
-	trace_ext4_mb_release_group_pa(ac, pa);
+	trace_ext4_mb_release_group_pa(sb, pa);
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
 	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
-
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = NULL;
-		ac->ac_b_ex.fe_group = group;
-		ac->ac_b_ex.fe_start = bit;
-		ac->ac_b_ex.fe_len = pa->pa_len;
-		ac->ac_b_ex.fe_logical = 0;
-		trace_ext4_mballoc_discard(ac);
-	}
+	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
 
 	return 0;
 }
@@ -3649,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 	struct list_head list;
 	struct ext4_buddy e4b;
 	int err;
@@ -3678,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
 
 	INIT_LIST_HEAD(&list);
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac)
-		ac->ac_sb = sb;
 repeat:
 	ext4_lock_group(sb, group);
 	list_for_each_entry_safe(pa, tmp,
@@ -3735,9 +3756,9 @@ repeat:
 		spin_unlock(pa->pa_obj_lock);
 
 		if (pa->pa_type == MB_GROUP_PA)
-			ext4_mb_release_group_pa(&e4b, pa, ac);
+			ext4_mb_release_group_pa(&e4b, pa);
 		else
-			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
 
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3745,8 +3766,6 @@ repeat:
 
 out:
 	ext4_unlock_group(sb, group);
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 	ext4_mb_unload_buddy(&e4b);
 	put_bh(bitmap_bh);
 	return free;
@@ -3767,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 	ext4_group_t group = 0;
 	struct list_head list;
 	struct ext4_buddy e4b;
@@ -3783,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode)
 
 	INIT_LIST_HEAD(&list);
 
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = inode;
-	}
 repeat:
 	/* first, collect all pa's in the inode */
 	spin_lock(&ei->i_prealloc_lock);
@@ -3857,7 +3870,7 @@ repeat:
 
 		ext4_lock_group(sb, group);
 		list_del(&pa->pa_group_list);
-		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
 		ext4_unlock_group(sb, group);
 
 		ext4_mb_unload_buddy(&e4b);
@@ -3866,8 +3879,6 @@ repeat:
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 }
 
 /*
@@ -3889,6 +3900,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 	struct super_block *sb = ac->ac_sb;
 	ext4_group_t ngroups, i;
 
+	if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+		return;
+
 	printk(KERN_ERR "EXT4-fs: Can't allocate:"
 			" Allocation context details:\n");
 	printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
@@ -4062,14 +4076,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 	struct ext4_buddy e4b;
 	struct list_head discard_list;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 
 	mb_debug(1, "discard locality group preallocation\n");
 
 	INIT_LIST_HEAD(&discard_list);
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac)
-		ac->ac_sb = sb;
 
 	spin_lock(&lg->lg_prealloc_lock);
 	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4121,15 +4131,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		}
 		ext4_lock_group(sb, group);
 		list_del(&pa->pa_group_list);
-		ext4_mb_release_group_pa(&e4b, pa, ac);
+		ext4_mb_release_group_pa(&e4b, pa);
 		ext4_unlock_group(sb, group);
 
 		ext4_mb_unload_buddy(&e4b);
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 }
4134 4142
4135/* 4143/*
@@ -4255,7 +4263,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
  * to usual allocation
  */
 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
-				 struct ext4_allocation_request *ar, int *errp)
+				struct ext4_allocation_request *ar, int *errp)
 {
 	int freed;
 	struct ext4_allocation_context *ac = NULL;
@@ -4299,7 +4307,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		inquota = ar->len;
 		if (ar->len == 0) {
 			*errp = -EDQUOT;
-			goto out3;
+			goto out;
 		}
 	}
 
@@ -4307,13 +4315,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4307 if (!ac) { 4315 if (!ac) {
4308 ar->len = 0; 4316 ar->len = 0;
4309 *errp = -ENOMEM; 4317 *errp = -ENOMEM;
4310 goto out1; 4318 goto out;
4311 } 4319 }
4312 4320
4313 *errp = ext4_mb_initialize_context(ac, ar); 4321 *errp = ext4_mb_initialize_context(ac, ar);
4314 if (*errp) { 4322 if (*errp) {
4315 ar->len = 0; 4323 ar->len = 0;
4316 goto out2; 4324 goto out;
4317 } 4325 }
4318 4326
4319 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4327 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
@@ -4322,7 +4330,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4322 ext4_mb_normalize_request(ac, ar); 4330 ext4_mb_normalize_request(ac, ar);
4323repeat: 4331repeat:
4324 /* allocate space in core */ 4332 /* allocate space in core */
4325 ext4_mb_regular_allocator(ac); 4333 *errp = ext4_mb_regular_allocator(ac);
4334 if (*errp)
4335 goto errout;
4326 4336
4327 /* as we've just preallocated more space than 4337 /* as we've just preallocated more space than
4328 * user requested originally, we store allocated 4338 * user requested originally, we store allocated
@@ -4333,7 +4343,7 @@ repeat:
4333 } 4343 }
4334 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4344 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4335 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4345 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4336 if (*errp == -EAGAIN) { 4346 if (*errp == -EAGAIN) {
4337 /* 4347 /*
4338 * drop the reference that we took 4348 * drop the reference that we took
4339 * in ext4_mb_use_best_found 4349 * in ext4_mb_use_best_found
@@ -4344,12 +4354,10 @@ repeat:
4344 ac->ac_b_ex.fe_len = 0; 4354 ac->ac_b_ex.fe_len = 0;
4345 ac->ac_status = AC_STATUS_CONTINUE; 4355 ac->ac_status = AC_STATUS_CONTINUE;
4346 goto repeat; 4356 goto repeat;
4347 } else if (*errp) { 4357 } else if (*errp)
4358 errout:
4348 ext4_discard_allocated_blocks(ac); 4359 ext4_discard_allocated_blocks(ac);
4349 ac->ac_b_ex.fe_len = 0; 4360 else {
4350 ar->len = 0;
4351 ext4_mb_show_ac(ac);
4352 } else {
4353 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4361 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4354 ar->len = ac->ac_b_ex.fe_len; 4362 ar->len = ac->ac_b_ex.fe_len;
4355 } 4363 }
@@ -4358,19 +4366,19 @@ repeat:
4358 if (freed) 4366 if (freed)
4359 goto repeat; 4367 goto repeat;
4360 *errp = -ENOSPC; 4368 *errp = -ENOSPC;
4369 }
4370
4371 if (*errp) {
4361 ac->ac_b_ex.fe_len = 0; 4372 ac->ac_b_ex.fe_len = 0;
4362 ar->len = 0; 4373 ar->len = 0;
4363 ext4_mb_show_ac(ac); 4374 ext4_mb_show_ac(ac);
4364 } 4375 }
4365
4366 ext4_mb_release_context(ac); 4376 ext4_mb_release_context(ac);
4367 4377out:
4368out2: 4378 if (ac)
4369 kmem_cache_free(ext4_ac_cachep, ac); 4379 kmem_cache_free(ext4_ac_cachep, ac);
4370out1:
4371 if (inquota && ar->len < inquota) 4380 if (inquota && ar->len < inquota)
4372 dquot_free_block(ar->inode, inquota - ar->len); 4381 dquot_free_block(ar->inode, inquota - ar->len);
4373out3:
4374 if (!ar->len) { 4382 if (!ar->len) {
4375 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4383 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4376 /* release all the reserved blocks if non delalloc */ 4384 /* release all the reserved blocks if non delalloc */
@@ -4402,6 +4410,7 @@ static noinline_for_stack int
4402ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4410ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4403 struct ext4_free_data *new_entry) 4411 struct ext4_free_data *new_entry)
4404{ 4412{
4413 ext4_group_t group = e4b->bd_group;
4405 ext4_grpblk_t block; 4414 ext4_grpblk_t block;
4406 struct ext4_free_data *entry; 4415 struct ext4_free_data *entry;
4407 struct ext4_group_info *db = e4b->bd_info; 4416 struct ext4_group_info *db = e4b->bd_info;
@@ -4434,9 +4443,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4434 else if (block >= (entry->start_blk + entry->count)) 4443 else if (block >= (entry->start_blk + entry->count))
4435 n = &(*n)->rb_right; 4444 n = &(*n)->rb_right;
4436 else { 4445 else {
4437 ext4_grp_locked_error(sb, e4b->bd_group, __func__, 4446 ext4_grp_locked_error(sb, group, 0,
4438 "Double free of blocks %d (%d %d)", 4447 ext4_group_first_block_no(sb, group) + block,
4439 block, entry->start_blk, entry->count); 4448 "Block already on to-be-freed list");
4440 return 0; 4449 return 0;
4441 } 4450 }
4442 } 4451 }
@@ -4492,9 +4501,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4492{ 4501{
4493 struct buffer_head *bitmap_bh = NULL; 4502 struct buffer_head *bitmap_bh = NULL;
4494 struct super_block *sb = inode->i_sb; 4503 struct super_block *sb = inode->i_sb;
4495 struct ext4_allocation_context *ac = NULL;
4496 struct ext4_group_desc *gdp; 4504 struct ext4_group_desc *gdp;
4497 struct ext4_super_block *es;
4498 unsigned long freed = 0; 4505 unsigned long freed = 0;
4499 unsigned int overflow; 4506 unsigned int overflow;
4500 ext4_grpblk_t bit; 4507 ext4_grpblk_t bit;
@@ -4513,7 +4520,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4513 } 4520 }
4514 4521
4515 sbi = EXT4_SB(sb); 4522 sbi = EXT4_SB(sb);
4516 es = EXT4_SB(sb)->s_es;
4517 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 4523 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4518 !ext4_data_block_valid(sbi, block, count)) { 4524 !ext4_data_block_valid(sbi, block, count)) {
4519 ext4_error(sb, "Freeing blocks not in datazone - " 4525 ext4_error(sb, "Freeing blocks not in datazone - "
@@ -4534,6 +4540,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4534 if (!bh) 4540 if (!bh)
4535 tbh = sb_find_get_block(inode->i_sb, 4541 tbh = sb_find_get_block(inode->i_sb,
4536 block + i); 4542 block + i);
4543 if (unlikely(!tbh))
4544 continue;
4537 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4545 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4538 inode, tbh, block + i); 4546 inode, tbh, block + i);
4539 } 4547 }
@@ -4549,12 +4557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4549 if (!ext4_should_writeback_data(inode)) 4557 if (!ext4_should_writeback_data(inode))
4550 flags |= EXT4_FREE_BLOCKS_METADATA; 4558 flags |= EXT4_FREE_BLOCKS_METADATA;
4551 4559
4552 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4553 if (ac) {
4554 ac->ac_inode = inode;
4555 ac->ac_sb = sb;
4556 }
4557
4558do_more: 4560do_more:
4559 overflow = 0; 4561 overflow = 0;
4560 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4562 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4612,12 +4614,7 @@ do_more:
4612 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4614 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4613 } 4615 }
4614#endif 4616#endif
4615 if (ac) { 4617 trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
4616 ac->ac_b_ex.fe_group = block_group;
4617 ac->ac_b_ex.fe_start = bit;
4618 ac->ac_b_ex.fe_len = count;
4619 trace_ext4_mballoc_free(ac);
4620 }
4621 4618
4622 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4619 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4623 if (err) 4620 if (err)
@@ -4643,6 +4640,8 @@ do_more:
4643 * with group lock held. generate_buddy looks at 4640 * with group lock held. generate_buddy looks at
4644 * them with group lock held 4641 * them with group lock held
4645 */ 4642 */
4643 if (test_opt(sb, DISCARD))
4644 ext4_issue_discard(sb, block_group, bit, count);
4646 ext4_lock_group(sb, block_group); 4645 ext4_lock_group(sb, block_group);
4647 mb_clear_bits(bitmap_bh->b_data, bit, count); 4646 mb_clear_bits(bitmap_bh->b_data, bit, count);
4648 mb_free_blocks(inode, &e4b, bit, count); 4647 mb_free_blocks(inode, &e4b, bit, count);
@@ -4680,13 +4679,196 @@ do_more:
4680 put_bh(bitmap_bh); 4679 put_bh(bitmap_bh);
4681 goto do_more; 4680 goto do_more;
4682 } 4681 }
4683 sb->s_dirt = 1; 4682 ext4_mark_super_dirty(sb);
4684error_return: 4683error_return:
4685 if (freed) 4684 if (freed)
4686 dquot_free_block(inode, freed); 4685 dquot_free_block(inode, freed);
4687 brelse(bitmap_bh); 4686 brelse(bitmap_bh);
4688 ext4_std_error(sb, err); 4687 ext4_std_error(sb, err);
4689 if (ac)
4690 kmem_cache_free(ext4_ac_cachep, ac);
4691 return; 4688 return;
4692} 4689}
4690
4691/**
4692 * ext4_trim_extent -- function to TRIM one single free extent in the group
4693 * @sb: super block for the file system
4694 * @start: starting block of the free extent in the alloc. group
4695 * @count: number of blocks to TRIM
4696 * @group: alloc. group we are working with
4697 * @e4b: ext4 buddy for the group
4698 *
4699 * Trim "count" blocks starting at "start" in the "group". To assure that no
4700 * one will allocate those blocks, mark them as used in the buddy bitmap.
4701 * This must be called under the group lock.
4702 */
4703static int ext4_trim_extent(struct super_block *sb, int start, int count,
4704 ext4_group_t group, struct ext4_buddy *e4b)
4705{
4706 struct ext4_free_extent ex;
4707 int ret = 0;
4708
4709 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4710
4711 ex.fe_start = start;
4712 ex.fe_group = group;
4713 ex.fe_len = count;
4714
4715 /*
4716 * Mark blocks used, so no one can reuse them while
4717 * being trimmed.
4718 */
4719 mb_mark_used(e4b, &ex);
4720 ext4_unlock_group(sb, group);
4721
4722 ret = ext4_issue_discard(sb, group, start, count);
4723 if (ret)
4724 ext4_std_error(sb, ret);
4725
4726 ext4_lock_group(sb, group);
4727 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4728 return ret;
4729}
4730
4731/**
4732 * ext4_trim_all_free -- function to trim all free space in alloc. group
4733 * @sb: super block for file system
4734 * @e4b: ext4 buddy
4735 * @start: first group block to examine
4736 * @max: last group block to examine
4737 * @minblocks: minimum extent block count
4738 *
4739 * ext4_trim_all_free walks through the group's buddy bitmap searching for
4740 * free extents, calling ext4_trim_extent to TRIM every free extent that is
4741 * at least minblocks blocks long.
4742 *
4743 * In detail: when a qualifying free extent is found, it is marked as used
4744 * in the group buddy bitmap so that nobody can allocate it meanwhile, a
4745 * TRIM command is issued on the extent, and the extent is then freed again
4746 * in the group buddy bitmap. This is done until the whole group has been
4747 * scanned.
4748 */
4749ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4750 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4751{
4752 void *bitmap;
4753 ext4_grpblk_t next, count = 0;
4754 ext4_group_t group;
4755 int ret = 0;
4756
4757 BUG_ON(e4b == NULL);
4758
4759 bitmap = e4b->bd_bitmap;
4760 group = e4b->bd_group;
4761 start = (e4b->bd_info->bb_first_free > start) ?
4762 e4b->bd_info->bb_first_free : start;
4763 ext4_lock_group(sb, group);
4764
4765 while (start < max) {
4766 start = mb_find_next_zero_bit(bitmap, max, start);
4767 if (start >= max)
4768 break;
4769 next = mb_find_next_bit(bitmap, max, start);
4770
4771 if ((next - start) >= minblocks) {
4772 ret = ext4_trim_extent(sb, start,
4773 next - start, group, e4b);
4774 if (ret < 0)
4775 break;
4776 count += next - start;
4777 }
4778 start = next + 1;
4779
4780 if (fatal_signal_pending(current)) {
4781 count = -ERESTARTSYS;
4782 break;
4783 }
4784
4785 if (need_resched()) {
4786 ext4_unlock_group(sb, group);
4787 cond_resched();
4788 ext4_lock_group(sb, group);
4789 }
4790
4791 if ((e4b->bd_info->bb_free - count) < minblocks)
4792 break;
4793 }
4794 ext4_unlock_group(sb, group);
4795
4796 ext4_debug("trimmed %d blocks in the group %d\n",
4797 count, group);
4798
4799 if (ret < 0)
4800 count = ret;
4801
4802 return count;
4803}
4804
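The loop above is the heart of the TRIM pass: find the next run of zero bits (free blocks) in the buddy bitmap, hand every run of at least minblocks blocks to ext4_trim_extent(), and periodically drop the group lock so the scan stays preemptible. A minimal userspace sketch of just the bitmap walk, with the mb_find_next_zero_bit()/mb_find_next_bit() helpers open-coded over a toy bitmap (an illustration of the loop structure only, not kernel API):

    #include <stdio.h>

    #define NBITS 64

    static int bit_set(const unsigned char *map, int n)
    {
            return (map[n / 8] >> (n % 8)) & 1;
    }

    /* Report every free run of at least minblocks blocks in [0, max),
     * mirroring the find-next-zero-bit/find-next-bit loop above. */
    static int trim_all_free(const unsigned char *map, int max, int minblocks)
    {
            int start = 0, next, count = 0;

            while (start < max) {
                    while (start < max && bit_set(map, start))
                            start++;                /* next zero bit */
                    if (start >= max)
                            break;
                    next = start;
                    while (next < max && !bit_set(map, next))
                            next++;                 /* next set bit */
                    if (next - start >= minblocks) {
                            printf("would trim blocks %d-%d\n", start, next - 1);
                            count += next - start;
                    }
                    start = next + 1;
            }
            return count;
    }

    int main(void)
    {
            /* blocks 0-7 and 16-23 in use, everything else free */
            unsigned char map[NBITS / 8] = { 0xff, 0x00, 0xff, 0x00 };

            printf("trimmed %d blocks\n", trim_all_free(map, NBITS, 4));
            return 0;
    }

With this toy map the walk reports the runs 8-15 and 24-63, 48 blocks in total; the kernel version additionally re-checks bb_free after each run so it can stop early once fewer than minblocks free blocks remain in the group.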
4805/**
4806 * ext4_trim_fs() -- trim ioctl handler
4807 * @sb: superblock for filesystem
4808 * @range: fstrim_range structure
4809 *
4810 * start: first byte to trim
4811 * len: number of bytes to trim from start
4812 * minlen: minimum extent length in bytes
4813 * ext4_trim_fs goes through all the allocation groups containing bytes from
4814 * start to start+len. For each such group, the ext4_trim_all_free function
4815 * is invoked to trim all free space.
4816 */
4817int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4818{
4819 struct ext4_buddy e4b;
4820 ext4_group_t first_group, last_group;
4821 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4822 ext4_grpblk_t cnt = 0, first_block, last_block;
4823 uint64_t start, len, minlen, trimmed;
4824 int ret = 0;
4825
4826 start = range->start >> sb->s_blocksize_bits;
4827 len = range->len >> sb->s_blocksize_bits;
4828 minlen = range->minlen >> sb->s_blocksize_bits;
4829 trimmed = 0;
4830
4831 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4832 return -EINVAL;
4833
4834 /* Determine first and last group to examine based on start and len */
4835 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4836 &first_group, &first_block);
4837 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4838 &last_group, &last_block);
4839 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4840 last_block = EXT4_BLOCKS_PER_GROUP(sb);
4841
4842 if (first_group > last_group)
4843 return -EINVAL;
4844
4845 for (group = first_group; group <= last_group; group++) {
4846 ret = ext4_mb_load_buddy(sb, group, &e4b);
4847 if (ret) {
4848 ext4_error(sb, "Error in loading buddy "
4849 "information for %u", group);
4850 break;
4851 }
4852
4853 if (len >= EXT4_BLOCKS_PER_GROUP(sb))
4854 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
4855 else
4856 last_block = len;
4857
4858 if (e4b.bd_info->bb_free >= minlen) {
4859 cnt = ext4_trim_all_free(sb, &e4b, first_block,
4860 last_block, minlen);
4861 if (cnt < 0) {
4862 ret = cnt;
4863 ext4_mb_unload_buddy(&e4b);
4864 break;
4865 }
4866 }
4867 ext4_mb_unload_buddy(&e4b);
4868 trimmed += cnt;
4869 first_block = 0;
4870 }
4871 range->len = trimmed * sb->s_blocksize;
4872
4873 return ret;
4874}
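ext4_trim_fs() above is the backend for the FITRIM ioctl (wired up elsewhere in this series), so the byte-granular start/len/minlen fields arrive straight from userspace and are converted to block units via s_blocksize_bits. A minimal sketch of a caller, assuming an ext4 filesystem mounted at the placeholder path /mnt (FITRIM and struct fstrim_range come from linux/fs.h):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>           /* FITRIM, struct fstrim_range */

    int main(void)
    {
            struct fstrim_range range = {
                    .start  = 0,
                    .len    = UINT64_MAX,   /* examine the whole filesystem */
                    .minlen = 0,            /* trim free extents of any size */
            };
            int fd = open("/mnt", O_RDONLY);        /* placeholder mount point */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, FITRIM, &range) < 0) {
                    perror("FITRIM");
                    return 1;
            }
            /* range.len is rewritten with the number of bytes trimmed,
             * matching range->len = trimmed * sb->s_blocksize above. */
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            return 0;
    }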
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 6f3a27ec30b..25f3a974b72 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map built with the tmp inode. 376 * We have the extent map built with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ei->i_flags |= EXT4_EXTENTS_FL; 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
412 struct buffer_head *bh; 412 struct buffer_head *bh;
413 struct ext4_extent_header *eh; 413 struct ext4_extent_header *eh;
414 414
415 block = idx_pblock(ix); 415 block = ext4_idx_pblock(ix);
416 bh = sb_bread(inode->i_sb, block); 416 bh = sb_bread(inode->i_sb, block);
417 if (!bh) 417 if (!bh)
418 return -EIO; 418 return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 52abfa12762..b9f3e7862f1 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
86 /* leaf block */ 86 /* leaf block */
87 *extent = ++path[ppos].p_ext; 87 *extent = ++path[ppos].p_ext;
88 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 88 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
89 return 0; 89 return 0;
90 } 90 }
91 91
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
96 96
97 /* index block */ 97 /* index block */
98 path[ppos].p_idx++; 98 path[ppos].p_idx++;
99 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 99 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
100 if (path[ppos+1].p_bh) 100 if (path[ppos+1].p_bh)
101 brelse(path[ppos+1].p_bh); 101 brelse(path[ppos+1].p_bh);
102 path[ppos+1].p_bh = 102 path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
111 path[cur_ppos].p_idx = 111 path[cur_ppos].p_idx =
112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr); 112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
113 path[cur_ppos].p_block = 113 path[cur_ppos].p_block =
114 idx_pblock(path[cur_ppos].p_idx); 114 ext4_idx_pblock(path[cur_ppos].p_idx);
115 if (path[cur_ppos+1].p_bh) 115 if (path[cur_ppos+1].p_bh)
116 brelse(path[cur_ppos+1].p_bh); 116 brelse(path[cur_ppos+1].p_bh);
117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, 117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
133 path[leaf_ppos].p_ext = *extent = 133 path[leaf_ppos].p_ext = *extent =
134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
135 path[leaf_ppos].p_block = 135 path[leaf_ppos].p_block =
136 ext_pblock(path[leaf_ppos].p_ext); 136 ext4_ext_pblock(path[leaf_ppos].p_ext);
137 return 0; 137 return 0;
138 } 138 }
139 } 139 }
@@ -148,17 +148,17 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
148 */ 148 */
149static int 149static int
150mext_check_null_inode(struct inode *inode1, struct inode *inode2, 150mext_check_null_inode(struct inode *inode1, struct inode *inode2,
151 const char *function) 151 const char *function, unsigned int line)
152{ 152{
153 int ret = 0; 153 int ret = 0;
154 154
155 if (inode1 == NULL) { 155 if (inode1 == NULL) {
156 __ext4_error(inode2->i_sb, function, 156 __ext4_error(inode2->i_sb, function, line,
157 "Both inodes should not be NULL: " 157 "Both inodes should not be NULL: "
158 "inode1 NULL inode2 %lu", inode2->i_ino); 158 "inode1 NULL inode2 %lu", inode2->i_ino);
159 ret = -EIO; 159 ret = -EIO;
160 } else if (inode2 == NULL) { 160 } else if (inode2 == NULL) {
161 __ext4_error(inode1->i_sb, function, 161 __ext4_error(inode1->i_sb, function, line,
162 "Both inodes should not be NULL: " 162 "Both inodes should not be NULL: "
163 "inode1 %lu inode2 NULL", inode1->i_ino); 163 "inode1 %lu inode2 NULL", inode1->i_ino);
164 ret = -EIO; 164 ret = -EIO;
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
249 */ 249 */
250 o_end->ee_block = end_ext->ee_block; 250 o_end->ee_block = end_ext->ee_block;
251 o_end->ee_len = end_ext->ee_len; 251 o_end->ee_len = end_ext->ee_len;
252 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 252 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
253 } 253 }
254 254
255 o_start->ee_len = start_ext->ee_len; 255 o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
276 */ 276 */
277 o_end->ee_block = end_ext->ee_block; 277 o_end->ee_block = end_ext->ee_block;
278 o_end->ee_len = end_ext->ee_len; 278 o_end->ee_len = end_ext->ee_len;
279 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 279 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
280 280
281 /* 281 /*
282 * Set 0 to the extent block if new_ext was 282 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
361 /* Insert new entry */ 361 /* Insert new entry */
362 if (new_ext->ee_len) { 362 if (new_ext->ee_len) {
363 o_start[i] = *new_ext; 363 o_start[i] = *new_ext;
364 ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); 364 ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
365 } 365 }
366 366
367 /* Insert end entry */ 367 /* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
488 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
489 489
490 new_ext.ee_block = cpu_to_le32(*from); 490 new_ext.ee_block = cpu_to_le32(*from);
491 ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); 491 ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
492 new_ext.ee_len = dext->ee_len; 492 new_ext.ee_len = dext->ee_len;
493 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 493 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
553 copy_extent_status(oext, &end_ext); 553 copy_extent_status(oext, &end_ext);
554 end_ext_alen = ext4_ext_get_actual_len(&end_ext); 554 end_ext_alen = ext4_ext_get_actual_len(&end_ext);
555 ext4_ext_store_pblock(&end_ext, 555 ext4_ext_store_pblock(&end_ext,
556 (ext_pblock(o_end) + oext_alen - end_ext_alen)); 556 (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
557 end_ext.ee_block = 557 end_ext.ee_block =
558 cpu_to_le32(le32_to_cpu(o_end->ee_block) + 558 cpu_to_le32(le32_to_cpu(o_end->ee_block) +
559 oext_alen - end_ext_alen); 559 oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
604 /* When tmp_dext is too large, pick up the target range. */ 604 /* When tmp_dext is too large, pick up the target range. */
605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
606 606
607 ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); 607 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
608 tmp_dext->ee_block = 608 tmp_dext->ee_block =
609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); 609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); 610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
613 tmp_dext->ee_len = cpu_to_le16(max_count); 613 tmp_dext->ee_len = cpu_to_le16(max_count);
614 614
615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); 615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
616 ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); 616 ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
617 617
618 /* Adjust extent length if donor extent is larger than orig */ 618 /* Adjust extent length if donor extent is larger than orig */
619 if (ext4_ext_get_actual_len(tmp_dext) > 619 if (ext4_ext_get_actual_len(tmp_dext) >
@@ -1084,7 +1084,7 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1084 1084
1085 BUG_ON(inode1 == NULL && inode2 == NULL); 1085 BUG_ON(inode1 == NULL && inode2 == NULL);
1086 1086
1087 ret = mext_check_null_inode(inode1, inode2, __func__); 1087 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1088 if (ret < 0) 1088 if (ret < 0)
1089 goto out; 1089 goto out;
1090 1090
@@ -1121,7 +1121,7 @@ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1121 1121
1122 BUG_ON(inode1 == NULL && inode2 == NULL); 1122 BUG_ON(inode1 == NULL && inode2 == NULL);
1123 1123
1124 ret = mext_check_null_inode(inode1, inode2, __func__); 1124 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1125 if (ret < 0) 1125 if (ret < 0)
1126 goto out; 1126 goto out;
1127 1127
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a43e6617b35..92203b8a099 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -179,30 +179,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
179static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 179static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
180 struct inode *inode); 180 struct inode *inode);
181 181
182unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
183{
184 unsigned len = le16_to_cpu(dlen);
185
186 if (len == EXT4_MAX_REC_LEN || len == 0)
187 return blocksize;
188 return (len & 65532) | ((len & 3) << 16);
189}
190
191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
192{
193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
194 BUG();
195 if (len < 65536)
196 return cpu_to_le16(len);
197 if (len == blocksize) {
198 if (blocksize == 65536)
199 return cpu_to_le16(EXT4_MAX_REC_LEN);
200 else
201 return cpu_to_le16(0);
202 }
203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
204}
205
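The two helpers removed above (they move to ext4.h as inline functions elsewhere in this series) squeeze a directory entry's rec_len into 16 bits even when the filesystem block size exceeds 64 KiB: a stored value of 0 or EXT4_MAX_REC_LEN means "to the end of the block", and the low two bits of the stored value carry bits 16-17 of the real length, which is why lengths must be multiples of four. A round-trip sketch of the arithmetic in plain userspace C (the len == blocksize special case is left out for brevity):

    #include <assert.h>
    #include <stdint.h>

    #define EXT4_MAX_REC_LEN ((1 << 16) - 1)        /* 65535, as above */

    /* Same arithmetic as the removed ext4_rec_len_to_disk()
     * for len < blocksize. */
    static uint16_t rec_len_to_disk(unsigned int len)
    {
            if (len < 65536)
                    return (uint16_t)len;
            return (uint16_t)((len & 65532) | ((len >> 16) & 3));
    }

    /* Same arithmetic as the removed ext4_rec_len_from_disk(). */
    static unsigned int rec_len_from_disk(uint16_t dlen, unsigned int blocksize)
    {
            unsigned int len = dlen;

            if (len == EXT4_MAX_REC_LEN || len == 0)
                    return blocksize;
            return (len & 65532) | ((len & 3) << 16);
    }

    int main(void)
    {
            /* 70000 = 0x11170: bit 16 is folded into the low two bits. */
            assert(rec_len_to_disk(70000) == 4465);
            assert(rec_len_from_disk(4465, 262144) == 70000);
            /* A record covering a whole 64 KiB block round-trips via 65535. */
            assert(rec_len_from_disk(EXT4_MAX_REC_LEN, 65536) == 65536);
            return 0;
    }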
206/* 182/*
207 * p is at least 6 bytes before the end of page 183 * p is at least 6 bytes before the end of page
208 */ 184 */
@@ -605,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
605 dir->i_sb->s_blocksize - 581 dir->i_sb->s_blocksize -
606 EXT4_DIR_REC_LEN(0)); 582 EXT4_DIR_REC_LEN(0));
607 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
608 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 584 if (!ext4_check_dir_entry(dir, de, bh,
609 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
610 +((char *)de - bh->b_data))) { 586 +((char *)de - bh->b_data))) {
611 /* On error, skip the f_pos to the next block. */ 587 /* On error, skip the f_pos to the next block. */
@@ -844,8 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
844 if ((char *) de + namelen <= dlimit && 820 if ((char *) de + namelen <= dlimit &&
845 ext4_match (namelen, name, de)) { 821 ext4_match (namelen, name, de)) {
846 /* found a match - just to be sure, do a full check */ 822 /* found a match - just to be sure, do a full check */
847 if (!ext4_check_dir_entry("ext4_find_entry", 823 if (!ext4_check_dir_entry(dir, de, bh, offset))
848 dir, de, bh, offset))
849 return -1; 824 return -1;
850 *res_dir = de; 825 *res_dir = de;
851 return 1; 826 return 1;
@@ -881,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
881 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 856 struct buffer_head *bh_use[NAMEI_RA_SIZE];
882 struct buffer_head *bh, *ret = NULL; 857 struct buffer_head *bh, *ret = NULL;
883 ext4_lblk_t start, block, b; 858 ext4_lblk_t start, block, b;
859 const u8 *name = d_name->name;
884 int ra_max = 0; /* Number of bh's in the readahead 860 int ra_max = 0; /* Number of bh's in the readahead
885 buffer, bh_use[] */ 861 buffer, bh_use[] */
886 int ra_ptr = 0; /* Current index into readahead 862 int ra_ptr = 0; /* Current index into readahead
@@ -895,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
895 namelen = d_name->len; 871 namelen = d_name->len;
896 if (namelen > EXT4_NAME_LEN) 872 if (namelen > EXT4_NAME_LEN)
897 return NULL; 873 return NULL;
874 if ((namelen <= 2) && (name[0] == '.') &&
875 (name[1] == '.' || name[1] == '\0')) {
876 /*
877 * "." or ".." will only be in the first block
878 * NFS may look up ".."; "." should be handled by the VFS
879 */
880 block = start = 0;
881 nblocks = 1;
882 goto restart;
883 }
898 if (is_dx(dir)) { 884 if (is_dx(dir)) {
899 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); 885 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
900 /* 886 /*
@@ -985,55 +971,35 @@ cleanup_and_exit:
985static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, 971static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
986 struct ext4_dir_entry_2 **res_dir, int *err) 972 struct ext4_dir_entry_2 **res_dir, int *err)
987{ 973{
988 struct super_block * sb; 974 struct super_block * sb = dir->i_sb;
989 struct dx_hash_info hinfo; 975 struct dx_hash_info hinfo;
990 u32 hash;
991 struct dx_frame frames[2], *frame; 976 struct dx_frame frames[2], *frame;
992 struct ext4_dir_entry_2 *de, *top;
993 struct buffer_head *bh; 977 struct buffer_head *bh;
994 ext4_lblk_t block; 978 ext4_lblk_t block;
995 int retval; 979 int retval;
996 int namelen = d_name->len;
997 const u8 *name = d_name->name;
998 980
999 sb = dir->i_sb; 981 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
1000 /* NFS may look up ".." - look at dx_root directory block */ 982 return NULL;
1001 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
1002 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
1003 return NULL;
1004 } else {
1005 frame = frames;
1006 frame->bh = NULL; /* for dx_release() */
1007 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
1008 dx_set_block(frame->at, 0); /* dx_root block is 0 */
1009 }
1010 hash = hinfo.hash;
1011 do { 983 do {
1012 block = dx_get_block(frame->at); 984 block = dx_get_block(frame->at);
1013 if (!(bh = ext4_bread (NULL,dir, block, 0, err))) 985 if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
1014 goto errout; 986 goto errout;
1015 de = (struct ext4_dir_entry_2 *) bh->b_data;
1016 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
1017 EXT4_DIR_REC_LEN(0));
1018 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
1019 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
1020 + ((char *) de - bh->b_data);
1021
1022 if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
1023 brelse(bh);
1024 *err = ERR_BAD_DX_DIR;
1025 goto errout;
1026 }
1027 987
1028 if (ext4_match(namelen, name, de)) { 988 retval = search_dirblock(bh, dir, d_name,
1029 *res_dir = de; 989 block << EXT4_BLOCK_SIZE_BITS(sb),
1030 dx_release(frames); 990 res_dir);
1031 return bh; 991 if (retval == 1) { /* Success! */
1032 } 992 dx_release(frames);
993 return bh;
1033 } 994 }
1034 brelse(bh); 995 brelse(bh);
996 if (retval == -1) {
997 *err = ERR_BAD_DX_DIR;
998 goto errout;
999 }
1000
1035 /* Check to see if we should continue to search */ 1001 /* Check to see if we should continue to search */
1036 retval = ext4_htree_next_block(dir, hash, frame, 1002 retval = ext4_htree_next_block(dir, hinfo.hash, frame,
1037 frames, NULL); 1003 frames, NULL);
1038 if (retval < 0) { 1004 if (retval < 0) {
1039 ext4_warning(sb, 1005 ext4_warning(sb,
@@ -1088,7 +1054,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1088struct dentry *ext4_get_parent(struct dentry *child) 1054struct dentry *ext4_get_parent(struct dentry *child)
1089{ 1055{
1090 __u32 ino; 1056 __u32 ino;
1091 struct inode *inode;
1092 static const struct qstr dotdot = { 1057 static const struct qstr dotdot = {
1093 .name = "..", 1058 .name = "..",
1094 .len = 2, 1059 .len = 2,
@@ -1097,7 +1062,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
1097 struct buffer_head *bh; 1062 struct buffer_head *bh;
1098 1063
1099 bh = ext4_find_entry(child->d_inode, &dotdot, &de); 1064 bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1100 inode = NULL;
1101 if (!bh) 1065 if (!bh)
1102 return ERR_PTR(-ENOENT); 1066 return ERR_PTR(-ENOENT);
1103 ino = le32_to_cpu(de->inode); 1067 ino = le32_to_cpu(de->inode);
@@ -1305,8 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1305 de = (struct ext4_dir_entry_2 *)bh->b_data; 1269 de = (struct ext4_dir_entry_2 *)bh->b_data;
1306 top = bh->b_data + blocksize - reclen; 1270 top = bh->b_data + blocksize - reclen;
1307 while ((char *) de <= top) { 1271 while ((char *) de <= top) {
1308 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1272 if (!ext4_check_dir_entry(dir, de, bh, offset))
1309 bh, offset))
1310 return -EIO; 1273 return -EIO;
1311 if (ext4_match(namelen, name, de)) 1274 if (ext4_match(namelen, name, de))
1312 return -EEXIST; 1275 return -EEXIST;
@@ -1673,7 +1636,7 @@ static int ext4_delete_entry(handle_t *handle,
1673 pde = NULL; 1636 pde = NULL;
1674 de = (struct ext4_dir_entry_2 *) bh->b_data; 1637 de = (struct ext4_dir_entry_2 *) bh->b_data;
1675 while (i < bh->b_size) { 1638 while (i < bh->b_size) {
1676 if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i)) 1639 if (!ext4_check_dir_entry(dir, de, bh, i))
1677 return -EIO; 1640 return -EIO;
1678 if (de == de_del) { 1641 if (de == de_del) {
1679 BUFFER_TRACE(bh, "get_write_access"); 1642 BUFFER_TRACE(bh, "get_write_access");
@@ -1956,7 +1919,7 @@ static int empty_dir(struct inode *inode)
1956 } 1919 }
1957 de = (struct ext4_dir_entry_2 *) bh->b_data; 1920 de = (struct ext4_dir_entry_2 *) bh->b_data;
1958 } 1921 }
1959 if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) { 1922 if (!ext4_check_dir_entry(inode, de, bh, offset)) {
1960 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1923 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1961 sb->s_blocksize); 1924 sb->s_blocksize);
1962 offset = (offset | (sb->s_blocksize - 1)) + 1; 1925 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2340,7 +2303,7 @@ retry:
2340 2303
2341 inode->i_ctime = ext4_current_time(inode); 2304 inode->i_ctime = ext4_current_time(inode);
2342 ext4_inc_count(handle, inode); 2305 ext4_inc_count(handle, inode);
2343 atomic_inc(&inode->i_count); 2306 ihold(inode);
2344 2307
2345 err = ext4_add_entry(handle, dentry, inode); 2308 err = ext4_add_entry(handle, dentry, inode);
2346 if (!err) { 2309 if (!err) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 00000000000..46a7d6a9d97
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,430 @@
1/*
2 * linux/fs/ext4/page-io.c
3 *
4 * This contains the new page_io functions for ext4
5 *
6 * Written by Theodore Ts'o, 2010.
7 */
8
9#include <linux/module.h>
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/jbd2.h>
13#include <linux/highuid.h>
14#include <linux/pagemap.h>
15#include <linux/quotaops.h>
16#include <linux/string.h>
17#include <linux/buffer_head.h>
18#include <linux/writeback.h>
19#include <linux/pagevec.h>
20#include <linux/mpage.h>
21#include <linux/namei.h>
22#include <linux/uio.h>
23#include <linux/bio.h>
24#include <linux/workqueue.h>
25#include <linux/kernel.h>
26#include <linux/slab.h>
27
28#include "ext4_jbd2.h"
29#include "xattr.h"
30#include "acl.h"
31#include "ext4_extents.h"
32
33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34
35int __init ext4_init_pageio(void)
36{
37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
38 if (io_page_cachep == NULL)
39 return -ENOMEM;
40 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
41 if (io_end_cachep == NULL) {
42 kmem_cache_destroy(io_page_cachep);
43 return -ENOMEM;
44 }
45
46 return 0;
47}
48
49void ext4_exit_pageio(void)
50{
51 kmem_cache_destroy(io_end_cachep);
52 kmem_cache_destroy(io_page_cachep);
53}
54
55void ext4_free_io_end(ext4_io_end_t *io)
56{
57 int i;
58
59 BUG_ON(!io);
60 if (io->page)
61 put_page(io->page);
62 for (i = 0; i < io->num_io_pages; i++) {
63 if (--io->pages[i]->p_count == 0) {
64 struct page *page = io->pages[i]->p_page;
65
66 end_page_writeback(page);
67 put_page(page);
68 kmem_cache_free(io_page_cachep, io->pages[i]);
69 }
70 }
71 io->num_io_pages = 0;
72 iput(io->inode);
73 kmem_cache_free(io_end_cachep, io);
74}
75
76/*
77 * check a range of space and convert unwritten extents to written.
78 */
79int ext4_end_io_nolock(ext4_io_end_t *io)
80{
81 struct inode *inode = io->inode;
82 loff_t offset = io->offset;
83 ssize_t size = io->size;
84 int ret = 0;
85
86 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu, list->next 0x%p,"
87 "list->prev 0x%p\n",
88 io, inode->i_ino, io->list.next, io->list.prev);
89
90 if (list_empty(&io->list))
91 return ret;
92
93 if (!(io->flag & EXT4_IO_END_UNWRITTEN))
94 return ret;
95
96 ret = ext4_convert_unwritten_extents(inode, offset, size);
97 if (ret < 0) {
98 printk(KERN_EMERG "%s: failed to convert unwritten "
99 "extents to written extents, error is %d "
100 "io is still on inode %lu aio dio list\n",
101 __func__, ret, inode->i_ino);
102 return ret;
103 }
104
105 if (io->iocb)
106 aio_complete(io->iocb, io->result, 0);
107 /* clear the DIO AIO unwritten flag */
108 io->flag &= ~EXT4_IO_END_UNWRITTEN;
109 return ret;
110}
111
112/*
113 * work on completed aio dio IO, to convert unwritten extents to extents
114 */
115static void ext4_end_io_work(struct work_struct *work)
116{
117 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
118 struct inode *inode = io->inode;
119 struct ext4_inode_info *ei = EXT4_I(inode);
120 unsigned long flags;
121 int ret;
122
123 mutex_lock(&inode->i_mutex);
124 ret = ext4_end_io_nolock(io);
125 if (ret < 0) {
126 mutex_unlock(&inode->i_mutex);
127 return;
128 }
129
130 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
131 if (!list_empty(&io->list))
132 list_del_init(&io->list);
133 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
134 mutex_unlock(&inode->i_mutex);
135 ext4_free_io_end(io);
136}
137
138ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
139{
140 ext4_io_end_t *io = NULL;
141
142 io = kmem_cache_alloc(io_end_cachep, flags);
143 if (io) {
144 memset(io, 0, sizeof(*io));
145 io->inode = igrab(inode);
146 BUG_ON(!io->inode);
147 INIT_WORK(&io->work, ext4_end_io_work);
148 INIT_LIST_HEAD(&io->list);
149 }
150 return io;
151}
152
153/*
154 * Print a buffer I/O error compatible with fs/buffer.c. This
155 * provides compatibility with dmesg scrapers that look for a specific
156 * buffer I/O error message. We really need a unified error reporting
157 * structure to userspace ala Digital Unix's uerf system, but it's
158 * probably not going to happen in my lifetime, due to LKML politics...
159 */
160static void buffer_io_error(struct buffer_head *bh)
161{
162 char b[BDEVNAME_SIZE];
163 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
164 bdevname(bh->b_bdev, b),
165 (unsigned long long)bh->b_blocknr);
166}
167
168static void ext4_end_bio(struct bio *bio, int error)
169{
170 ext4_io_end_t *io_end = bio->bi_private;
171 struct workqueue_struct *wq;
172 struct inode *inode;
173 unsigned long flags;
174 ext4_fsblk_t err_block;
175 int i;
176
177 BUG_ON(!io_end);
178 inode = io_end->inode;
179 bio->bi_private = NULL;
180 bio->bi_end_io = NULL;
181 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
182 error = 0;
183 err_block = bio->bi_sector >> (inode->i_blkbits - 9);
184 bio_put(bio);
185
186 if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
187 pr_err("sb umounted, discard end_io request for inode %lu\n",
188 io_end->inode->i_ino);
189 ext4_free_io_end(io_end);
190 return;
191 }
192
193 if (error) {
194 io_end->flag |= EXT4_IO_END_ERROR;
195 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
196 "(offset %llu size %ld starting block %llu)",
197 inode->i_ino,
198 (unsigned long long) io_end->offset,
199 (long) io_end->size,
200 (unsigned long long) err_block);
201 }
202
203 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head;
206 int partial_write = 0;
207
208 head = page_buffers(page);
209 if (error)
210 SetPageError(page);
211 BUG_ON(!head);
212 if (head->b_size == PAGE_CACHE_SIZE)
213 clear_buffer_dirty(head);
214 else {
215 loff_t offset;
216 loff_t io_end_offset = io_end->offset + io_end->size;
217
218 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
219 bh = head;
220 do {
221 if ((offset >= io_end->offset) &&
222 (offset+bh->b_size <= io_end_offset)) {
223 if (error)
224 buffer_io_error(bh);
225
226 clear_buffer_dirty(bh);
227 }
228 if (buffer_delay(bh))
229 partial_write = 1;
230 else if (!buffer_mapped(bh))
231 clear_buffer_dirty(bh);
232 else if (buffer_dirty(bh))
233 partial_write = 1;
234 offset += bh->b_size;
235 bh = bh->b_this_page;
236 } while (bh != head);
237 }
238
239 if (--io_end->pages[i]->p_count == 0) {
240 struct page *page = io_end->pages[i]->p_page;
241
242 end_page_writeback(page);
243 put_page(page);
244 kmem_cache_free(io_page_cachep, io_end->pages[i]);
245 }
246
247 /*
248 * If this is a partial write which happened to make
249 * all buffers uptodate then we can optimize away a
250 * bogus readpage() for the next read(). Here we
251 * 'discover' whether the page went uptodate as a
252 * result of this (potentially partial) write.
253 */
254 if (!partial_write)
255 SetPageUptodate(page);
256 }
257
258 io_end->num_io_pages = 0;
259
260 /* Add the io_end to per-inode completed io list*/
261 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
262 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
263 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
264
265 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
266 /* queue the work to convert unwritten extents to written */
267 queue_work(wq, &io_end->work);
268}
269
270void ext4_io_submit(struct ext4_io_submit *io)
271{
272 struct bio *bio = io->io_bio;
273
274 if (bio) {
275 bio_get(io->io_bio);
276 submit_bio(io->io_op, io->io_bio);
277 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
278 bio_put(io->io_bio);
279 }
280 io->io_bio = NULL;
281 io->io_op = 0;
282 io->io_end = NULL;
283}
284
285static int io_submit_init(struct ext4_io_submit *io,
286 struct inode *inode,
287 struct writeback_control *wbc,
288 struct buffer_head *bh)
289{
290 ext4_io_end_t *io_end;
291 struct page *page = bh->b_page;
292 int nvecs = bio_get_nr_vecs(bh->b_bdev);
293 struct bio *bio;
294
295 io_end = ext4_init_io_end(inode, GFP_NOFS);
296 if (!io_end)
297 return -ENOMEM;
298 do {
299 bio = bio_alloc(GFP_NOIO, nvecs);
300 nvecs >>= 1;
301 } while (bio == NULL);
302
303 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
304 bio->bi_bdev = bh->b_bdev;
305 bio->bi_private = io->io_end = io_end;
306 bio->bi_end_io = ext4_end_bio;
307
308 io_end->inode = inode;
309 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
310
311 io->io_bio = bio;
312 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
313 WRITE_SYNC_PLUG : WRITE);
314 io->io_next_block = bh->b_blocknr;
315 return 0;
316}
317
318static int io_submit_add_bh(struct ext4_io_submit *io,
319 struct ext4_io_page *io_page,
320 struct inode *inode,
321 struct writeback_control *wbc,
322 struct buffer_head *bh)
323{
324 ext4_io_end_t *io_end;
325 int ret;
326
327 if (buffer_new(bh)) {
328 clear_buffer_new(bh);
329 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
330 }
331
332 if (!buffer_mapped(bh) || buffer_delay(bh)) {
333 if (!buffer_mapped(bh))
334 clear_buffer_dirty(bh);
335 if (io->io_bio)
336 ext4_io_submit(io);
337 return 0;
338 }
339
340 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
341submit_and_retry:
342 ext4_io_submit(io);
343 }
344 if (io->io_bio == NULL) {
345 ret = io_submit_init(io, inode, wbc, bh);
346 if (ret)
347 return ret;
348 }
349 io_end = io->io_end;
350 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
351 (io_end->pages[io_end->num_io_pages-1] != io_page))
352 goto submit_and_retry;
353 if (buffer_uninit(bh))
354 io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
355 io->io_end->size += bh->b_size;
356 io->io_next_block++;
357 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
358 if (ret != bh->b_size)
359 goto submit_and_retry;
360 if ((io_end->num_io_pages == 0) ||
361 (io_end->pages[io_end->num_io_pages-1] != io_page)) {
362 io_end->pages[io_end->num_io_pages++] = io_page;
363 io_page->p_count++;
364 }
365 return 0;
366}
367
368int ext4_bio_write_page(struct ext4_io_submit *io,
369 struct page *page,
370 int len,
371 struct writeback_control *wbc)
372{
373 struct inode *inode = page->mapping->host;
374 unsigned block_start, block_end, blocksize;
375 struct ext4_io_page *io_page;
376 struct buffer_head *bh, *head;
377 int ret = 0;
378
379 blocksize = 1 << inode->i_blkbits;
380
381 BUG_ON(PageWriteback(page));
382 set_page_writeback(page);
383 ClearPageError(page);
384
385 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
386 if (!io_page) {
387 set_page_dirty(page);
388 unlock_page(page);
389 return -ENOMEM;
390 }
391 io_page->p_page = page;
392 io_page->p_count = 0;
393 get_page(page);
394
395 for (bh = head = page_buffers(page), block_start = 0;
396 bh != head || !block_start;
397 block_start = block_end, bh = bh->b_this_page) {
398 block_end = block_start + blocksize;
399 if (block_start >= len) {
400 clear_buffer_dirty(bh);
401 set_buffer_uptodate(bh);
402 continue;
403 }
404 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
405 if (ret) {
406 /*
407 * We only get here on ENOMEM. Not much else
408 * we can do but mark the page as dirty, and
409 * better luck next time.
410 */
411 set_page_dirty(page);
412 break;
413 }
414 }
415 unlock_page(page);
416 /*
417 * If the page was truncated before we could do the writeback,
418 * or we had a memory allocation error while trying to write
419 * the first buffer head, we won't have submitted any pages for
420 * I/O. In that case we need to make sure we've cleared the
421 * PageWriteback bit from the page to prevent the system from
422 * wedging later on.
423 */
424 if (io_page->p_count == 0) {
425 put_page(page);
426 end_page_writeback(page);
427 kmem_cache_free(io_page_cachep, io_page);
428 }
429 return ret;
430}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 6df797eb9ae..dc963929de6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -226,23 +226,13 @@ static int setup_new_group_blocks(struct super_block *sb,
226 } 226 }
227 227
228 /* Zero out all of the reserved backup group descriptor table blocks */ 228 /* Zero out all of the reserved backup group descriptor table blocks */
229 for (i = 0, bit = gdblocks + 1, block = start + bit; 229 ext4_debug("clear reserved backup GDT blocks %#04llx (+%d)\n",
230 i < reserved_gdb; i++, block++, bit++) { 230 gdblocks + start + 1, reserved_gdb);
231 struct buffer_head *gdb; 231 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
232 232 GFP_NOFS);
233 ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); 233 if (err)
234 234 goto exit_bh;
235 if ((err = extend_or_restart_transaction(handle, 1, bh)))
236 goto exit_bh;
237 235
238 if (IS_ERR(gdb = bclean(handle, sb, block))) {
239 err = PTR_ERR(gdb);
240 goto exit_bh;
241 }
242 ext4_handle_dirty_metadata(handle, NULL, gdb);
243 ext4_set_bit(bit, bh->b_data);
244 brelse(gdb);
245 }
246 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 236 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
247 input->block_bitmap - start); 237 input->block_bitmap - start);
248 ext4_set_bit(input->block_bitmap - start, bh->b_data); 238 ext4_set_bit(input->block_bitmap - start, bh->b_data);
@@ -251,28 +241,18 @@ static int setup_new_group_blocks(struct super_block *sb,
251 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 241 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
252 242
253 /* Zero out all of the inode table blocks */ 243 /* Zero out all of the inode table blocks */
254 for (i = 0, block = input->inode_table, bit = block - start; 244 block = input->inode_table;
255 i < sbi->s_itb_per_group; i++, bit++, block++) { 245 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
256 struct buffer_head *it; 246 block, sbi->s_itb_per_group);
257 247 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
258 ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); 248 if (err)
259 249 goto exit_bh;
260 if ((err = extend_or_restart_transaction(handle, 1, bh)))
261 goto exit_bh;
262
263 if (IS_ERR(it = bclean(handle, sb, block))) {
264 err = PTR_ERR(it);
265 goto exit_bh;
266 }
267 ext4_handle_dirty_metadata(handle, NULL, it);
268 brelse(it);
269 ext4_set_bit(bit, bh->b_data);
270 }
271 250
272 if ((err = extend_or_restart_transaction(handle, 2, bh))) 251 if ((err = extend_or_restart_transaction(handle, 2, bh)))
273 goto exit_bh; 252 goto exit_bh;
274 253
275 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); 254 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
255 bh->b_data);
276 ext4_handle_dirty_metadata(handle, NULL, bh); 256 ext4_handle_dirty_metadata(handle, NULL, bh);
277 brelse(bh); 257 brelse(bh);
278 /* Mark unused entries in inode bitmap used */ 258 /* Mark unused entries in inode bitmap used */
@@ -283,8 +263,8 @@ static int setup_new_group_blocks(struct super_block *sb,
283 goto exit_journal; 263 goto exit_journal;
284 } 264 }
285 265
286 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 266 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
287 bh->b_data); 267 bh->b_data);
288 ext4_handle_dirty_metadata(handle, NULL, bh); 268 ext4_handle_dirty_metadata(handle, NULL, bh);
289exit_bh: 269exit_bh:
290 brelse(bh); 270 brelse(bh);
@@ -921,8 +901,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
921 &sbi->s_flex_groups[flex_group].free_inodes); 901 &sbi->s_flex_groups[flex_group].free_inodes);
922 } 902 }
923 903
924 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 904 ext4_handle_dirty_super(handle, sb);
925 sb->s_dirt = 1;
926 905
927exit_journal: 906exit_journal:
928 mutex_unlock(&sbi->s_resize_lock); 907 mutex_unlock(&sbi->s_resize_lock);
@@ -953,7 +932,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
953 ext4_fsblk_t n_blocks_count) 932 ext4_fsblk_t n_blocks_count)
954{ 933{
955 ext4_fsblk_t o_blocks_count; 934 ext4_fsblk_t o_blocks_count;
956 ext4_group_t o_groups_count;
957 ext4_grpblk_t last; 935 ext4_grpblk_t last;
958 ext4_grpblk_t add; 936 ext4_grpblk_t add;
959 struct buffer_head *bh; 937 struct buffer_head *bh;
@@ -965,7 +943,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
965 * yet: we're going to revalidate es->s_blocks_count after 943 * yet: we're going to revalidate es->s_blocks_count after
966 * taking the s_resize_lock below. */ 944 * taking the s_resize_lock below. */
967 o_blocks_count = ext4_blocks_count(es); 945 o_blocks_count = ext4_blocks_count(es);
968 o_groups_count = EXT4_SB(sb)->s_groups_count;
969 946
970 if (test_opt(sb, DEBUG)) 947 if (test_opt(sb, DEBUG))
971 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", 948 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
@@ -1045,13 +1022,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1045 goto exit_put; 1022 goto exit_put;
1046 } 1023 }
1047 ext4_blocks_count_set(es, o_blocks_count + add); 1024 ext4_blocks_count_set(es, o_blocks_count + add);
1048 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1049 sb->s_dirt = 1;
1050 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1025 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1051 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1026 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1052 o_blocks_count + add); 1027 o_blocks_count + add);
1053 /* We add the blocks to the bitmap and set the group need init bit */ 1028 /* We add the blocks to the bitmap and set the group need init bit */
1054 ext4_add_groupblocks(handle, sb, o_blocks_count, add); 1029 ext4_add_groupblocks(handle, sb, o_blocks_count, add);
1030 ext4_handle_dirty_super(handle, sb);
1055 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1031 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1056 o_blocks_count + add); 1032 o_blocks_count + add);
1057 if ((err = ext4_journal_stop(handle))) 1033 if ((err = ext4_journal_stop(handle)))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4e8983a9811..40131b777af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/blkdev.h> 27#include <linux/blkdev.h>
28#include <linux/parser.h> 28#include <linux/parser.h>
29#include <linux/smp_lock.h>
30#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 30#include <linux/exportfs.h>
32#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -41,6 +40,9 @@
41#include <linux/crc16.h> 40#include <linux/crc16.h>
42#include <asm/uaccess.h> 41#include <asm/uaccess.h>
43 42
43#include <linux/kthread.h>
44#include <linux/freezer.h>
45
44#include "ext4.h" 46#include "ext4.h"
45#include "ext4_jbd2.h" 47#include "ext4_jbd2.h"
46#include "xattr.h" 48#include "xattr.h"
@@ -50,8 +52,11 @@
50#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 53#include <trace/events/ext4.h>
52 54
53struct proc_dir_entry *ext4_proc_root; 55static struct proc_dir_entry *ext4_proc_root;
54static struct kset *ext4_kset; 56static struct kset *ext4_kset;
57struct ext4_lazy_init *ext4_li_info;
58struct mutex ext4_li_mtx;
59struct ext4_features *ext4_feat;
55 60
56static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 61static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
57 unsigned long journal_devnum); 62 unsigned long journal_devnum);
@@ -68,14 +73,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 73static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 74static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt); 77 const char *dev_name, void *data);
78static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb);
73 80
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = { 82static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE, 83 .owner = THIS_MODULE,
77 .name = "ext3", 84 .name = "ext3",
78 .get_sb = ext4_get_sb, 85 .mount = ext4_mount,
79 .kill_sb = kill_block_super, 86 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV, 87 .fs_flags = FS_REQUIRES_DEV,
81}; 88};
@@ -241,14 +248,14 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
241 if (sb->s_flags & MS_RDONLY) 248 if (sb->s_flags & MS_RDONLY)
242 return ERR_PTR(-EROFS); 249 return ERR_PTR(-EROFS);
243 250
244 vfs_check_frozen(sb, SB_FREEZE_WRITE); 251 vfs_check_frozen(sb, SB_FREEZE_TRANS);
245 /* Special case here: if the journal has aborted behind our 252 /* Special case here: if the journal has aborted behind our
246 * backs (eg. EIO in the commit thread), then we still need to 253 * backs (eg. EIO in the commit thread), then we still need to
247 * take the FS itself readonly cleanly. */ 254 * take the FS itself readonly cleanly. */
248 journal = EXT4_SB(sb)->s_journal; 255 journal = EXT4_SB(sb)->s_journal;
249 if (journal) { 256 if (journal) {
250 if (is_journal_aborted(journal)) { 257 if (is_journal_aborted(journal)) {
251 ext4_abort(sb, __func__, "Detected aborted journal"); 258 ext4_abort(sb, "Detected aborted journal");
252 return ERR_PTR(-EROFS); 259 return ERR_PTR(-EROFS);
253 } 260 }
254 return jbd2_journal_start(journal, nblocks); 261 return jbd2_journal_start(journal, nblocks);
@@ -262,7 +269,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
262 * that sync() will call the filesystem's write_super callback if 269 * that sync() will call the filesystem's write_super callback if
263 * appropriate. 270 * appropriate.
264 */ 271 */
265int __ext4_journal_stop(const char *where, handle_t *handle) 272int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
266{ 273{
267 struct super_block *sb; 274 struct super_block *sb;
268 int err; 275 int err;
@@ -279,12 +286,13 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
279 if (!err) 286 if (!err)
280 err = rc; 287 err = rc;
281 if (err) 288 if (err)
282 __ext4_std_error(sb, where, err); 289 __ext4_std_error(sb, where, line, err);
283 return err; 290 return err;
284} 291}
285 292
286void ext4_journal_abort_handle(const char *caller, const char *err_fn, 293void ext4_journal_abort_handle(const char *caller, unsigned int line,
287 struct buffer_head *bh, handle_t *handle, int err) 294 const char *err_fn, struct buffer_head *bh,
295 handle_t *handle, int err)
288{ 296{
289 char nbuf[16]; 297 char nbuf[16];
290 const char *errstr = ext4_decode_error(NULL, err, nbuf); 298 const char *errstr = ext4_decode_error(NULL, err, nbuf);
@@ -300,12 +308,47 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 	if (is_handle_aborted(handle))
 		return;
 
-	printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
-	       caller, errstr, err_fn);
+	printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
+	       caller, line, errstr, err_fn);
 
 	jbd2_journal_abort_handle(handle);
 }
 
+static void __save_error_info(struct super_block *sb, const char *func,
+			      unsigned int line)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+	es->s_last_error_time = cpu_to_le32(get_seconds());
+	strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
+	es->s_last_error_line = cpu_to_le32(line);
+	if (!es->s_first_error_time) {
+		es->s_first_error_time = es->s_last_error_time;
+		strncpy(es->s_first_error_func, func,
+			sizeof(es->s_first_error_func));
+		es->s_first_error_line = cpu_to_le32(line);
+		es->s_first_error_ino = es->s_last_error_ino;
+		es->s_first_error_block = es->s_last_error_block;
+	}
+	/*
+	 * Start the daily error reporting function if it hasn't been
+	 * started already
+	 */
+	if (!es->s_error_count)
+		mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
+	es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
+}
+
+static void save_error_info(struct super_block *sb, const char *func,
+			    unsigned int line)
+{
+	__save_error_info(sb, func, line);
+	ext4_commit_super(sb, 1);
+}
+
+
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
  *
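The new __save_error_info() keeps two records in the on-disk superblock: the first error ever seen, written only once, and the most recent error, overwritten every time. A minimal userspace sketch of the same bookkeeping, with a plain struct standing in for ext4_super_block and time() for get_seconds() (all names here are illustrative, not the kernel API):

	#include <stdio.h>
	#include <string.h>
	#include <time.h>
	#include <stdint.h>

	/* Stand-in for the error fields added to ext4_super_block. */
	struct sb_errors {
		uint32_t first_error_time, last_error_time;
		uint32_t first_error_line, last_error_line;
		char first_error_func[32], last_error_func[32];
		uint32_t error_count;
	};

	static void save_error_info(struct sb_errors *es, const char *func,
				    unsigned int line)
	{
		es->last_error_time = (uint32_t)time(NULL);
		strncpy(es->last_error_func, func, sizeof(es->last_error_func));
		es->last_error_line = line;
		if (!es->first_error_time) {	/* record the very first error once */
			es->first_error_time = es->last_error_time;
			strncpy(es->first_error_func, func,
				sizeof(es->first_error_func));
			es->first_error_line = line;
		}
		es->error_count++;
	}

	int main(void)
	{
		struct sb_errors es = { 0 };

		save_error_info(&es, "ext4_map_blocks", 123);
		save_error_info(&es, "ext4_lookup", 456);
		printf("%u errors, first at %s:%u, last at %s:%u\n",
		       es.error_count, es.first_error_func, es.first_error_line,
		       es.last_error_func, es.last_error_line);
		return 0;
	}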
@@ -323,11 +366,6 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 
 static void ext4_handle_error(struct super_block *sb)
 {
-	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
-	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
@@ -342,19 +380,19 @@ static void ext4_handle_error(struct super_block *sb)
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
 	}
-	ext4_commit_super(sb, 1);
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT4-fs (device %s): panic forced after error\n",
 			sb->s_id);
 }
 
 void __ext4_error(struct super_block *sb, const char *function,
-		const char *fmt, ...)
+		  unsigned int line, const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
+	       sb->s_id, function, line, current->comm);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
@@ -362,14 +400,22 @@ void __ext4_error(struct super_block *sb, const char *function,
 	ext4_handle_error(sb);
 }
 
-void ext4_error_inode(const char *function, struct inode *inode,
+void ext4_error_inode(struct inode *inode, const char *function,
+		      unsigned int line, ext4_fsblk_t block,
 		      const char *fmt, ...)
 {
 	va_list args;
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 
+	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+	es->s_last_error_block = cpu_to_le64(block);
+	save_error_info(inode->i_sb, function, line);
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
-	       inode->i_sb->s_id, function, inode->i_ino, current->comm);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
+	       inode->i_sb->s_id, function, line, inode->i_ino);
+	if (block)
+		printk("block %llu: ", block);
+	printk("comm %s: ", current->comm);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
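Every reporting helper above gains an unsigned int line argument. Callers do not pass it by hand; wrapper macros splice in __func__ and __LINE__ at each call site. A self-contained sketch of that pattern (my_error() and report() are invented names, and ##__VA_ARGS__ assumes a GNU-compatible compiler):

	#include <stdarg.h>
	#include <stdio.h>

	static void report(const char *function, unsigned int line,
			   const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		printf("error: %s:%u: ", function, line);
		vprintf(fmt, args);
		printf("\n");
		va_end(args);
	}

	/* Callers write my_error("bad checksum %u", c); the macro adds
	 * the function name and line number automatically. */
	#define my_error(fmt, ...) \
		report(__func__, __LINE__, fmt, ##__VA_ARGS__)

	int main(void)
	{
		my_error("bad checksum %u", 42);
		return 0;
	}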
@@ -377,20 +423,26 @@ void ext4_error_inode(const char *function, struct inode *inode,
 	ext4_handle_error(inode->i_sb);
 }
 
-void ext4_error_file(const char *function, struct file *file,
-		     const char *fmt, ...)
+void ext4_error_file(struct file *file, const char *function,
+		     unsigned int line, const char *fmt, ...)
 {
 	va_list args;
+	struct ext4_super_block *es;
 	struct inode *inode = file->f_dentry->d_inode;
 	char pathname[80], *path;
 
+	es = EXT4_SB(inode->i_sb)->s_es;
+	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+	save_error_info(inode->i_sb, function, line);
 	va_start(args, fmt);
 	path = d_path(&(file->f_path), pathname, sizeof(pathname));
 	if (!path)
 		path = "(unknown)";
 	printk(KERN_CRIT
-	       "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
-	       inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
+	       "EXT4-fs error (device %s): %s:%d: inode #%lu "
+	       "(comm %s path %s): ",
+	       inode->i_sb->s_id, function, line, inode->i_ino,
+	       current->comm, path);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
@@ -435,7 +487,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
 /* __ext4_std_error decodes expected errors from journaling functions
  * automatically and invokes the appropriate error response. */
 
-void __ext4_std_error(struct super_block *sb, const char *function, int errno)
+void __ext4_std_error(struct super_block *sb, const char *function,
+		      unsigned int line, int errno)
 {
 	char nbuf[16];
 	const char *errstr;
@@ -448,8 +501,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
 		return;
 
 	errstr = ext4_decode_error(sb, errno, nbuf);
-	printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
-	       sb->s_id, function, errstr);
+	printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
+	       sb->s_id, function, line, errstr);
+	save_error_info(sb, function, line);
 
 	ext4_handle_error(sb);
 }
@@ -464,29 +518,29 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
  * case we take the easy way out and panic immediately.
  */
 
-void ext4_abort(struct super_block *sb, const char *function,
-		const char *fmt, ...)
+void __ext4_abort(struct super_block *sb, const char *function,
+		unsigned int line, const char *fmt, ...)
 {
 	va_list args;
 
+	save_error_info(sb, function, line);
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
+	       function, line);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
 
+	if ((sb->s_flags & MS_RDONLY) == 0) {
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+		sb->s_flags |= MS_RDONLY;
+		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		if (EXT4_SB(sb)->s_journal)
+			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+		save_error_info(sb, function, line);
+	}
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT4-fs panic from previous error\n");
-
-	if (sb->s_flags & MS_RDONLY)
-		return;
-
-	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-	sb->s_flags |= MS_RDONLY;
-	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
-	if (EXT4_SB(sb)->s_journal)
-		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
 void ext4_msg (struct super_block * sb, const char *prefix,
@@ -502,38 +556,47 @@ void ext4_msg (struct super_block * sb, const char *prefix,
 }
 
 void __ext4_warning(struct super_block *sb, const char *function,
-		const char *fmt, ...)
+		    unsigned int line, const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
-	       sb->s_id, function);
+	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
+	       sb->s_id, function, line);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
 }
 
-void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
-			   const char *function, const char *fmt, ...)
+void __ext4_grp_locked_error(const char *function, unsigned int line,
+			     struct super_block *sb, ext4_group_t grp,
+			     unsigned long ino, ext4_fsblk_t block,
+			     const char *fmt, ...)
 __releases(bitlock)
 __acquires(bitlock)
 {
 	va_list args;
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 
+	es->s_last_error_ino = cpu_to_le32(ino);
+	es->s_last_error_block = cpu_to_le64(block);
+	__save_error_info(sb, function, line);
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
+	       sb->s_id, function, line, grp);
+	if (ino)
+		printk("inode %lu: ", ino);
+	if (block)
+		printk("block %llu:", (unsigned long long) block);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
 
 	if (test_opt(sb, ERRORS_CONT)) {
-		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
 		ext4_commit_super(sb, 0);
 		return;
 	}
+
 	ext4_unlock_group(sb, grp);
 	ext4_handle_error(sb);
 	/*
@@ -646,13 +709,13 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	ext4_unregister_li_request(sb);
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	flush_workqueue(sbi->dio_unwritten_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);
 
 	lock_super(sb);
-	lock_kernel();
 	if (sb->s_dirt)
 		ext4_commit_super(sb, 1);
 
@@ -660,10 +723,10 @@ static void ext4_put_super(struct super_block *sb)
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 		if (err < 0)
-			ext4_abort(sb, __func__,
-				   "Couldn't clean up the journal");
+			ext4_abort(sb, "Couldn't clean up the journal");
 	}
 
+	del_timer(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
@@ -720,7 +783,6 @@ static void ext4_put_super(struct super_block *sb)
 	 * Now that we are completely done shutting down the
 	 * superblock, we need to actually destroy the kobject.
 	 */
-	unlock_kernel();
 	unlock_super(sb);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
@@ -813,8 +875,10 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(ext4_inode_cachep);
 }
 
-static void ext4_clear_inode(struct inode *inode)
+void ext4_clear_inode(struct inode *inode)
 {
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 	dquot_drop(inode);
 	ext4_discard_preallocations(inode);
 	if (EXT4_JOURNAL(inode))
@@ -946,14 +1010,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",journal_async_commit");
 	else if (test_opt(sb, JOURNAL_CHECKSUM))
 		seq_puts(seq, ",journal_checksum");
-	if (test_opt(sb, NOBH))
-		seq_puts(seq, ",nobh");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
-	if (!test_opt(sb, DELALLOC))
+	if (!test_opt(sb, DELALLOC) &&
+	    !(def_mount_opts & EXT4_DEFM_NODELALLOC))
 		seq_puts(seq, ",nodelalloc");
 
-
 	if (sbi->s_stripe)
 		seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
 	/*
@@ -977,7 +1039,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NO_AUTO_DA_ALLOC))
 		seq_puts(seq, ",noauto_da_alloc");
 
-	if (test_opt(sb, DISCARD))
+	if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
 		seq_puts(seq, ",discard");
 
 	if (test_opt(sb, NOLOAD))
@@ -986,6 +1048,16 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, DIOREAD_NOLOCK))
 		seq_puts(seq, ",dioread_nolock");
 
+	if (test_opt(sb, BLOCK_VALIDITY) &&
+	    !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
+		seq_puts(seq, ",block_validity");
+
+	if (!test_opt(sb, INIT_INODE_TABLE))
+		seq_puts(seq, ",noinit_inode_table");
+	else if (sbi->s_li_wait_mult)
+		seq_printf(seq, ",init_inode_table=%u",
+			   (unsigned) sbi->s_li_wait_mult);
+
 	ext4_show_quota_options(seq, sb);
 
 	return 0;
@@ -1065,6 +1137,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 			 char *path);
+static int ext4_quota_off(struct super_block *sb, int type);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
@@ -1086,7 +1159,7 @@ static const struct dquot_operations ext4_quota_operations = {
 
 static const struct quotactl_ops ext4_qctl_operations = {
 	.quota_on	= ext4_quota_on,
-	.quota_off	= dquot_quota_off,
+	.quota_off	= ext4_quota_off,
 	.quota_sync	= dquot_quota_sync,
 	.get_info	= dquot_get_dqinfo,
 	.set_info	= dquot_set_dqinfo,
@@ -1100,20 +1173,20 @@ static const struct super_operations ext4_sops = {
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
-	.delete_inode	= ext4_delete_inode,
+	.evict_inode	= ext4_evict_inode,
 	.put_super	= ext4_put_super,
 	.sync_fs	= ext4_sync_fs,
 	.freeze_fs	= ext4_freeze,
 	.unfreeze_fs	= ext4_unfreeze,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
-	.clear_inode	= ext4_clear_inode,
 	.show_options	= ext4_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext4_quota_read,
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
@@ -1121,12 +1194,11 @@ static const struct super_operations ext4_nojournal_sops = {
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
-	.delete_inode	= ext4_delete_inode,
+	.evict_inode	= ext4_evict_inode,
 	.write_super	= ext4_write_super,
 	.put_super	= ext4_put_super,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
-	.clear_inode	= ext4_clear_inode,
 	.show_options	= ext4_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext4_quota_read,
@@ -1161,6 +1233,7 @@ enum {
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard,
+	Opt_init_inode_table, Opt_noinit_inode_table,
 };
 
 static const match_table_t tokens = {
@@ -1231,6 +1304,9 @@ static const match_table_t tokens = {
 	{Opt_dioread_lock, "dioread_lock"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
+	{Opt_init_inode_table, "init_itable=%u"},
+	{Opt_init_inode_table, "init_itable"},
+	{Opt_noinit_inode_table, "noinit_itable"},
 	{Opt_err, NULL},
 };
 
@@ -1624,10 +1700,12 @@ set_qf_format:
 		*n_blocks_count = option;
 		break;
 	case Opt_nobh:
-		set_opt(sbi->s_mount_opt, NOBH);
+		ext4_msg(sb, KERN_WARNING,
+			 "Ignoring deprecated nobh option");
 		break;
 	case Opt_bh:
-		clear_opt(sbi->s_mount_opt, NOBH);
+		ext4_msg(sb, KERN_WARNING,
+			 "Ignoring deprecated bh option");
 		break;
 	case Opt_i_version:
 		set_opt(sbi->s_mount_opt, I_VERSION);
@@ -1699,6 +1777,20 @@ set_qf_format:
 	case Opt_dioread_lock:
 		clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
 		break;
+	case Opt_init_inode_table:
+		set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+		if (args[0].from) {
+			if (match_int(&args[0], &option))
+				return 0;
+		} else
+			option = EXT4_DEF_LI_WAIT_MULT;
+		if (option < 0)
+			return 0;
+		sbi->s_li_wait_mult = option;
+		break;
+	case Opt_noinit_inode_table:
+		clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+		break;
 	default:
 		ext4_msg(sb, KERN_ERR,
 		       "Unrecognized mount option \"%s\" "
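init_itable is accepted both bare and with an =%u multiplier, falling back to EXT4_DEF_LI_WAIT_MULT when no value is given. A rough userspace equivalent of that parse, with sscanf() standing in for the kernel's match_int() and DEF_WAIT_MULT standing in for EXT4_DEF_LI_WAIT_MULT:

	#include <stdio.h>
	#include <string.h>

	#define DEF_WAIT_MULT 10	/* stand-in for EXT4_DEF_LI_WAIT_MULT */

	/* Returns the wait multiplier, or -1 on a malformed option. */
	static int parse_init_itable(const char *opt)
	{
		unsigned int mult;

		if (strcmp(opt, "init_itable") == 0)
			return DEF_WAIT_MULT;		/* bare form: use default */
		if (sscanf(opt, "init_itable=%u", &mult) == 1)
			return (int)mult;		/* explicit multiplier */
		return -1;
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       parse_init_itable("init_itable"),	/* 10 */
		       parse_init_itable("init_itable=20"),	/* 20 */
		       parse_init_itable("bogus"));		/* -1 */
		return 0;
	}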
@@ -1882,7 +1974,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
 }
 
 /* Called at mount-time, super-block is locked */
-static int ext4_check_descriptors(struct super_block *sb)
+static int ext4_check_descriptors(struct super_block *sb,
+				  ext4_group_t *first_not_zeroed)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1891,7 +1984,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 	ext4_fsblk_t inode_bitmap;
 	ext4_fsblk_t inode_table;
 	int flexbg_flag = 0;
-	ext4_group_t i;
+	ext4_group_t i, grp = sbi->s_groups_count;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		flexbg_flag = 1;
@@ -1907,6 +2000,10 @@ static int ext4_check_descriptors(struct super_block *sb)
 			last_block = first_block +
 				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
 
+		if ((grp == sbi->s_groups_count) &&
+		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			grp = i;
+
 		block_bitmap = ext4_block_bitmap(sb, gdp);
 		if (block_bitmap < first_block || block_bitmap > last_block) {
 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -1944,6 +2041,8 @@ static int ext4_check_descriptors(struct super_block *sb)
 		if (!flexbg_flag)
 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
 	}
+	if (NULL != first_not_zeroed)
+		*first_not_zeroed = grp;
 
 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
 	sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2249,6 +2348,8 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a,
 {
 	struct super_block *sb = sbi->s_buddy_cache->i_sb;
 
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
 	return snprintf(buf, PAGE_SIZE, "%lu\n",
 			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			 sbi->s_sectors_written_start) >> 1);
@@ -2259,6 +2360,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
 {
 	struct super_block *sb = sbi->s_buddy_cache->i_sb;
 
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
 			(unsigned long long)(sbi->s_kbytes_written +
 			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -2312,6 +2415,7 @@ static struct ext4_attr ext4_attr_##_name = { \
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
 #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
 #define EXT4_RW_ATTR_SBI_UI(name, elname) \
@@ -2348,6 +2452,16 @@ static struct attribute *ext4_attrs[] = {
 	NULL,
 };
 
+/* Features this copy of ext4 supports */
+EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
+
+static struct attribute *ext4_feat_attrs[] = {
+	ATTR_LIST(lazy_itable_init),
+	ATTR_LIST(batched_discard),
+	NULL,
+};
+
 static ssize_t ext4_attr_show(struct kobject *kobj,
 			      struct attribute *attr, char *buf)
 {
@@ -2376,7 +2490,6 @@ static void ext4_sb_release(struct kobject *kobj)
 	complete(&sbi->s_kobj_unregister);
 }
 
-
 static const struct sysfs_ops ext4_attr_ops = {
 	.show	= ext4_attr_show,
 	.store	= ext4_attr_store,
@@ -2388,6 +2501,17 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };
 
+static void ext4_feat_release(struct kobject *kobj)
+{
+	complete(&ext4_feat->f_kobj_unregister);
+}
+
+static struct kobj_type ext4_feat_ktype = {
+	.default_attrs	= ext4_feat_attrs,
+	.sysfs_ops	= &ext4_attr_ops,
+	.release	= ext4_feat_release,
+};
+
 /*
  * Check whether this filesystem can be mounted based on
  * the features present and the RDONLY/RDWR mount requested.
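The EXT4_INFO_ATTR entries have no show or store methods; the mere existence of a file under /sys/fs/ext4/features advertises that the running driver supports the feature. A sketch of how userspace might probe for it once this kobject is registered (the path comes from this patch and assumes a kernel that carries it):

	#include <stdio.h>
	#include <unistd.h>

	/* The file's existence, not its contents, is the advertisement. */
	static int ext4_has_feature(const char *name)
	{
		char path[128];

		snprintf(path, sizeof(path), "/sys/fs/ext4/features/%s", name);
		return access(path, F_OK) == 0;
	}

	int main(void)
	{
		printf("lazy_itable_init: %s\n",
		       ext4_has_feature("lazy_itable_init") ? "yes" : "no");
		printf("batched_discard:  %s\n",
		       ext4_has_feature("batched_discard") ? "yes" : "no");
		return 0;
	}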
@@ -2431,6 +2555,419 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 	return 1;
 }
 
+/*
+ * This function is called once a day if we have errors logged
+ * on the file system
+ */
+static void print_daily_error_info(unsigned long arg)
+{
+	struct super_block *sb = (struct super_block *) arg;
+	struct ext4_sb_info *sbi;
+	struct ext4_super_block *es;
+
+	sbi = EXT4_SB(sb);
+	es = sbi->s_es;
+
+	if (es->s_error_count)
+		ext4_msg(sb, KERN_NOTICE, "error count: %u",
+			 le32_to_cpu(es->s_error_count));
+	if (es->s_first_error_time) {
+		printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
+		       sb->s_id, le32_to_cpu(es->s_first_error_time),
+		       (int) sizeof(es->s_first_error_func),
+		       es->s_first_error_func,
+		       le32_to_cpu(es->s_first_error_line));
+		if (es->s_first_error_ino)
+			printk(": inode %u",
+			       le32_to_cpu(es->s_first_error_ino));
+		if (es->s_first_error_block)
+			printk(": block %llu", (unsigned long long)
+			       le64_to_cpu(es->s_first_error_block));
+		printk("\n");
+	}
+	if (es->s_last_error_time) {
+		printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
+		       sb->s_id, le32_to_cpu(es->s_last_error_time),
+		       (int) sizeof(es->s_last_error_func),
+		       es->s_last_error_func,
+		       le32_to_cpu(es->s_last_error_line));
+		if (es->s_last_error_ino)
+			printk(": inode %u",
+			       le32_to_cpu(es->s_last_error_ino));
+		if (es->s_last_error_block)
+			printk(": block %llu", (unsigned long long)
+			       le64_to_cpu(es->s_last_error_block));
+		printk("\n");
+	}
+	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
+}
+
+static void ext4_lazyinode_timeout(unsigned long data)
+{
+	struct task_struct *p = (struct task_struct *)data;
+	wake_up_process(p);
+}
+
+/* Find next suitable group and run ext4_init_inode_table */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_group_desc *gdp = NULL;
+	ext4_group_t group, ngroups;
+	struct super_block *sb;
+	unsigned long timeout = 0;
+	int ret = 0;
+
+	sb = elr->lr_super;
+	ngroups = EXT4_SB(sb)->s_groups_count;
+
+	for (group = elr->lr_next_group; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp) {
+			ret = 1;
+			break;
+		}
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	if (group == ngroups)
+		ret = 1;
+
+	if (!ret) {
+		timeout = jiffies;
+		ret = ext4_init_inode_table(sb, group,
+					    elr->lr_timeout ? 0 : 1);
+		if (elr->lr_timeout == 0) {
+			timeout = jiffies - timeout;
+			if (elr->lr_sbi->s_li_wait_mult)
+				timeout *= elr->lr_sbi->s_li_wait_mult;
+			else
+				timeout *= 20;
+			elr->lr_timeout = timeout;
+		}
+		elr->lr_next_sched = jiffies + elr->lr_timeout;
+		elr->lr_next_group = group + 1;
+	}
+
+	return ret;
+}
+
+/*
+ * Remove lr_request from the list_request and free the
+ * request structure. Should be called with li_list_mtx held
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_sb_info *sbi;
+
+	if (!elr)
+		return;
+
+	sbi = elr->lr_sbi;
+
+	list_del(&elr->lr_request);
+	sbi->s_li_request = NULL;
+	kfree(elr);
+}
+
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+	struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
+
+	if (!ext4_li_info)
+		return;
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	ext4_remove_li_request(elr);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+/*
+ * This is the function where the ext4lazyinit thread lives. It walks
+ * through the request list searching for the next scheduled filesystem.
+ * When such a fs is found, run the lazy initialization request
+ * (ext4_run_li_request) and keep track of the time spent in this
+ * function. Based on that time we compute the next schedule time of
+ * the request. When walking through the list is complete, compute
+ * the next waking time and put the thread to sleep.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+	unsigned long next_wakeup;
+	DEFINE_WAIT(wait);
+	int ret;
+
+	BUG_ON(NULL == eli);
+
+	eli->li_timer.data = (unsigned long)current;
+	eli->li_timer.function = ext4_lazyinode_timeout;
+
+	eli->li_task = current;
+	wake_up(&eli->li_wait_task);
+
+cont_thread:
+	while (true) {
+		next_wakeup = MAX_JIFFY_OFFSET;
+
+		mutex_lock(&eli->li_list_mtx);
+		if (list_empty(&eli->li_request_list)) {
+			mutex_unlock(&eli->li_list_mtx);
+			goto exit_thread;
+		}
+
+		list_for_each_safe(pos, n, &eli->li_request_list) {
+			elr = list_entry(pos, struct ext4_li_request,
+					 lr_request);
+
+			if (time_after_eq(jiffies, elr->lr_next_sched))
+				ret = ext4_run_li_request(elr);
+
+			if (ret) {
+				ret = 0;
+				ext4_remove_li_request(elr);
+				continue;
+			}
+
+			if (time_before(elr->lr_next_sched, next_wakeup))
+				next_wakeup = elr->lr_next_sched;
+		}
+		mutex_unlock(&eli->li_list_mtx);
+
+		if (freezing(current))
+			refrigerator();
+
+		if (time_after_eq(jiffies, next_wakeup)) {
+			cond_resched();
+			continue;
+		}
+
+		eli->li_timer.expires = next_wakeup;
+		add_timer(&eli->li_timer);
+		prepare_to_wait(&eli->li_wait_daemon, &wait,
+				TASK_INTERRUPTIBLE);
+		if (time_before(jiffies, next_wakeup))
+			schedule();
+		finish_wait(&eli->li_wait_daemon, &wait);
+	}
+
+exit_thread:
+	/*
+	 * It looks like the request list is empty, but we need
+	 * to check it under the li_list_mtx lock, to prevent any
+	 * additions into it, and of course we should lock ext4_li_mtx
+	 * to atomically free the list and ext4_li_info, because at
+	 * this point another ext4 filesystem could be registering
+	 * a new one.
+	 */
+	mutex_lock(&ext4_li_mtx);
+	mutex_lock(&eli->li_list_mtx);
+	if (!list_empty(&eli->li_request_list)) {
+		mutex_unlock(&eli->li_list_mtx);
+		mutex_unlock(&ext4_li_mtx);
+		goto cont_thread;
+	}
+	mutex_unlock(&eli->li_list_mtx);
+	del_timer_sync(&ext4_li_info->li_timer);
+	eli->li_task = NULL;
+	wake_up(&eli->li_wait_task);
+
+	kfree(ext4_li_info);
+	ext4_li_info = NULL;
+	mutex_unlock(&ext4_li_mtx);
+
+	return 0;
+}
+
+static void ext4_clear_request_list(void)
+{
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	if (list_empty(&ext4_li_info->li_request_list))
+		return;
+
+	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+		elr = list_entry(pos, struct ext4_li_request,
+				 lr_request);
+		ext4_remove_li_request(elr);
+	}
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+static int ext4_run_lazyinit_thread(void)
+{
+	struct task_struct *t;
+
+	t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+	if (IS_ERR(t)) {
+		int err = PTR_ERR(t);
+		ext4_clear_request_list();
+		del_timer_sync(&ext4_li_info->li_timer);
+		kfree(ext4_li_info);
+		ext4_li_info = NULL;
+		printk(KERN_CRIT "EXT4: error %d creating inode table "
+				 "initialization thread\n",
+				 err);
+		return err;
+	}
+	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+
+	wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
+	return 0;
+}
+
+/*
+ * Check whether it makes sense to run the itable init. thread or not.
+ * If there is at least one uninitialized inode table, return the
+ * corresponding group number; otherwise the loop goes through all
+ * groups and returns the total number of groups.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
+	struct ext4_group_desc *gdp = NULL;
+
+	for (group = 0; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp)
+			continue;
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	return group;
+}
+
+static int ext4_li_info_new(void)
+{
+	struct ext4_lazy_init *eli = NULL;
+
+	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
+	if (!eli)
+		return -ENOMEM;
+
+	eli->li_task = NULL;
+	INIT_LIST_HEAD(&eli->li_request_list);
+	mutex_init(&eli->li_list_mtx);
+
+	init_waitqueue_head(&eli->li_wait_daemon);
+	init_waitqueue_head(&eli->li_wait_task);
+	init_timer(&eli->li_timer);
+	eli->li_state |= EXT4_LAZYINIT_QUIT;
+
+	ext4_li_info = eli;
+
+	return 0;
+}
+
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+					    ext4_group_t start)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr;
+	unsigned long rnd;
+
+	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+	if (!elr)
+		return NULL;
+
+	elr->lr_super = sb;
+	elr->lr_sbi = sbi;
+	elr->lr_next_group = start;
+
+	/*
+	 * Randomize first schedule time of the request to
+	 * spread the inode table initialization requests
+	 * better.
+	 */
+	get_random_bytes(&rnd, sizeof(rnd));
+	elr->lr_next_sched = jiffies + (unsigned long)rnd %
+			     (EXT4_DEF_LI_MAX_START_DELAY * HZ);
+
+	return elr;
+}
+
+static int ext4_register_li_request(struct super_block *sb,
+				    ext4_group_t first_not_zeroed)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr;
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	int ret;
+
+	if (sbi->s_li_request != NULL)
+		return 0;
+
+	if (first_not_zeroed == ngroups ||
+	    (sb->s_flags & MS_RDONLY) ||
+	    !test_opt(sb, INIT_INODE_TABLE)) {
+		sbi->s_li_request = NULL;
+		return 0;
+	}
+
+	if (first_not_zeroed == ngroups) {
+		sbi->s_li_request = NULL;
+		return 0;
+	}
+
+	elr = ext4_li_request_new(sb, first_not_zeroed);
+	if (!elr)
+		return -ENOMEM;
+
+	mutex_lock(&ext4_li_mtx);
+
+	if (NULL == ext4_li_info) {
+		ret = ext4_li_info_new();
+		if (ret)
+			goto out;
+	}
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+
+	sbi->s_li_request = elr;
+
+	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+		ret = ext4_run_lazyinit_thread();
+		if (ret)
+			goto out;
+	}
+out:
+	mutex_unlock(&ext4_li_mtx);
+	if (ret)
+		kfree(elr);
+	return ret;
+}
+
+/*
+ * We do not need to lock anything since this is called on
+ * module unload.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+	/*
+	 * If the thread exited earlier
+	 * there's nothing to be done.
+	 */
+	if (!ext4_li_info)
+		return;
+
+	ext4_clear_request_list();
+
+	while (ext4_li_info->li_task) {
+		wake_up(&ext4_li_info->li_wait_daemon);
+		wait_event(ext4_li_info->li_wait_task,
+			   ext4_li_info->li_task == NULL);
+	}
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	__releases(kernel_lock)
 	__acquires(kernel_lock)
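Two scheduling decisions in the block above deserve a closer look: the first run of each request is pushed out by a random delay so that filesystems mounted together do not all start zeroing at once, and every later run is rescheduled elapsed * s_li_wait_mult after the previous one, keeping the thread to a small fraction of the disk's time. A userspace sketch of both calculations, with milliseconds in place of jiffies and rand() in place of get_random_bytes() (the two constants are loose stand-ins for EXT4_DEF_LI_MAX_START_DELAY and EXT4_DEF_LI_WAIT_MULT):

	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>

	#define MAX_START_DELAY_MS (300 * 1000)	/* loose stand-in, 5 minutes */
	#define WAIT_MULT 10			/* loose stand-in for s_li_wait_mult */

	int main(void)
	{
		unsigned long first_delay, elapsed, next_sleep;

		srand((unsigned)time(NULL));

		/* Randomized first schedule: spread competing mounts apart. */
		first_delay = (unsigned long)rand() % MAX_START_DELAY_MS;

		/* Adaptive pacing: if zeroing one group took `elapsed`,
		 * sleep WAIT_MULT times that long before the next group. */
		elapsed = 120;	/* ms, pretend measurement */
		next_sleep = elapsed * WAIT_MULT;

		printf("first run in %lu ms, then one group every %lu ms\n",
		       first_delay, next_sleep);
		return 0;
	}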
@@ -2448,7 +2985,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root;
 	char *cp;
 	const char *descr;
-	int ret = -EINVAL;
+	int ret = -ENOMEM;
 	int blocksize;
 	unsigned int db_count;
 	unsigned int i;
@@ -2456,16 +2993,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	__u64 blocks_count;
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+	ext4_group_t first_not_zeroed;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
-		return -ENOMEM;
+		goto out_free_orig;
 
 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
 	if (!sbi->s_blockgroup_lock) {
 		kfree(sbi);
-		return -ENOMEM;
+		goto out_free_orig;
 	}
 	sb->s_fs_info = sbi;
 	sbi->s_mount_opt = 0;
@@ -2473,15 +3011,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_resgid = EXT4_DEF_RESGID;
 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
-	sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
-						      sectors[1]);
-
-	unlock_kernel();
+	if (sb->s_bdev->bd_part)
+		sbi->s_sectors_written_start =
+			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
 
 	/* Cleanup superblock name */
 	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
 		*cp = '!';
 
+	ret = -EINVAL;
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
@@ -2516,6 +3054,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* Set defaults before we parse the mount options */
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+	set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
 	if (def_mount_opts & EXT4_DEFM_DEBUG)
 		set_opt(sbi->s_mount_opt, DEBUG);
 	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
@@ -2546,6 +3085,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, ERRORS_CONT);
 	else
 		set_opt(sbi->s_mount_opt, ERRORS_RO);
+	if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
+		set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+	if (def_mount_opts & EXT4_DEFM_DISCARD)
+		set_opt(sbi->s_mount_opt, DISCARD);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2553,15 +3096,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
-	set_opt(sbi->s_mount_opt, BARRIER);
+	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
+		set_opt(sbi->s_mount_opt, BARRIER);
 
 	/*
 	 * enable delayed allocation by default
 	 * Use -o nodelalloc to turn it off
 	 */
-	if (!IS_EXT3_SB(sb))
+	if (!IS_EXT3_SB(sb) &&
+	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
 		set_opt(sbi->s_mount_opt, DELALLOC);
 
+	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
+			   &journal_devnum, &journal_ioprio, NULL, 0)) {
+		ext4_msg(sb, KERN_WARNING,
+			 "failed to parse options in superblock: %s",
+			 sbi->s_es->s_mount_opts);
+	}
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
@@ -2706,15 +3257,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * Test whether we have more sectors than will fit in sector_t,
 	 * and whether the max offset is addressable by the page cache.
 	 */
-	if ((ext4_blocks_count(es) >
-	     (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
-	    (ext4_blocks_count(es) >
-	     (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
+	ret = generic_check_addressable(sb->s_blocksize_bits,
+					ext4_blocks_count(es));
+	if (ret) {
 		ext4_msg(sb, KERN_ERR, "filesystem"
 			" too large to mount safely on this system");
 		if (sizeof(sector_t) < 8)
 			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
-		ret = -EFBIG;
 		goto failed_mount;
 	}
 
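generic_check_addressable() packages the two limits the removed open-coded test checked: the last block must be reachable through sector_t when counted in 512-byte sectors, and the last page of the filesystem must fit in pgoff_t. A standalone illustration of the arithmetic, with the index widths passed in explicitly and 4 KiB pages assumed (the real helper takes them from the kernel configuration):

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SHIFT 12			/* assume 4 KiB pages */

	/* Same two tests, with index widths as explicit parameters. */
	static int check_addressable(unsigned blocksize_bits, uint64_t num_blocks,
				     unsigned sector_bits, unsigned pgoff_bits)
	{
		uint64_t last_block = num_blocks - 1;
		uint64_t last_page = last_block >> (PAGE_SHIFT - blocksize_bits);
		uint64_t sector_max = sector_bits < 64 ?
				      (UINT64_MAX >> (64 - sector_bits)) : UINT64_MAX;
		uint64_t pgoff_max = pgoff_bits < 64 ?
				     (UINT64_MAX >> (64 - pgoff_bits)) : UINT64_MAX;

		if (last_block > (sector_max >> (blocksize_bits - 9)))
			return -1;	/* too big for the block layer */
		if (last_page > pgoff_max)
			return -1;	/* too big for the page cache */
		return 0;
	}

	int main(void)
	{
		/* 4 KiB blocks with a 32-bit sector_t: limit is 2^29 blocks */
		printf("%d\n", check_addressable(12, 1ULL << 29, 32, 32)); /* fits */
		printf("%d\n", check_addressable(12, 1ULL << 30, 32, 32)); /* -1 */
		return 0;
	}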
@@ -2783,7 +3332,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			goto failed_mount2;
 		}
 	}
-	if (!ext4_check_descriptors(sb)) {
+	if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
 		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
 		goto failed_mount2;
 	}
@@ -2912,18 +3461,7 @@ no_journal:
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount_wq;
 	}
-	if (test_opt(sb, NOBH)) {
-		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
-			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
-				"its supported only with writeback mode");
-			clear_opt(sbi->s_mount_opt, NOBH);
-		}
-		if (test_opt(sb, DIOREAD_NOLOCK)) {
-			ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
-				"not supported with nobh mode");
-			goto failed_mount_wq;
-		}
-	}
+
 	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
 		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3010,11 +3548,15 @@ no_journal:
 	ext4_ext_init(sb);
 	err = ext4_mb_init(sb, needs_recovery);
 	if (err) {
-		ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
 			 err);
 		goto failed_mount4;
 	}
 
+	err = ext4_register_li_request(sb, first_not_zeroed);
+	if (err)
+		goto failed_mount4;
+
 	sbi->s_kobj.kset = ext4_kset;
 	init_completion(&sbi->s_kobj_unregister);
 	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3043,9 +3585,15 @@ no_journal:
 		descr = "out journal";
 
 	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
-		 "Opts: %s", descr, orig_data);
+		 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+		 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+
+	init_timer(&sbi->s_err_report);
+	sbi->s_err_report.function = print_daily_error_info;
+	sbi->s_err_report.data = (unsigned long) sb;
+	if (es->s_error_count)
+		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
 
-	lock_kernel();
 	kfree(orig_data);
 	return 0;
 
@@ -3092,7 +3640,7 @@ out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	lock_kernel();
+out_free_orig:
 	kfree(orig_data);
 	return ret;
 }
@@ -3110,7 +3658,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 	journal->j_min_batch_time = sbi->s_min_batch_time;
 	journal->j_max_batch_time = sbi->s_max_batch_time;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
 		journal->j_flags |= JBD2_BARRIER;
 	else
@@ -3119,7 +3667,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
 	else
 		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 static journal_t *ext4_get_journal(struct super_block *sb,
@@ -3327,8 +3875,17 @@ static int ext4_load_journal(struct super_block *sb,
 
 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
 		err = jbd2_journal_wipe(journal, !really_read_only);
-	if (!err)
+	if (!err) {
+		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
+		if (save)
+			memcpy(save, ((char *) es) +
+			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
 		err = jbd2_journal_load(journal);
+		if (save)
+			memcpy(((char *) es) + EXT4_S_ERR_START,
+			       save, EXT4_S_ERR_LEN);
+		kfree(save);
+	}
 
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "error loading journal");
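Journal replay can write an older copy of the superblock over the in-memory one, which would discard the error statistics introduced earlier in this patch; the hunk above snapshots the EXT4_S_ERR_START..EXT4_S_ERR_LEN byte range around jbd2_journal_load() and copies it back afterwards. The same save-around-an-operation idiom in plain C (the offsets and the clobbering step are invented for the demo):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define ERR_START 8	/* invented offset of the error region */
	#define ERR_LEN   4	/* invented length of the error region */

	/* Stand-in for journal replay: rewrites the whole "superblock". */
	static void replay(char *sb, size_t len)
	{
		memset(sb, 'O', len);	/* "old" on-disk contents come back */
	}

	int main(void)
	{
		char sb[16];
		char *save;

		memset(sb, 'N', sizeof(sb));		/* "new" in-memory state */
		memcpy(sb + ERR_START, "ERRS", ERR_LEN);/* error stats to keep */

		save = malloc(ERR_LEN);
		if (save)
			memcpy(save, sb + ERR_START, ERR_LEN);	/* snapshot */
		replay(sb, sizeof(sb));				/* clobbers sb */
		if (save)
			memcpy(sb + ERR_START, save, ERR_LEN);	/* restore */
		free(save);

		printf("%.4s\n", sb + ERR_START);	/* prints ERRS */
		return 0;
	}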
@@ -3339,7 +3896,7 @@ static int ext4_load_journal(struct super_block *sb,
 	EXT4_SB(sb)->s_journal = journal;
 	ext4_clear_journal_err(sb, es);
 
-	if (journal_devnum &&
+	if (!really_read_only && journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
 
@@ -3384,13 +3941,20 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	 */
 	if (!(sb->s_flags & MS_RDONLY))
 		es->s_wtime = cpu_to_le32(get_seconds());
-	es->s_kbytes_written =
-		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+	if (sb->s_bdev->bd_part)
+		es->s_kbytes_written =
+			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
-	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+	else
+		es->s_kbytes_written =
+			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
+	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
+		ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeblocks_counter));
-	es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+		es->s_free_inodes_count =
+			cpu_to_le32(percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeinodes_counter));
 	sb->s_dirt = 0;
 	BUFFER_TRACE(sbh, "marking dirty");
@@ -3491,7 +4055,7 @@ int ext4_force_commit(struct super_block *sb)
 
 	journal = EXT4_SB(sb)->s_journal;
 	if (journal) {
-		vfs_check_frozen(sb, SB_FREEZE_WRITE);
+		vfs_check_frozen(sb, SB_FREEZE_TRANS);
 		ret = ext4_journal_force_commit(journal);
 	}
 
@@ -3587,8 +4151,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 	char *orig_data = kstrdup(data, GFP_KERNEL);
 
-	lock_kernel();
-
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -3616,7 +4178,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
-		ext4_abort(sb, __func__, "Abort forced by user");
+		ext4_abort(sb, "Abort forced by user");
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3711,6 +4273,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			enable_quota = 1;
 		}
 	}
+
+	/*
+	 * Reinitialize lazy itable initialization thread based on
+	 * current settings
+	 */
+	if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+		ext4_unregister_li_request(sb);
+	else {
+		ext4_group_t first_not_zeroed;
+		first_not_zeroed = ext4_has_uninit_itable(sb);
+		ext4_register_li_request(sb, first_not_zeroed);
+	}
+
 	ext4_setup_system_zone(sb);
 	if (sbi->s_journal == NULL)
 		ext4_commit_super(sb, 1);
@@ -3723,7 +4298,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 	if (enable_quota)
 		dquot_resume(sb, -1);
 
@@ -3749,7 +4323,6 @@ restore_opts:
 	}
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 	kfree(orig_data);
 	return err;
 }
@@ -3981,6 +4554,18 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 	return err;
 }
 
+static int ext4_quota_off(struct super_block *sb, int type)
+{
+	/* Force all delayed allocation blocks to be allocated */
+	if (test_opt(sb, DELALLOC)) {
+		down_read(&sb->s_umount);
+		sync_filesystem(sb);
+		up_read(&sb->s_umount);
+	}
+
+	return dquot_quota_off(sb, type);
+}
+
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
  * acquiring the locks... As quota files are never truncated and quota code
  * itself serializes the operations (and noone else should touch the files)
@@ -4030,7 +4615,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
4030 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 4615 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
4031 int err = 0; 4616 int err = 0;
4032 int offset = off & (sb->s_blocksize - 1); 4617 int offset = off & (sb->s_blocksize - 1);
4033 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
4034 struct buffer_head *bh; 4618 struct buffer_head *bh;
4035 handle_t *handle = journal_current_handle(); 4619 handle_t *handle = journal_current_handle();
4036 4620
@@ -4055,24 +4639,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
4055 bh = ext4_bread(handle, inode, blk, 1, &err); 4639 bh = ext4_bread(handle, inode, blk, 1, &err);
4056 if (!bh) 4640 if (!bh)
4057 goto out; 4641 goto out;
4058 if (journal_quota) { 4642 err = ext4_journal_get_write_access(handle, bh);
4059 err = ext4_journal_get_write_access(handle, bh); 4643 if (err) {
4060 if (err) { 4644 brelse(bh);
4061 brelse(bh); 4645 goto out;
4062 goto out;
4063 }
4064 } 4646 }
4065 lock_buffer(bh); 4647 lock_buffer(bh);
4066 memcpy(bh->b_data+offset, data, len); 4648 memcpy(bh->b_data+offset, data, len);
4067 flush_dcache_page(bh->b_page); 4649 flush_dcache_page(bh->b_page);
4068 unlock_buffer(bh); 4650 unlock_buffer(bh);
4069 if (journal_quota) 4651 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4070 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4071 else {
4072 /* Always do at least ordered writes for quotas */
4073 err = ext4_jbd2_file_inode(handle, inode);
4074 mark_buffer_dirty(bh);
4075 }
4076 brelse(bh); 4652 brelse(bh);
4077out: 4653out:
4078 if (err) { 4654 if (err) {
@@ -4091,17 +4667,17 @@ out:
4091 4667
4092#endif 4668#endif
4093 4669
4094static int ext4_get_sb(struct file_system_type *fs_type, int flags, 4670static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4095 const char *dev_name, void *data, struct vfsmount *mnt) 4671 const char *dev_name, void *data)
4096{ 4672{
4097 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4673 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
4098} 4674}
4099 4675
4100#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4676#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4101static struct file_system_type ext2_fs_type = { 4677static struct file_system_type ext2_fs_type = {
4102 .owner = THIS_MODULE, 4678 .owner = THIS_MODULE,
4103 .name = "ext2", 4679 .name = "ext2",
4104 .get_sb = ext4_get_sb, 4680 .mount = ext4_mount,
4105 .kill_sb = kill_block_super, 4681 .kill_sb = kill_block_super,
4106 .fs_flags = FS_REQUIRES_DEV, 4682 .fs_flags = FS_REQUIRES_DEV,
4107}; 4683};
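The file_system_type hunks here and below record ext4's switch from the old ->get_sb() interface, where the filesystem filled in a caller-supplied vfsmount, to the newer ->mount() interface, which returns the root dentry and leaves mount setup to the VFS. As a minimal sketch of the new pattern for a block-device filesystem (myfs and myfs_fill_super() are hypothetical names, not part of this patch):

        static struct dentry *myfs_mount(struct file_system_type *fs_type,
                                         int flags, const char *dev_name, void *data)
        {
                /* mount_bdev() opens the device, obtains the superblock and
                 * runs the fill_super callback; it returns the root dentry
                 * or an ERR_PTR on failure. */
                return mount_bdev(fs_type, flags, dev_name, data, myfs_fill_super);
        }

        static struct file_system_type myfs_fs_type = {
                .owner    = THIS_MODULE,
                .name     = "myfs",
                .mount    = myfs_mount,
                .kill_sb  = kill_block_super,
                .fs_flags = FS_REQUIRES_DEV,
        };

The same conversion is applied to msdos, vfat and vxfs further down.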
@@ -4146,28 +4722,58 @@ static inline void unregister_as_ext3(void) { }
4146static struct file_system_type ext4_fs_type = { 4722static struct file_system_type ext4_fs_type = {
4147 .owner = THIS_MODULE, 4723 .owner = THIS_MODULE,
4148 .name = "ext4", 4724 .name = "ext4",
4149 .get_sb = ext4_get_sb, 4725 .mount = ext4_mount,
4150 .kill_sb = kill_block_super, 4726 .kill_sb = kill_block_super,
4151 .fs_flags = FS_REQUIRES_DEV, 4727 .fs_flags = FS_REQUIRES_DEV,
4152}; 4728};
4153 4729
4154static int __init init_ext4_fs(void) 4730int __init ext4_init_feat_adverts(void)
4731{
4732 struct ext4_features *ef;
4733 int ret = -ENOMEM;
4734
4735 ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
4736 if (!ef)
4737 goto out;
4738
4739 ef->f_kobj.kset = ext4_kset;
4740 init_completion(&ef->f_kobj_unregister);
4741 ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
4742 "features");
4743 if (ret) {
4744 kfree(ef);
4745 goto out;
4746 }
4747
4748 ext4_feat = ef;
4749 ret = 0;
4750out:
4751 return ret;
4752}
4753
4754static int __init ext4_init_fs(void)
4155{ 4755{
4156 int err; 4756 int err;
4157 4757
4158 ext4_check_flag_values(); 4758 ext4_check_flag_values();
4159 err = init_ext4_system_zone(); 4759 err = ext4_init_pageio();
4160 if (err) 4760 if (err)
4161 return err; 4761 return err;
4762 err = ext4_init_system_zone();
4763 if (err)
4764 goto out5;
4162 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4765 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4163 if (!ext4_kset) 4766 if (!ext4_kset)
4164 goto out4; 4767 goto out4;
4165 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4768 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4166 err = init_ext4_mballoc(); 4769
4770 err = ext4_init_feat_adverts();
4771
4772 err = ext4_init_mballoc();
4167 if (err) 4773 if (err)
4168 goto out3; 4774 goto out3;
4169 4775
4170 err = init_ext4_xattr(); 4776 err = ext4_init_xattr();
4171 if (err) 4777 if (err)
4172 goto out2; 4778 goto out2;
4173 err = init_inodecache(); 4779 err = init_inodecache();
@@ -4178,38 +4784,46 @@ static int __init init_ext4_fs(void)
4178 err = register_filesystem(&ext4_fs_type); 4784 err = register_filesystem(&ext4_fs_type);
4179 if (err) 4785 if (err)
4180 goto out; 4786 goto out;
4787
4788 ext4_li_info = NULL;
4789 mutex_init(&ext4_li_mtx);
4181 return 0; 4790 return 0;
4182out: 4791out:
4183 unregister_as_ext2(); 4792 unregister_as_ext2();
4184 unregister_as_ext3(); 4793 unregister_as_ext3();
4185 destroy_inodecache(); 4794 destroy_inodecache();
4186out1: 4795out1:
4187 exit_ext4_xattr(); 4796 ext4_exit_xattr();
4188out2: 4797out2:
4189 exit_ext4_mballoc(); 4798 ext4_exit_mballoc();
4190out3: 4799out3:
4800 kfree(ext4_feat);
4191 remove_proc_entry("fs/ext4", NULL); 4801 remove_proc_entry("fs/ext4", NULL);
4192 kset_unregister(ext4_kset); 4802 kset_unregister(ext4_kset);
4193out4: 4803out4:
4194 exit_ext4_system_zone(); 4804 ext4_exit_system_zone();
4805out5:
4806 ext4_exit_pageio();
4195 return err; 4807 return err;
4196} 4808}
4197 4809
4198static void __exit exit_ext4_fs(void) 4810static void __exit ext4_exit_fs(void)
4199{ 4811{
4812 ext4_destroy_lazyinit_thread();
4200 unregister_as_ext2(); 4813 unregister_as_ext2();
4201 unregister_as_ext3(); 4814 unregister_as_ext3();
4202 unregister_filesystem(&ext4_fs_type); 4815 unregister_filesystem(&ext4_fs_type);
4203 destroy_inodecache(); 4816 destroy_inodecache();
4204 exit_ext4_xattr(); 4817 ext4_exit_xattr();
4205 exit_ext4_mballoc(); 4818 ext4_exit_mballoc();
4206 remove_proc_entry("fs/ext4", NULL); 4819 remove_proc_entry("fs/ext4", NULL);
4207 kset_unregister(ext4_kset); 4820 kset_unregister(ext4_kset);
4208 exit_ext4_system_zone(); 4821 ext4_exit_system_zone();
4822 ext4_exit_pageio();
4209} 4823}
4210 4824
4211MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 4825MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
4212MODULE_DESCRIPTION("Fourth Extended Filesystem"); 4826MODULE_DESCRIPTION("Fourth Extended Filesystem");
4213MODULE_LICENSE("GPL"); 4827MODULE_LICENSE("GPL");
4214module_init(init_ext4_fs) 4828module_init(ext4_init_fs)
4215module_exit(exit_ext4_fs) 4829module_exit(ext4_exit_fs)
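The renamed ext4_init_fs() gains two setup stages (page-io and the feature adverts) and the error unwind grows matching out5/out4 labels; the invariant is that each label undoes exactly the stages that had succeeded, in reverse order of initialization. A minimal sketch of the idiom, with hypothetical stage names:

        static int __init myfs_init(void)
        {
                int err;

                err = stage_a_init();
                if (err)
                        return err;     /* nothing to undo yet */
                err = stage_b_init();
                if (err)
                        goto out_a;     /* undo A only */
                err = stage_c_init();
                if (err)
                        goto out_b;     /* undo B, then fall through to A */
                return 0;
        out_b:
                stage_b_exit();
        out_a:
                stage_a_exit();
                return err;
        }

The module exit path (ext4_exit_fs() above) then runs the same teardown calls unconditionally, again in reverse order of setup.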
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 04338009793..fa4b899da4b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -458,8 +458,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
458 458
459 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { 459 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
460 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); 460 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
461 sb->s_dirt = 1; 461 ext4_handle_dirty_super(handle, sb);
462 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
463 } 462 }
464} 463}
465 464
@@ -1418,7 +1417,7 @@ ext4_xattr_cache_insert(struct buffer_head *bh)
1418 ea_bdebug(bh, "out of memory"); 1417 ea_bdebug(bh, "out of memory");
1419 return; 1418 return;
1420 } 1419 }
1421 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); 1420 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1422 if (error) { 1421 if (error) {
1423 mb_cache_entry_free(ce); 1422 mb_cache_entry_free(ce);
1424 if (error == -EBUSY) { 1423 if (error == -EBUSY) {
@@ -1490,8 +1489,8 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1490 return NULL; /* never share */ 1489 return NULL; /* never share */
1491 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1490 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1492again: 1491again:
1493 ce = mb_cache_entry_find_first(ext4_xattr_cache, 0, 1492 ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
1494 inode->i_sb->s_bdev, hash); 1493 hash);
1495 while (ce) { 1494 while (ce) {
1496 struct buffer_head *bh; 1495 struct buffer_head *bh;
1497 1496
@@ -1515,7 +1514,7 @@ again:
1515 return bh; 1514 return bh;
1516 } 1515 }
1517 brelse(bh); 1516 brelse(bh);
1518 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); 1517 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
1519 } 1518 }
1520 return NULL; 1519 return NULL;
1521} 1520}
@@ -1589,18 +1588,16 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1589#undef BLOCK_HASH_SHIFT 1588#undef BLOCK_HASH_SHIFT
1590 1589
1591int __init 1590int __init
1592init_ext4_xattr(void) 1591ext4_init_xattr(void)
1593{ 1592{
1594 ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL, 1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
1595 sizeof(struct mb_cache_entry) +
1596 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1597 if (!ext4_xattr_cache) 1594 if (!ext4_xattr_cache)
1598 return -ENOMEM; 1595 return -ENOMEM;
1599 return 0; 1596 return 0;
1600} 1597}
1601 1598
1602void 1599void
1603exit_ext4_xattr(void) 1600ext4_exit_xattr(void)
1604{ 1601{
1605 if (ext4_xattr_cache) 1602 if (ext4_xattr_cache)
1606 mb_cache_destroy(ext4_xattr_cache); 1603 mb_cache_destroy(ext4_xattr_cache);
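The ext4_init_xattr() hunk tracks a simplification of the mbcache API: the cache is no longer created with an explicit entry size and index count, just a name and a hash-table sizing parameter, and lookups now take the block device and hash directly instead of an index number. A sketch of the simplified usage as this file employs it (myfs_xattr_cache is a stand-in name):

        struct mb_cache *myfs_xattr_cache;
        struct mb_cache_entry *ce;

        myfs_xattr_cache = mb_cache_create("myfs_xattr", 6);
        if (!myfs_xattr_cache)
                return -ENOMEM;

        /* lookup loop: candidates sharing the same hash on this device */
        ce = mb_cache_entry_find_first(myfs_xattr_cache, bdev, hash);
        while (ce) {
                /* ... compare the candidate block, break if it matches ... */
                ce = mb_cache_entry_find_next(ce, bdev, hash);
        }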
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e4390..1ef16520b95 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
84 struct ext4_inode *raw_inode, handle_t *handle); 84 struct ext4_inode *raw_inode, handle_t *handle);
85 85
86extern int init_ext4_xattr(void); 86extern int __init ext4_init_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void ext4_exit_xattr(void);
88 88
89extern const struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
121{ 121{
122} 122}
123 123
124static inline int 124static __init inline int
125init_ext4_xattr(void) 125ext4_init_xattr(void)
126{ 126{
127 return 0; 127 return 0;
128} 128}
129 129
130static inline void 130static inline void
131exit_ext4_xattr(void) 131ext4_exit_xattr(void)
132{ 132{
133} 133}
134 134
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 27ac2572595..d75a77f85c2 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -306,7 +306,6 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
306extern const struct file_operations fat_file_operations; 306extern const struct file_operations fat_file_operations;
307extern const struct inode_operations fat_file_inode_operations; 307extern const struct inode_operations fat_file_inode_operations;
308extern int fat_setattr(struct dentry * dentry, struct iattr * attr); 308extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
309extern int fat_setsize(struct inode *inode, loff_t offset);
310extern void fat_truncate_blocks(struct inode *inode, loff_t offset); 309extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
311extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 310extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
312 struct kstat *stat); 311 struct kstat *stat);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a..b47d2c9f4fa 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,8 @@ int fat_free_clusters(struct inode *inode, int cluster)
577 577
578 sb_issue_discard(sb, 578 sb_issue_discard(sb,
579 fat_clus_to_blknr(sbi, first_cl), 579 fat_clus_to_blknr(sbi, first_cl),
580 nr_clus * sbi->sec_per_clus); 580 nr_clus * sbi->sec_per_clus,
581 GFP_NOFS, 0);
581 582
582 first_cl = cluster; 583 first_cl = cluster;
583 } 584 }
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 990dfae022e..7257752b6d5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -364,18 +364,6 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
364 return 0; 364 return 0;
365} 365}
366 366
367int fat_setsize(struct inode *inode, loff_t offset)
368{
369 int error;
370
371 error = simple_setsize(inode, offset);
372 if (error)
373 return error;
374 fat_truncate_blocks(inode, offset);
375
376 return error;
377}
378
379#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) 367#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
380/* valid file mode bits */ 368/* valid file mode bits */
381#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) 369#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -387,21 +375,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
387 unsigned int ia_valid; 375 unsigned int ia_valid;
388 int error; 376 int error;
389 377
390 /*
391 * Expand the file. Since inode_setattr() updates ->i_size
392 * before calling the ->truncate(), but FAT needs to fill the
393 * hole before it. XXX: this is no longer true with new truncate
394 * sequence.
395 */
396 if (attr->ia_valid & ATTR_SIZE) {
397 if (attr->ia_size > inode->i_size) {
398 error = fat_cont_expand(inode, attr->ia_size);
399 if (error || attr->ia_valid == ATTR_SIZE)
400 goto out;
401 attr->ia_valid &= ~ATTR_SIZE;
402 }
403 }
404
405 /* Check for setting the inode time. */ 378 /* Check for setting the inode time. */
406 ia_valid = attr->ia_valid; 379 ia_valid = attr->ia_valid;
407 if (ia_valid & TIMES_SET_FLAGS) { 380 if (ia_valid & TIMES_SET_FLAGS) {
@@ -417,6 +390,21 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
417 goto out; 390 goto out;
418 } 391 }
419 392
393 /*
394 * Expand the file. Since inode_setattr() updates ->i_size
395 * before calling the ->truncate(), but FAT needs to fill the
396 * hole before it. XXX: this is no longer true with new truncate
397 * sequence.
398 */
399 if (attr->ia_valid & ATTR_SIZE) {
400 if (attr->ia_size > inode->i_size) {
401 error = fat_cont_expand(inode, attr->ia_size);
402 if (error || attr->ia_valid == ATTR_SIZE)
403 goto out;
404 attr->ia_valid &= ~ATTR_SIZE;
405 }
406 }
407
420 if (((attr->ia_valid & ATTR_UID) && 408 if (((attr->ia_valid & ATTR_UID) &&
421 (attr->ia_uid != sbi->options.fs_uid)) || 409 (attr->ia_uid != sbi->options.fs_uid)) ||
422 ((attr->ia_valid & ATTR_GID) && 410 ((attr->ia_valid & ATTR_GID) &&
@@ -441,12 +429,11 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
441 } 429 }
442 430
443 if (attr->ia_valid & ATTR_SIZE) { 431 if (attr->ia_valid & ATTR_SIZE) {
444 error = fat_setsize(inode, attr->ia_size); 432 truncate_setsize(inode, attr->ia_size);
445 if (error) 433 fat_truncate_blocks(inode, attr->ia_size);
446 goto out;
447 } 434 }
448 435
449 generic_setattr(inode, attr); 436 setattr_copy(inode, attr);
450 mark_inode_dirty(inode); 437 mark_inode_dirty(inode);
451out: 438out:
452 return error; 439 return error;
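fat_setattr() is converted here to the new truncate sequence: the dedicated fat_setsize() wrapper around the old simple_setsize() goes away, the size change is applied with truncate_setsize(), which updates i_size and drops now-stale page cache pages, the filesystem then frees its on-disk blocks itself, and the remaining attributes are copied with setattr_copy() instead of generic_setattr(). Condensed, the new ->setattr tail looks like this (myfs_truncate_blocks() stands in for the per-filesystem block truncation):

        if (attr->ia_valid & ATTR_SIZE) {
                truncate_setsize(inode, attr->ia_size);     /* i_size + page cache */
                myfs_truncate_blocks(inode, attr->ia_size); /* on-disk blocks */
        }

        setattr_copy(inode, attr);      /* uid/gid/mode/timestamps */
        mark_inode_dirty(inode);

Neither truncate_setsize() nor setattr_copy() can fail, which is why the old error check around fat_setsize() disappears.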
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7bf45aee56d..ad6998a92c3 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -14,7 +14,6 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/seq_file.h> 17#include <linux/seq_file.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include <linux/mpage.h> 19#include <linux/mpage.h>
@@ -159,7 +158,7 @@ static int fat_write_begin(struct file *file, struct address_space *mapping,
159 int err; 158 int err;
160 159
161 *pagep = NULL; 160 *pagep = NULL;
162 err = cont_write_begin_newtrunc(file, mapping, pos, len, flags, 161 err = cont_write_begin(file, mapping, pos, len, flags,
163 pagep, fsdata, fat_get_block, 162 pagep, fsdata, fat_get_block,
164 &MSDOS_I(mapping->host)->mmu_private); 163 &MSDOS_I(mapping->host)->mmu_private);
165 if (err < 0) 164 if (err < 0)
@@ -212,8 +211,8 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
212 * FAT need to use the DIO_LOCKING for avoiding the race 211 * FAT need to use the DIO_LOCKING for avoiding the race
213 * condition of fat_get_block() and ->truncate(). 212 * condition of fat_get_block() and ->truncate().
214 */ 213 */
215 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev, 214 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
216 iov, offset, nr_segs, fat_get_block, NULL); 215 iov, offset, nr_segs, fat_get_block, NULL);
217 if (ret < 0 && (rw & WRITE)) 216 if (ret < 0 && (rw & WRITE))
218 fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); 217 fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
219 218
@@ -263,7 +262,7 @@ static const struct address_space_operations fat_aops = {
263 * check if the location is still valid and retry if it 262 * check if the location is still valid and retry if it
264 * isn't. Otherwise we do changes. 263 * isn't. Otherwise we do changes.
265 * 5. Spinlock is used to protect hash/unhash/location check/lookup 264 * 5. Spinlock is used to protect hash/unhash/location check/lookup
266 * 6. fat_clear_inode() unhashes the F-d-c entry. 265 * 6. fat_evict_inode() unhashes the F-d-c entry.
267 * 7. lookup() and readdir() do igrab() if they find an F-d-c entry 266 * 7. lookup() and readdir() do igrab() if they find an F-d-c entry
268 * and consider negative result as cache miss. 267 * and consider negative result as cache miss.
269 */ 268 */
@@ -448,16 +447,15 @@ out:
448 447
449EXPORT_SYMBOL_GPL(fat_build_inode); 448EXPORT_SYMBOL_GPL(fat_build_inode);
450 449
451static void fat_delete_inode(struct inode *inode) 450static void fat_evict_inode(struct inode *inode)
452{ 451{
453 truncate_inode_pages(&inode->i_data, 0); 452 truncate_inode_pages(&inode->i_data, 0);
454 inode->i_size = 0; 453 if (!inode->i_nlink) {
455 fat_truncate_blocks(inode, 0); 454 inode->i_size = 0;
456 clear_inode(inode); 455 fat_truncate_blocks(inode, 0);
457} 456 }
458 457 invalidate_inode_buffers(inode);
459static void fat_clear_inode(struct inode *inode) 458 end_writeback(inode);
460{
461 fat_cache_inval_inode(inode); 459 fat_cache_inval_inode(inode);
462 fat_detach(inode); 460 fat_detach(inode);
463} 461}
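fat_delete_inode() and fat_clear_inode() are folded into a single ->evict_inode(), the combined teardown hook the VFS calls for every inode leaving memory, whether it was unlinked or merely evicted; the method must therefore test i_nlink itself before throwing data away, and it ends with end_writeback() where clear_inode() used to run. The general shape, with a hypothetical myfs_free_blocks():

        static void myfs_evict_inode(struct inode *inode)
        {
                truncate_inode_pages(&inode->i_data, 0);
                if (!inode->i_nlink) {
                        /* last link gone: release on-disk space too */
                        inode->i_size = 0;
                        myfs_free_blocks(inode);
                }
                invalidate_inode_buffers(inode);
                end_writeback(inode);
                /* filesystem-private cache teardown goes here */
        }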
@@ -490,8 +488,6 @@ static void fat_put_super(struct super_block *sb)
490{ 488{
491 struct msdos_sb_info *sbi = MSDOS_SB(sb); 489 struct msdos_sb_info *sbi = MSDOS_SB(sb);
492 490
493 lock_kernel();
494
495 if (sb->s_dirt) 491 if (sb->s_dirt)
496 fat_write_super(sb); 492 fat_write_super(sb);
497 493
@@ -505,8 +501,6 @@ static void fat_put_super(struct super_block *sb)
505 501
506 sb->s_fs_info = NULL; 502 sb->s_fs_info = NULL;
507 kfree(sbi); 503 kfree(sbi);
508
509 unlock_kernel();
510} 504}
511 505
512static struct kmem_cache *fat_inode_cachep; 506static struct kmem_cache *fat_inode_cachep;
@@ -674,12 +668,11 @@ static const struct super_operations fat_sops = {
674 .alloc_inode = fat_alloc_inode, 668 .alloc_inode = fat_alloc_inode,
675 .destroy_inode = fat_destroy_inode, 669 .destroy_inode = fat_destroy_inode,
676 .write_inode = fat_write_inode, 670 .write_inode = fat_write_inode,
677 .delete_inode = fat_delete_inode, 671 .evict_inode = fat_evict_inode,
678 .put_super = fat_put_super, 672 .put_super = fat_put_super,
679 .write_super = fat_write_super, 673 .write_super = fat_write_super,
680 .sync_fs = fat_sync_fs, 674 .sync_fs = fat_sync_fs,
681 .statfs = fat_statfs, 675 .statfs = fat_statfs,
682 .clear_inode = fat_clear_inode,
683 .remount_fs = fat_remount, 676 .remount_fs = fat_remount,
684 677
685 .show_options = fat_show_options, 678 .show_options = fat_show_options,
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1fa23f6ffba..970e682ea75 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -250,13 +250,12 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
250{ 250{
251 int i, err = 0; 251 int i, err = 0;
252 252
253 ll_rw_block(SWRITE, nr_bhs, bhs); 253 for (i = 0; i < nr_bhs; i++)
254 write_dirty_buffer(bhs[i], WRITE);
255
254 for (i = 0; i < nr_bhs; i++) { 256 for (i = 0; i < nr_bhs; i++) {
255 wait_on_buffer(bhs[i]); 257 wait_on_buffer(bhs[i]);
256 if (buffer_eopnotsupp(bhs[i])) { 258 if (!err && !buffer_uptodate(bhs[i]))
257 clear_buffer_eopnotsupp(bhs[i]);
258 err = -EOPNOTSUPP;
259 } else if (!err && !buffer_uptodate(bhs[i]))
260 err = -EIO; 259 err = -EIO;
261 } 260 }
262 return err; 261 return err;
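fat_sync_bhs() loses both ll_rw_block(SWRITE, ...) and the BH_Eopnotsupp handling, which were removed from the buffer layer in this timeframe; write_dirty_buffer() locks each buffer, clears its dirty bit and submits it individually, so after waiting the only failure left to report is a buffer that did not come back up to date:

        for (i = 0; i < nr_bhs; i++)
                write_dirty_buffer(bhs[i], WRITE);      /* lock, clear dirty, submit */

        for (i = 0; i < nr_bhs; i++) {
                wait_on_buffer(bhs[i]);
                if (!err && !buffer_uptodate(bhs[i]))
                        err = -EIO;                     /* keep the first error */
        }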
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbc94ae4fd7..3345aabd1dd 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -662,27 +662,30 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
662{ 662{
663 int res; 663 int res;
664 664
665 lock_super(sb);
665 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0); 666 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
666 if (res) 667 if (res) {
668 unlock_super(sb);
667 return res; 669 return res;
670 }
668 671
669 sb->s_flags |= MS_NOATIME; 672 sb->s_flags |= MS_NOATIME;
670 sb->s_root->d_op = &msdos_dentry_operations; 673 sb->s_root->d_op = &msdos_dentry_operations;
674 unlock_super(sb);
671 return 0; 675 return 0;
672} 676}
673 677
674static int msdos_get_sb(struct file_system_type *fs_type, 678static struct dentry *msdos_mount(struct file_system_type *fs_type,
675 int flags, const char *dev_name, 679 int flags, const char *dev_name,
676 void *data, struct vfsmount *mnt) 680 void *data)
677{ 681{
678 return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super, 682 return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
679 mnt);
680} 683}
681 684
682static struct file_system_type msdos_fs_type = { 685static struct file_system_type msdos_fs_type = {
683 .owner = THIS_MODULE, 686 .owner = THIS_MODULE,
684 .name = "msdos", 687 .name = "msdos",
685 .get_sb = msdos_get_sb, 688 .mount = msdos_mount,
686 .kill_sb = kill_block_super, 689 .kill_sb = kill_block_super,
687 .fs_flags = FS_REQUIRES_DEV, 690 .fs_flags = FS_REQUIRES_DEV,
688}; 691};
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6fcc7e71fba..b936703b892 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1055,30 +1055,33 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1055{ 1055{
1056 int res; 1056 int res;
1057 1057
1058 lock_super(sb);
1058 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1); 1059 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
1059 if (res) 1060 if (res) {
1061 unlock_super(sb);
1060 return res; 1062 return res;
1063 }
1061 1064
1062 if (MSDOS_SB(sb)->options.name_check != 's') 1065 if (MSDOS_SB(sb)->options.name_check != 's')
1063 sb->s_root->d_op = &vfat_ci_dentry_ops; 1066 sb->s_root->d_op = &vfat_ci_dentry_ops;
1064 else 1067 else
1065 sb->s_root->d_op = &vfat_dentry_ops; 1068 sb->s_root->d_op = &vfat_dentry_ops;
1066 1069
1070 unlock_super(sb);
1067 return 0; 1071 return 0;
1068} 1072}
1069 1073
1070static int vfat_get_sb(struct file_system_type *fs_type, 1074static struct dentry *vfat_mount(struct file_system_type *fs_type,
1071 int flags, const char *dev_name, 1075 int flags, const char *dev_name,
1072 void *data, struct vfsmount *mnt) 1076 void *data)
1073{ 1077{
1074 return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super, 1078 return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
1075 mnt);
1076} 1079}
1077 1080
1078static struct file_system_type vfat_fs_type = { 1081static struct file_system_type vfat_fs_type = {
1079 .owner = THIS_MODULE, 1082 .owner = THIS_MODULE,
1080 .name = "vfat", 1083 .name = "vfat",
1081 .get_sb = vfat_get_sb, 1084 .mount = vfat_mount,
1082 .kill_sb = kill_block_super, 1085 .kill_sb = kill_block_super,
1083 .fs_flags = FS_REQUIRES_DEV, 1086 .fs_flags = FS_REQUIRES_DEV,
1084}; 1087};
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9d175d623aa..ecc8b3954ed 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head)
640 * match the state "is the filp on a fasync list". 640 * match the state "is the filp on a fasync list".
641 * 641 *
642 */ 642 */
643static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 643int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
644{ 644{
645 struct fasync_struct *fa, **fp; 645 struct fasync_struct *fa, **fp;
646 int result = 0; 646 int result = 0;
@@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
666 return result; 666 return result;
667} 667}
668 668
669struct fasync_struct *fasync_alloc(void)
670{
671 return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
672}
673
669/* 674/*
670 * Add a fasync entry. Return negative on error, positive if 675 * NOTE! This can be used only for unused fasync entries:
671 * added, and zero if did nothing but change an existing one. 676 * entries that actually got inserted on the fasync list
677 * need to be released by rcu - see fasync_remove_entry.
678 */
679void fasync_free(struct fasync_struct *new)
680{
681 kmem_cache_free(fasync_cache, new);
682}
683
684/*
685 * Insert a new entry into the fasync list. Return the pointer to the
686 * old one if we didn't use the new one.
672 * 687 *
673 * NOTE! It is very important that the FASYNC flag always 688 * NOTE! It is very important that the FASYNC flag always
674 * match the state "is the filp on a fasync list". 689 * match the state "is the filp on a fasync list".
675 */ 690 */
676static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) 691struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
677{ 692{
678 struct fasync_struct *new, *fa, **fp; 693 struct fasync_struct *fa, **fp;
679 int result = 0;
680
681 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
682 if (!new)
683 return -ENOMEM;
684 694
685 spin_lock(&filp->f_lock); 695 spin_lock(&filp->f_lock);
686 spin_lock(&fasync_lock); 696 spin_lock(&fasync_lock);
@@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
691 spin_lock_irq(&fa->fa_lock); 701 spin_lock_irq(&fa->fa_lock);
692 fa->fa_fd = fd; 702 fa->fa_fd = fd;
693 spin_unlock_irq(&fa->fa_lock); 703 spin_unlock_irq(&fa->fa_lock);
694
695 kmem_cache_free(fasync_cache, new);
696 goto out; 704 goto out;
697 } 705 }
698 706
@@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
702 new->fa_fd = fd; 710 new->fa_fd = fd;
703 new->fa_next = *fapp; 711 new->fa_next = *fapp;
704 rcu_assign_pointer(*fapp, new); 712 rcu_assign_pointer(*fapp, new);
705 result = 1;
706 filp->f_flags |= FASYNC; 713 filp->f_flags |= FASYNC;
707 714
708out: 715out:
709 spin_unlock(&fasync_lock); 716 spin_unlock(&fasync_lock);
710 spin_unlock(&filp->f_lock); 717 spin_unlock(&filp->f_lock);
711 return result; 718 return fa;
719}
720
721/*
722 * Add a fasync entry. Return negative on error, positive if
723 * added, and zero if did nothing but change an existing one.
724 */
725static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
726{
727 struct fasync_struct *new;
728
729 new = fasync_alloc();
730 if (!new)
731 return -ENOMEM;
732
733 /*
734 * fasync_insert_entry() returns the old (update) entry if
735 * it existed.
736 *
737 * So free the (unused) new entry and return 0 to let the
738 * caller know that we didn't add any new fasync entries.
739 */
740 if (fasync_insert_entry(fd, filp, fapp, new)) {
741 fasync_free(new);
742 return 0;
743 }
744
745 return 1;
712} 746}
713 747
714/* 748/*
@@ -767,11 +801,26 @@ void kill_fasync(struct fasync_struct **fp, int sig, int band)
767} 801}
768EXPORT_SYMBOL(kill_fasync); 802EXPORT_SYMBOL(kill_fasync);
769 803
770static int __init fasync_init(void) 804static int __init fcntl_init(void)
771{ 805{
806 /*
807 * Please add new bits here to ensure allocation uniqueness.
808 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
809 * is defined as O_NONBLOCK on some platforms and not on others.
810 */
811 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
812 O_RDONLY | O_WRONLY | O_RDWR |
813 O_CREAT | O_EXCL | O_NOCTTY |
814 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
815 __O_SYNC | O_DSYNC | FASYNC |
816 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
818 FMODE_EXEC
819 ));
820
772 fasync_cache = kmem_cache_create("fasync_cache", 821 fasync_cache = kmem_cache_create("fasync_cache",
773 sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); 822 sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
774 return 0; 823 return 0;
775} 824}
776 825
777module_init(fasync_init) 826module_init(fcntl_init)
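The BUILD_BUG_ON() added to the renamed fcntl_init() is a compile-time uniqueness check on the O_* flag values: HWEIGHT32() is a population count, so if any two of the listed flags accidentally shared a bit, the OR of them all would contain fewer set bits than the expected 18 - 1 (eighteen flags listed, with O_RDONLY being zero and contributing no bit). A toy illustration of the failure it guards against, with made-up flag values:

        #define FLAG_A 0x1000
        #define FLAG_B 0x1000                   /* accidental bit reuse */
        /* HWEIGHT32(FLAG_A | FLAG_B) == 1, not 2, so a check like
         * BUILD_BUG_ON(2 != HWEIGHT32(FLAG_A | FLAG_B)) fails to compile. */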
diff --git a/fs/fifo.c b/fs/fifo.c
index 5d6606ffc2d..4e303c22d5e 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -151,4 +151,5 @@ err_nocleanup:
151 */ 151 */
152const struct file_operations def_fifo_fops = { 152const struct file_operations def_fifo_fops = {
153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */ 153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */
154 .llseek = noop_llseek,
154}; 155};
diff --git a/fs/file.c b/fs/file.c
index 34bb7f71d99..0be344755c0 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -39,28 +39,27 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */
39 */ 39 */
40static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); 40static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
41 41
42static inline void * alloc_fdmem(unsigned int size) 42static inline void *alloc_fdmem(unsigned int size)
43{ 43{
44 if (size <= PAGE_SIZE) 44 void *data;
45 return kmalloc(size, GFP_KERNEL); 45
46 else 46 data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
47 return vmalloc(size); 47 if (data != NULL)
48 return data;
49
50 return vmalloc(size);
48} 51}
49 52
50static inline void free_fdarr(struct fdtable *fdt) 53static void free_fdmem(void *ptr)
51{ 54{
52 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) 55 is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
53 kfree(fdt->fd);
54 else
55 vfree(fdt->fd);
56} 56}
57 57
58static inline void free_fdset(struct fdtable *fdt) 58static void __free_fdtable(struct fdtable *fdt)
59{ 59{
60 if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2)) 60 free_fdmem(fdt->fd);
61 kfree(fdt->open_fds); 61 free_fdmem(fdt->open_fds);
62 else 62 kfree(fdt);
63 vfree(fdt->open_fds);
64} 63}
65 64
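alloc_fdmem() and free_fdmem() replace the old page-size threshold heuristics: instead of predicting whether a table fits a kmalloc allocation, the code simply tries kmalloc() first, with __GFP_NOWARN so an expected higher-order failure stays silent, and falls back to vmalloc(); the free side asks is_vmalloc_addr() which allocator the pointer came from. The pattern in isolation:

        static void *alloc_mem(unsigned int size)
        {
                void *data = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);

                if (data != NULL)
                        return data;
                return vmalloc(size);   /* too big or too fragmented for kmalloc */
        }

        static void free_mem(void *ptr)
        {
                if (is_vmalloc_addr(ptr))
                        vfree(ptr);
                else
                        kfree(ptr);
        }

This is also why free_fdtable_rcu() below can switch from a max_fds size test to is_vmalloc_addr() on the actual pointers.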
66static void free_fdtable_work(struct work_struct *work) 65static void free_fdtable_work(struct work_struct *work)
@@ -75,9 +74,8 @@ static void free_fdtable_work(struct work_struct *work)
75 spin_unlock_bh(&f->lock); 74 spin_unlock_bh(&f->lock);
76 while(fdt) { 75 while(fdt) {
77 struct fdtable *next = fdt->next; 76 struct fdtable *next = fdt->next;
78 vfree(fdt->fd); 77
79 free_fdset(fdt); 78 __free_fdtable(fdt);
80 kfree(fdt);
81 fdt = next; 79 fdt = next;
82 } 80 }
83} 81}
@@ -98,7 +96,7 @@ void free_fdtable_rcu(struct rcu_head *rcu)
98 container_of(fdt, struct files_struct, fdtab)); 96 container_of(fdt, struct files_struct, fdtab));
99 return; 97 return;
100 } 98 }
101 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) { 99 if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
102 kfree(fdt->fd); 100 kfree(fdt->fd);
103 kfree(fdt->open_fds); 101 kfree(fdt->open_fds);
104 kfree(fdt); 102 kfree(fdt);
@@ -178,13 +176,12 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
178 fdt->open_fds = (fd_set *)data; 176 fdt->open_fds = (fd_set *)data;
179 data += nr / BITS_PER_BYTE; 177 data += nr / BITS_PER_BYTE;
180 fdt->close_on_exec = (fd_set *)data; 178 fdt->close_on_exec = (fd_set *)data;
181 INIT_RCU_HEAD(&fdt->rcu);
182 fdt->next = NULL; 179 fdt->next = NULL;
183 180
184 return fdt; 181 return fdt;
185 182
186out_arr: 183out_arr:
187 free_fdarr(fdt); 184 free_fdmem(fdt->fd);
188out_fdt: 185out_fdt:
189 kfree(fdt); 186 kfree(fdt);
190out: 187out:
@@ -214,9 +211,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
214 * caller and alloc_fdtable(). Cheaper to catch it here... 211 * caller and alloc_fdtable(). Cheaper to catch it here...
215 */ 212 */
216 if (unlikely(new_fdt->max_fds <= nr)) { 213 if (unlikely(new_fdt->max_fds <= nr)) {
217 free_fdarr(new_fdt); 214 __free_fdtable(new_fdt);
218 free_fdset(new_fdt);
219 kfree(new_fdt);
220 return -EMFILE; 215 return -EMFILE;
221 } 216 }
222 /* 217 /*
@@ -232,9 +227,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
232 free_fdtable(cur_fdt); 227 free_fdtable(cur_fdt);
233 } else { 228 } else {
234 /* Somebody else expanded, so undo our attempt */ 229 /* Somebody else expanded, so undo our attempt */
235 free_fdarr(new_fdt); 230 __free_fdtable(new_fdt);
236 free_fdset(new_fdt);
237 kfree(new_fdt);
238 } 231 }
239 return 1; 232 return 1;
240} 233}
@@ -312,7 +305,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
312 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 305 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
313 new_fdt->open_fds = (fd_set *)&newf->open_fds_init; 306 new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
314 new_fdt->fd = &newf->fd_array[0]; 307 new_fdt->fd = &newf->fd_array[0];
315 INIT_RCU_HEAD(&new_fdt->rcu);
316 new_fdt->next = NULL; 308 new_fdt->next = NULL;
317 309
318 spin_lock(&oldf->file_lock); 310 spin_lock(&oldf->file_lock);
@@ -325,11 +317,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
325 while (unlikely(open_files > new_fdt->max_fds)) { 317 while (unlikely(open_files > new_fdt->max_fds)) {
326 spin_unlock(&oldf->file_lock); 318 spin_unlock(&oldf->file_lock);
327 319
328 if (new_fdt != &newf->fdtab) { 320 if (new_fdt != &newf->fdtab)
329 free_fdarr(new_fdt); 321 __free_fdtable(new_fdt);
330 free_fdset(new_fdt);
331 kfree(new_fdt);
332 }
333 322
334 new_fdt = alloc_fdtable(open_files - 1); 323 new_fdt = alloc_fdtable(open_files - 1);
335 if (!new_fdt) { 324 if (!new_fdt) {
@@ -339,9 +328,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
339 328
340 /* beyond sysctl_nr_open; nothing to do */ 329 /* beyond sysctl_nr_open; nothing to do */
341 if (unlikely(new_fdt->max_fds < open_files)) { 330 if (unlikely(new_fdt->max_fds < open_files)) {
342 free_fdarr(new_fdt); 331 __free_fdtable(new_fdt);
343 free_fdset(new_fdt);
344 kfree(new_fdt);
345 *errorp = -EMFILE; 332 *errorp = -EMFILE;
346 goto out_release; 333 goto out_release;
347 } 334 }
@@ -430,7 +417,6 @@ struct files_struct init_files = {
430 .fd = &init_files.fd_array[0], 417 .fd = &init_files.fd_array[0],
431 .close_on_exec = (fd_set *)&init_files.close_on_exec_init, 418 .close_on_exec = (fd_set *)&init_files.close_on_exec_init,
432 .open_fds = (fd_set *)&init_files.open_fds_init, 419 .open_fds = (fd_set *)&init_files.open_fds_init,
433 .rcu = RCU_HEAD_INIT,
434 }, 420 },
435 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), 421 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
436}; 422};
diff --git a/fs/file_table.c b/fs/file_table.c
index 5c7d10ead4a..c3dee381f1b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,9 @@
20#include <linux/cdev.h> 20#include <linux/cdev.h>
21#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
22#include <linux/sysctl.h> 22#include <linux/sysctl.h>
23#include <linux/lglock.h>
23#include <linux/percpu_counter.h> 24#include <linux/percpu_counter.h>
25#include <linux/percpu.h>
24#include <linux/ima.h> 26#include <linux/ima.h>
25 27
26#include <asm/atomic.h> 28#include <asm/atomic.h>
@@ -32,8 +34,8 @@ struct files_stat_struct files_stat = {
32 .max_files = NR_FILE 34 .max_files = NR_FILE
33}; 35};
34 36
35/* public. Not pretty! */ 37DECLARE_LGLOCK(files_lglock);
36__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 38DEFINE_LGLOCK(files_lglock);
37 39
38/* SLAB cache for file structures */ 40/* SLAB cache for file structures */
39static struct kmem_cache *filp_cachep __read_mostly; 41static struct kmem_cache *filp_cachep __read_mostly;
@@ -58,7 +60,7 @@ static inline void file_free(struct file *f)
58/* 60/*
59 * Return the total number of open files in the system 61 * Return the total number of open files in the system
60 */ 62 */
61static int get_nr_files(void) 63static long get_nr_files(void)
62{ 64{
63 return percpu_counter_read_positive(&nr_files); 65 return percpu_counter_read_positive(&nr_files);
64} 66}
@@ -66,7 +68,7 @@ static int get_nr_files(void)
66/* 68/*
67 * Return the maximum number of open files in the system 69 * Return the maximum number of open files in the system
68 */ 70 */
69int get_max_files(void) 71unsigned long get_max_files(void)
70{ 72{
71 return files_stat.max_files; 73 return files_stat.max_files;
72} 74}
@@ -80,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
80 void __user *buffer, size_t *lenp, loff_t *ppos) 82 void __user *buffer, size_t *lenp, loff_t *ppos)
81{ 83{
82 files_stat.nr_files = get_nr_files(); 84 files_stat.nr_files = get_nr_files();
83 return proc_dointvec(table, write, buffer, lenp, ppos); 85 return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
84} 86}
85#else 87#else
86int proc_nr_files(ctl_table *table, int write, 88int proc_nr_files(ctl_table *table, int write,
@@ -103,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
103struct file *get_empty_filp(void) 105struct file *get_empty_filp(void)
104{ 106{
105 const struct cred *cred = current_cred(); 107 const struct cred *cred = current_cred();
106 static int old_max; 108 static long old_max;
107 struct file * f; 109 struct file * f;
108 110
109 /* 111 /*
@@ -138,8 +140,7 @@ struct file *get_empty_filp(void)
138over: 140over:
139 /* Ran out of filps - report that */ 141 /* Ran out of filps - report that */
140 if (get_nr_files() > old_max) { 142 if (get_nr_files() > old_max) {
141 printk(KERN_INFO "VFS: file-max limit %d reached\n", 143 pr_info("VFS: file-max limit %lu reached\n", get_max_files());
142 get_max_files());
143 old_max = get_nr_files(); 144 old_max = get_nr_files();
144 } 145 }
145 goto fail; 146 goto fail;
@@ -249,7 +250,7 @@ static void __fput(struct file *file)
249 cdev_put(inode->i_cdev); 250 cdev_put(inode->i_cdev);
250 fops_put(file->f_op); 251 fops_put(file->f_op);
251 put_pid(file->f_owner.pid); 252 put_pid(file->f_owner.pid);
252 file_kill(file); 253 file_sb_list_del(file);
253 if (file->f_mode & FMODE_WRITE) 254 if (file->f_mode & FMODE_WRITE)
254 drop_file_write_access(file); 255 drop_file_write_access(file);
255 file->f_path.dentry = NULL; 256 file->f_path.dentry = NULL;
@@ -289,11 +290,20 @@ struct file *fget(unsigned int fd)
289EXPORT_SYMBOL(fget); 290EXPORT_SYMBOL(fget);
290 291
291/* 292/*
292 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 293 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
293 * You can use this only if it is guranteed that the current task already 294 *
294 * holds a refcnt to that file. That check has to be done at fget() only 295 * You can use this instead of fget if you satisfy all of the following
295 * and a flag is returned to be passed to the corresponding fput_light(). 296 * conditions:
296 * There must not be a cloning between an fget_light/fput_light pair. 297 * 1) You must call fput_light before exiting the syscall and returning control
298 * to userspace (i.e. you cannot remember the returned struct file * after
299 * returning to userspace).
300 * 2) You must not call filp_close on the returned struct file * in between
301 * calls to fget_light and fput_light.
302 * 3) You must not clone the current task in between the calls to fget_light
303 * and fput_light.
304 *
305 * The fput_needed flag returned by fget_light should be passed to the
306 * corresponding fput_light.
297 */ 307 */
298struct file *fget_light(unsigned int fd, int *fput_needed) 308struct file *fget_light(unsigned int fd, int *fput_needed)
299{ 309{
@@ -319,41 +329,107 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
319 return file; 329 return file;
320} 330}
321 331
322
323void put_filp(struct file *file) 332void put_filp(struct file *file)
324{ 333{
325 if (atomic_long_dec_and_test(&file->f_count)) { 334 if (atomic_long_dec_and_test(&file->f_count)) {
326 security_file_free(file); 335 security_file_free(file);
327 file_kill(file); 336 file_sb_list_del(file);
328 file_free(file); 337 file_free(file);
329 } 338 }
330} 339}
331 340
332void file_move(struct file *file, struct list_head *list) 341static inline int file_list_cpu(struct file *file)
333{ 342{
334 if (!list) 343#ifdef CONFIG_SMP
335 return; 344 return file->f_sb_list_cpu;
336 file_list_lock(); 345#else
337 list_move(&file->f_u.fu_list, list); 346 return smp_processor_id();
338 file_list_unlock(); 347#endif
339} 348}
340 349
341void file_kill(struct file *file) 350/* helper for file_sb_list_add to reduce ifdefs */
351static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
352{
353 struct list_head *list;
354#ifdef CONFIG_SMP
355 int cpu;
356 cpu = smp_processor_id();
357 file->f_sb_list_cpu = cpu;
358 list = per_cpu_ptr(sb->s_files, cpu);
359#else
360 list = &sb->s_files;
361#endif
362 list_add(&file->f_u.fu_list, list);
363}
364
365/**
366 * file_sb_list_add - add a file to the sb's file list
367 * @file: file to add
368 * @sb: sb to add it to
369 *
370 * Use this function to associate a file with the superblock of the inode it
371 * refers to.
372 */
373void file_sb_list_add(struct file *file, struct super_block *sb)
374{
375 lg_local_lock(files_lglock);
376 __file_sb_list_add(file, sb);
377 lg_local_unlock(files_lglock);
378}
379
380/**
381 * file_sb_list_del - remove a file from the sb's file list
382 * @file: file to remove
383 * @sb: sb to remove it from
384 *
385 * Use this function to remove a file from its superblock.
386 */
387void file_sb_list_del(struct file *file)
342{ 388{
343 if (!list_empty(&file->f_u.fu_list)) { 389 if (!list_empty(&file->f_u.fu_list)) {
344 file_list_lock(); 390 lg_local_lock_cpu(files_lglock, file_list_cpu(file));
345 list_del_init(&file->f_u.fu_list); 391 list_del_init(&file->f_u.fu_list);
346 file_list_unlock(); 392 lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
347 } 393 }
348} 394}
349 395
396#ifdef CONFIG_SMP
397
398/*
399 * These macros iterate all files on all CPUs for a given superblock.
400 * files_lglock must be held globally.
401 */
402#define do_file_list_for_each_entry(__sb, __file) \
403{ \
404 int i; \
405 for_each_possible_cpu(i) { \
406 struct list_head *list; \
407 list = per_cpu_ptr((__sb)->s_files, i); \
408 list_for_each_entry((__file), list, f_u.fu_list)
409
410#define while_file_list_for_each_entry \
411 } \
412}
413
414#else
415
416#define do_file_list_for_each_entry(__sb, __file) \
417{ \
418 struct list_head *list; \
419 list = &(sb)->s_files; \
420 list_for_each_entry((__file), list, f_u.fu_list)
421
422#define while_file_list_for_each_entry \
423}
424
425#endif
426
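files_lglock is a local/global ("lglock") lock guarding the new per-CPU superblock file lists: the common add path takes only the current CPU's spinlock, removal locks whichever CPU the file was added on (recorded in f_sb_list_cpu), and the rare walkers below, fs_may_remount_ro() and mark_files_ro(), take every CPU's lock at once with lg_global_lock(). A sketch of the split, assuming the lglock primitives of this kernel generation:

        static DEFINE_PER_CPU(struct list_head, my_lists);
        DECLARE_LGLOCK(my_lglock);
        DEFINE_LGLOCK(my_lglock);

        /* fast path: contend only with users of this CPU's list */
        lg_local_lock(my_lglock);
        list_add(&item->node, this_cpu_ptr(&my_lists));
        lg_local_unlock(my_lglock);

        /* slow path: exclude all CPUs while walking every list */
        lg_global_lock(my_lglock);
        for_each_possible_cpu(cpu) {
                struct list_head *head = per_cpu_ptr(&my_lists, cpu);
                /* ... walk head ... */
        }
        lg_global_unlock(my_lglock);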
350int fs_may_remount_ro(struct super_block *sb) 427int fs_may_remount_ro(struct super_block *sb)
351{ 428{
352 struct file *file; 429 struct file *file;
353
354 /* Check that no files are currently opened for writing. */ 430 /* Check that no files are currently opened for writing. */
355 file_list_lock(); 431 lg_global_lock(files_lglock);
356 list_for_each_entry(file, &sb->s_files, f_u.fu_list) { 432 do_file_list_for_each_entry(sb, file) {
357 struct inode *inode = file->f_path.dentry->d_inode; 433 struct inode *inode = file->f_path.dentry->d_inode;
358 434
359 /* File with pending delete? */ 435 /* File with pending delete? */
@@ -363,11 +439,11 @@ int fs_may_remount_ro(struct super_block *sb)
363 /* Writeable file? */ 439 /* Writeable file? */
364 if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) 440 if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
365 goto too_bad; 441 goto too_bad;
366 } 442 } while_file_list_for_each_entry;
367 file_list_unlock(); 443 lg_global_unlock(files_lglock);
368 return 1; /* Tis' cool bro. */ 444 return 1; /* Tis' cool bro. */
369too_bad: 445too_bad:
370 file_list_unlock(); 446 lg_global_unlock(files_lglock);
371 return 0; 447 return 0;
372} 448}
373 449
@@ -383,8 +459,8 @@ void mark_files_ro(struct super_block *sb)
383 struct file *f; 459 struct file *f;
384 460
385retry: 461retry:
386 file_list_lock(); 462 lg_global_lock(files_lglock);
387 list_for_each_entry(f, &sb->s_files, f_u.fu_list) { 463 do_file_list_for_each_entry(sb, f) {
388 struct vfsmount *mnt; 464 struct vfsmount *mnt;
389 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) 465 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
390 continue; 466 continue;
@@ -399,21 +475,18 @@ retry:
399 continue; 475 continue;
400 file_release_write(f); 476 file_release_write(f);
401 mnt = mntget(f->f_path.mnt); 477 mnt = mntget(f->f_path.mnt);
402 file_list_unlock(); 478 /* This can sleep, so we can't hold the spinlock. */
403 /* 479 lg_global_unlock(files_lglock);
404 * This can sleep, so we can't hold
405 * the file_list_lock() spinlock.
406 */
407 mnt_drop_write(mnt); 480 mnt_drop_write(mnt);
408 mntput(mnt); 481 mntput(mnt);
409 goto retry; 482 goto retry;
410 } 483 } while_file_list_for_each_entry;
411 file_list_unlock(); 484 lg_global_unlock(files_lglock);
412} 485}
413 486
414void __init files_init(unsigned long mempages) 487void __init files_init(unsigned long mempages)
415{ 488{
416 int n; 489 unsigned long n;
417 490
418 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, 491 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
419 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 492 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -424,9 +497,8 @@ void __init files_init(unsigned long mempages)
424 */ 497 */
425 498
426 n = (mempages * (PAGE_SIZE / 1024)) / 10; 499 n = (mempages * (PAGE_SIZE / 1024)) / 10;
427 files_stat.max_files = n; 500 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
428 if (files_stat.max_files < NR_FILE)
429 files_stat.max_files = NR_FILE;
430 files_defer_init(); 501 files_defer_init();
502 lg_lock_init(files_lglock);
431 percpu_counter_init(&nr_files, 0); 503 percpu_counter_init(&nr_files, 0);
432} 504}
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index 50ab5eecb99..881aa3d217f 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -63,7 +63,7 @@ extern void vxfs_put_fake_inode(struct inode *);
63extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t); 63extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t);
64extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t); 64extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t);
65extern struct inode * vxfs_iget(struct super_block *, ino_t); 65extern struct inode * vxfs_iget(struct super_block *, ino_t);
66extern void vxfs_clear_inode(struct inode *); 66extern void vxfs_evict_inode(struct inode *);
67 67
68/* vxfs_lookup.c */ 68/* vxfs_lookup.c */
69extern const struct inode_operations vxfs_dir_inode_ops; 69extern const struct inode_operations vxfs_dir_inode_ops;
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 03a6ea5e99f..8c04eac5079 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
260 struct inode *ip = NULL; 260 struct inode *ip = NULL;
261 261
262 if ((ip = new_inode(sbp))) { 262 if ((ip = new_inode(sbp))) {
263 ip->i_ino = get_next_ino();
263 vxfs_iinit(ip, vip); 264 vxfs_iinit(ip, vip);
264 ip->i_mapping->a_ops = &vxfs_aops; 265 ip->i_mapping->a_ops = &vxfs_aops;
265 } 266 }
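The added get_next_ino() call reflects a contemporaneous VFS change: new_inode() stopped assigning inode numbers itself, so filesystems that fabricate in-memory inodes must now fetch a pseudo inode number explicitly:

        struct inode *ip = new_inode(sb);

        if (ip) {
                ip->i_ino = get_next_ino();     /* new_inode() no longer sets i_ino */
                /* ... filesystem-specific setup ... */
        }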
@@ -337,15 +338,17 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
337} 338}
338 339
339/** 340/**
340 * vxfs_clear_inode - remove inode from main memory 341 * vxfs_evict_inode - remove inode from main memory
341 * @ip: inode to discard. 342 * @ip: inode to discard.
342 * 343 *
343 * Description: 344 * Description:
344 * vxfs_clear_inode() is called on the final iput and frees the private 345 * vxfs_evict_inode() is called on the final iput and frees the private
345 * inode area. 346 * inode area.
346 */ 347 */
347void 348void
348vxfs_clear_inode(struct inode *ip) 349vxfs_evict_inode(struct inode *ip)
349{ 350{
351 truncate_inode_pages(&ip->i_data, 0);
352 end_writeback(ip);
350 kmem_cache_free(vxfs_inode_cachep, ip->i_private); 353 kmem_cache_free(vxfs_inode_cachep, ip->i_private);
351} 354}
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 0ec7bb2c95c..6c5131d592f 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -36,7 +36,6 @@
36#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/kernel.h> 37#include <linux/kernel.h>
38#include <linux/pagemap.h> 38#include <linux/pagemap.h>
39#include <linux/smp_lock.h>
40 39
41#include "vxfs.h" 40#include "vxfs.h"
42#include "vxfs_dir.h" 41#include "vxfs_dir.h"
@@ -212,16 +211,12 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd)
212 if (dp->d_name.len > VXFS_NAMELEN) 211 if (dp->d_name.len > VXFS_NAMELEN)
213 return ERR_PTR(-ENAMETOOLONG); 212 return ERR_PTR(-ENAMETOOLONG);
214 213
215 lock_kernel();
216 ino = vxfs_inode_by_name(dip, dp); 214 ino = vxfs_inode_by_name(dip, dp);
217 if (ino) { 215 if (ino) {
218 ip = vxfs_iget(dip->i_sb, ino); 216 ip = vxfs_iget(dip->i_sb, ino);
219 if (IS_ERR(ip)) { 217 if (IS_ERR(ip))
220 unlock_kernel();
221 return ERR_CAST(ip); 218 return ERR_CAST(ip);
222 }
223 } 219 }
224 unlock_kernel();
225 d_add(dp, ip); 220 d_add(dp, ip);
226 return NULL; 221 return NULL;
227} 222}
@@ -248,8 +243,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
248 u_long page, npages, block, pblocks, nblocks, offset; 243 u_long page, npages, block, pblocks, nblocks, offset;
249 loff_t pos; 244 loff_t pos;
250 245
251 lock_kernel();
252
253 switch ((long)fp->f_pos) { 246 switch ((long)fp->f_pos) {
254 case 0: 247 case 0:
255 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) 248 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
@@ -265,10 +258,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
265 258
266 pos = fp->f_pos - 2; 259 pos = fp->f_pos - 2;
267 260
268 if (pos > VXFS_DIRROUND(ip->i_size)) { 261 if (pos > VXFS_DIRROUND(ip->i_size))
269 unlock_kernel();
270 return 0; 262 return 0;
271 }
272 263
273 npages = dir_pages(ip); 264 npages = dir_pages(ip);
274 nblocks = dir_blocks(ip); 265 nblocks = dir_blocks(ip);
@@ -327,6 +318,5 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
327done: 318done:
328 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; 319 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
329out: 320out:
330 unlock_kernel();
331 return 0; 321 return 0;
332} 322}
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1e8af939b3e..9d1c9955838 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,7 +38,6 @@
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/smp_lock.h>
42#include <linux/stat.h> 41#include <linux/stat.h>
43#include <linux/vfs.h> 42#include <linux/vfs.h>
44#include <linux/mount.h> 43#include <linux/mount.h>
@@ -61,7 +60,7 @@ static int vxfs_statfs(struct dentry *, struct kstatfs *);
61static int vxfs_remount(struct super_block *, int *, char *); 60static int vxfs_remount(struct super_block *, int *, char *);
62 61
63static const struct super_operations vxfs_super_ops = { 62static const struct super_operations vxfs_super_ops = {
64 .clear_inode = vxfs_clear_inode, 63 .evict_inode = vxfs_evict_inode,
65 .put_super = vxfs_put_super, 64 .put_super = vxfs_put_super,
66 .statfs = vxfs_statfs, 65 .statfs = vxfs_statfs,
67 .remount_fs = vxfs_remount, 66 .remount_fs = vxfs_remount,
@@ -81,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
81{ 80{
82 struct vxfs_sb_info *infp = VXFS_SBI(sbp); 81 struct vxfs_sb_info *infp = VXFS_SBI(sbp);
83 82
84 lock_kernel();
85
86 vxfs_put_fake_inode(infp->vsi_fship); 83 vxfs_put_fake_inode(infp->vsi_fship);
87 vxfs_put_fake_inode(infp->vsi_ilist); 84 vxfs_put_fake_inode(infp->vsi_ilist);
88 vxfs_put_fake_inode(infp->vsi_stilist); 85 vxfs_put_fake_inode(infp->vsi_stilist);
89 86
90 brelse(infp->vsi_bp); 87 brelse(infp->vsi_bp);
91 kfree(infp); 88 kfree(infp);
92
93 unlock_kernel();
94} 89}
95 90
96/** 91/**
@@ -135,7 +130,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
135} 130}
136 131
137/** 132/**
138 * vxfs_read_super - read superblock into memory and initalize filesystem 133 * vxfs_read_super - read superblock into memory and initialize filesystem
139 * @sbp: VFS superblock (to fill) 134 * @sbp: VFS superblock (to fill)
140 * @dp: fs private mount data 135 * @dp: fs private mount data
141 * @silent: do not complain loudly when sth is wrong 136 * @silent: do not complain loudly when sth is wrong
@@ -148,7 +143,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
148 * The superblock on success, else %NULL. 143 * The superblock on success, else %NULL.
149 * 144 *
150 * Locking: 145 * Locking:
151 * We are under the bkl and @sbp->s_lock. 146 * We are under @sbp->s_lock.
152 */ 147 */
153static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) 148static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
154{ 149{
@@ -251,17 +246,16 @@ out:
251/* 246/*
252 * The usual module blurb. 247 * The usual module blurb.
253 */ 248 */
254static int vxfs_get_sb(struct file_system_type *fs_type, 249static struct dentry *vxfs_mount(struct file_system_type *fs_type,
255 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 250 int flags, const char *dev_name, void *data)
256{ 251{
257 return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super, 252 return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
258 mnt);
259} 253}
260 254
261static struct file_system_type vxfs_fs_type = { 255static struct file_system_type vxfs_fs_type = {
262 .owner = THIS_MODULE, 256 .owner = THIS_MODULE,
263 .name = "vxfs", 257 .name = "vxfs",
264 .get_sb = vxfs_get_sb, 258 .mount = vxfs_mount,
265 .kill_sb = kill_block_super, 259 .kill_sb = kill_block_super,
266 .fs_flags = FS_REQUIRES_DEV, 260 .fs_flags = FS_REQUIRES_DEV,
267}; 261};
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d5be1693ac9..3d06ccc953a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,15 +26,9 @@
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/tracepoint.h>
29#include "internal.h" 30#include "internal.h"
30 31
31#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
32
33/*
34 * We don't actually have pdflush, but this one is exported through /proc...
35 */
36int nr_pdflush_threads;
37
38/* 32/*
39 * Passed into wb_writeback(), essentially a subset of writeback_control 33 * Passed into wb_writeback(), essentially a subset of writeback_control
40 */ 34 */
@@ -50,6 +44,19 @@ struct wb_writeback_work {
50 struct completion *done; /* set if the caller waits */ 44 struct completion *done; /* set if the caller waits */
51}; 45};
52 46
47/*
48 * Include the creation of the trace points after defining the
49 * wb_writeback_work structure so that the definition remains local to this
50 * file.
51 */
52#define CREATE_TRACE_POINTS
53#include <trace/events/writeback.h>
54
55/*
56 * We don't actually have pdflush, but this one is exported through /proc...
57 */
58int nr_pdflush_threads;
59
53/** 60/**
54 * writeback_in_progress - determine whether there is writeback in progress 61 * writeback_in_progress - determine whether there is writeback in progress
55 * @bdi: the device's backing_dev_info structure. 62 * @bdi: the device's backing_dev_info structure.
@@ -59,28 +66,42 @@ struct wb_writeback_work {
59 */ 66 */
60int writeback_in_progress(struct backing_dev_info *bdi) 67int writeback_in_progress(struct backing_dev_info *bdi)
61{ 68{
62 return !list_empty(&bdi->work_list); 69 return test_bit(BDI_writeback_running, &bdi->state);
70}
71
72static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
73{
74 struct super_block *sb = inode->i_sb;
75
76 if (strcmp(sb->s_type->name, "bdev") == 0)
77 return inode->i_mapping->backing_dev_info;
78
79 return sb->s_bdi;
80}
81
82static inline struct inode *wb_inode(struct list_head *head)
83{
84 return list_entry(head, struct inode, i_wb_list);
63} 85}
64 86
65static void bdi_queue_work(struct backing_dev_info *bdi, 87static void bdi_queue_work(struct backing_dev_info *bdi,
66 struct wb_writeback_work *work) 88 struct wb_writeback_work *work)
67{ 89{
68 spin_lock(&bdi->wb_lock); 90 trace_writeback_queue(bdi, work);
69 list_add_tail(&work->list, &bdi->work_list);
70 spin_unlock(&bdi->wb_lock);
71 91
72 /* 92 spin_lock_bh(&bdi->wb_lock);
73 * If the default thread isn't there, make sure we add it. When 93 list_add_tail(&work->list, &bdi->work_list);
74 * it gets created and wakes up, we'll run this work. 94 if (bdi->wb.task) {
75 */ 95 wake_up_process(bdi->wb.task);
76 if (unlikely(list_empty_careful(&bdi->wb_list))) 96 } else {
97 /*
98 * The bdi thread isn't there, wake up the forker thread which
99 * will create and run it.
100 */
101 trace_writeback_nothread(bdi, work);
77 wake_up_process(default_backing_dev_info.wb.task); 102 wake_up_process(default_backing_dev_info.wb.task);
78 else {
79 struct bdi_writeback *wb = &bdi->wb;
80
81 if (wb->task)
82 wake_up_process(wb->task);
83 } 103 }
104 spin_unlock_bh(&bdi->wb_lock);
84} 105}
85 106
86static void 107static void
@@ -95,8 +116,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
95 */ 116 */
96 work = kzalloc(sizeof(*work), GFP_ATOMIC); 117 work = kzalloc(sizeof(*work), GFP_ATOMIC);
97 if (!work) { 118 if (!work) {
98 if (bdi->wb.task) 119 if (bdi->wb.task) {
120 trace_writeback_nowork(bdi);
99 wake_up_process(bdi->wb.task); 121 wake_up_process(bdi->wb.task);
122 }
100 return; 123 return;
101 } 124 }
102 125
@@ -154,11 +177,11 @@ static void redirty_tail(struct inode *inode)
154 if (!list_empty(&wb->b_dirty)) { 177 if (!list_empty(&wb->b_dirty)) {
155 struct inode *tail; 178 struct inode *tail;
156 179
157 tail = list_entry(wb->b_dirty.next, struct inode, i_list); 180 tail = wb_inode(wb->b_dirty.next);
158 if (time_before(inode->dirtied_when, tail->dirtied_when)) 181 if (time_before(inode->dirtied_when, tail->dirtied_when))
159 inode->dirtied_when = jiffies; 182 inode->dirtied_when = jiffies;
160 } 183 }
161 list_move(&inode->i_list, &wb->b_dirty); 184 list_move(&inode->i_wb_list, &wb->b_dirty);
162} 185}
163 186
164/* 187/*
@@ -168,7 +191,7 @@ static void requeue_io(struct inode *inode)
168{ 191{
169 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 192 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
170 193
171 list_move(&inode->i_list, &wb->b_more_io); 194 list_move(&inode->i_wb_list, &wb->b_more_io);
172} 195}
173 196
174static void inode_sync_complete(struct inode *inode) 197static void inode_sync_complete(struct inode *inode)
@@ -209,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
209 int do_sb_sort = 0; 232 int do_sb_sort = 0;
210 233
211 while (!list_empty(delaying_queue)) { 234 while (!list_empty(delaying_queue)) {
212 inode = list_entry(delaying_queue->prev, struct inode, i_list); 235 inode = wb_inode(delaying_queue->prev);
213 if (older_than_this && 236 if (older_than_this &&
214 inode_dirtied_after(inode, *older_than_this)) 237 inode_dirtied_after(inode, *older_than_this))
215 break; 238 break;
216 if (sb && sb != inode->i_sb) 239 if (sb && sb != inode->i_sb)
217 do_sb_sort = 1; 240 do_sb_sort = 1;
218 sb = inode->i_sb; 241 sb = inode->i_sb;
219 list_move(&inode->i_list, &tmp); 242 list_move(&inode->i_wb_list, &tmp);
220 } 243 }
221 244
222 /* just one sb in list, splice to dispatch_queue and we're done */ 245 /* just one sb in list, splice to dispatch_queue and we're done */
@@ -227,22 +250,29 @@ static void move_expired_inodes(struct list_head *delaying_queue,
227 250
228 /* Move inodes from one superblock together */ 251 /* Move inodes from one superblock together */
229 while (!list_empty(&tmp)) { 252 while (!list_empty(&tmp)) {
230 inode = list_entry(tmp.prev, struct inode, i_list); 253 sb = wb_inode(tmp.prev)->i_sb;
231 sb = inode->i_sb;
232 list_for_each_prev_safe(pos, node, &tmp) { 254 list_for_each_prev_safe(pos, node, &tmp) {
233 inode = list_entry(pos, struct inode, i_list); 255 inode = wb_inode(pos);
234 if (inode->i_sb == sb) 256 if (inode->i_sb == sb)
235 list_move(&inode->i_list, dispatch_queue); 257 list_move(&inode->i_wb_list, dispatch_queue);
236 } 258 }
237 } 259 }
238} 260}
239 261
240/* 262/*
241 * Queue all expired dirty inodes for io, eldest first. 263 * Queue all expired dirty inodes for io, eldest first.
 264 * Before
 265 *         newly dirtied     b_dirty    b_io    b_more_io
 266 *         =============>    gf         edc     BA
 267 * After
 268 *         newly dirtied     b_dirty    b_io    b_more_io
 269 *         =============>    g          fBAedc
 270 *                                           |
 271 *                                           +--> dequeue for IO
242 */ 272 */
243static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 273static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
244{ 274{
245 list_splice_init(&wb->b_more_io, wb->b_io.prev); 275 list_splice_init(&wb->b_more_io, &wb->b_io);
246 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 276 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
247} 277}
248 278
@@ -352,73 +382,43 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
352 382
353 spin_lock(&inode_lock); 383 spin_lock(&inode_lock);
354 inode->i_state &= ~I_SYNC; 384 inode->i_state &= ~I_SYNC;
355 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { 385 if (!(inode->i_state & I_FREEING)) {
356 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { 386 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
357 /*
358 * More pages get dirtied by a fast dirtier.
359 */
360 goto select_queue;
361 } else if (inode->i_state & I_DIRTY) {
362 /*
363 * At least XFS will redirty the inode during the
364 * writeback (delalloc) and on io completion (isize).
365 */
366 redirty_tail(inode);
367 } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
368 /* 387 /*
369 * We didn't write back all the pages. nfs_writepages() 388 * We didn't write back all the pages. nfs_writepages()
370 * sometimes bales out without doing anything. Redirty 389 * sometimes bales out without doing anything.
371 * the inode; Move it from b_io onto b_more_io/b_dirty.
372 */ 390 */
373 /* 391 inode->i_state |= I_DIRTY_PAGES;
374 * akpm: if the caller was the kupdate function we put 392 if (wbc->nr_to_write <= 0) {
375 * this inode at the head of b_dirty so it gets first
376 * consideration. Otherwise, move it to the tail, for
377 * the reasons described there. I'm not really sure
378 * how much sense this makes. Presumably I had a good
379 * reasons for doing it this way, and I'd rather not
380 * muck with it at present.
381 */
382 if (wbc->for_kupdate) {
383 /* 393 /*
384 * For the kupdate function we move the inode 394 * slice used up: queue for next turn
385 * to b_more_io so it will get more writeout as
386 * soon as the queue becomes uncongested.
387 */ 395 */
388 inode->i_state |= I_DIRTY_PAGES; 396 requeue_io(inode);
389select_queue:
390 if (wbc->nr_to_write <= 0) {
391 /*
392 * slice used up: queue for next turn
393 */
394 requeue_io(inode);
395 } else {
396 /*
397 * somehow blocked: retry later
398 */
399 redirty_tail(inode);
400 }
401 } else { 397 } else {
402 /* 398 /*
403 * Otherwise fully redirty the inode so that 399 * Writeback blocked by something other than
404 * other inodes on this superblock will get some 400 * congestion. Delay the inode for some time to
405 * writeout. Otherwise heavy writing to one 401 * avoid spinning on the CPU (100% iowait)
406 * file would indefinitely suspend writeout of 402 * retrying writeback of the dirty page/inode
407 * all the other files. 403 * that cannot be performed immediately.
408 */ 404 */
409 inode->i_state |= I_DIRTY_PAGES;
410 redirty_tail(inode); 405 redirty_tail(inode);
411 } 406 }
412 } else if (atomic_read(&inode->i_count)) { 407 } else if (inode->i_state & I_DIRTY) {
413 /* 408 /*
414 * The inode is clean, inuse 409 * Filesystems can dirty the inode during writeback
410 * operations, such as delayed allocation during
411 * submission or metadata updates after data IO
412 * completion.
415 */ 413 */
416 list_move(&inode->i_list, &inode_in_use); 414 redirty_tail(inode);
417 } else { 415 } else {
418 /* 416 /*
419 * The inode is clean, unused 417 * The inode is clean. At this point we either have
 418 * a reference to the inode or it's on its way out.
419 * No need to add it back to the LRU.
420 */ 420 */
421 list_move(&inode->i_list, &inode_unused); 421 list_del_init(&inode->i_wb_list);
422 } 422 }
423 } 423 }
424 inode_sync_complete(inode); 424 inode_sync_complete(inode);
@@ -466,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
466{ 466{
467 while (!list_empty(&wb->b_io)) { 467 while (!list_empty(&wb->b_io)) {
468 long pages_skipped; 468 long pages_skipped;
469 struct inode *inode = list_entry(wb->b_io.prev, 469 struct inode *inode = wb_inode(wb->b_io.prev);
470 struct inode, i_list);
471 470
472 if (inode->i_sb != sb) { 471 if (inode->i_sb != sb) {
473 if (only_this_sb) { 472 if (only_this_sb) {
@@ -488,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
488 return 0; 487 return 0;
489 } 488 }
490 489
491 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 490 /*
 491 * Don't bother with new inodes or inodes being freed; the first
 492 * kind does not need periodic writeout yet, and for the latter
493 * kind writeout is handled by the freer.
494 */
495 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
492 requeue_io(inode); 496 requeue_io(inode);
493 continue; 497 continue;
494 } 498 }
499
495 /* 500 /*
496 * Was this inode dirtied after sync_sb_inodes was called? 501 * Was this inode dirtied after sync_sb_inodes was called?
497 * This keeps sync from extra jobs and livelock. 502 * This keeps sync from extra jobs and livelock.
@@ -499,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
499 if (inode_dirtied_after(inode, wbc->wb_start)) 504 if (inode_dirtied_after(inode, wbc->wb_start))
500 return 1; 505 return 1;
501 506
502 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
503 __iget(inode); 507 __iget(inode);
504 pages_skipped = wbc->pages_skipped; 508 pages_skipped = wbc->pages_skipped;
505 writeback_single_inode(inode, wbc); 509 writeback_single_inode(inode, wbc);
@@ -530,14 +534,14 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
530{ 534{
531 int ret = 0; 535 int ret = 0;
532 536
533 wbc->wb_start = jiffies; /* livelock avoidance */ 537 if (!wbc->wb_start)
538 wbc->wb_start = jiffies; /* livelock avoidance */
534 spin_lock(&inode_lock); 539 spin_lock(&inode_lock);
535 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 540 if (!wbc->for_kupdate || list_empty(&wb->b_io))
536 queue_io(wb, wbc->older_than_this); 541 queue_io(wb, wbc->older_than_this);
537 542
538 while (!list_empty(&wb->b_io)) { 543 while (!list_empty(&wb->b_io)) {
539 struct inode *inode = list_entry(wb->b_io.prev, 544 struct inode *inode = wb_inode(wb->b_io.prev);
540 struct inode, i_list);
541 struct super_block *sb = inode->i_sb; 545 struct super_block *sb = inode->i_sb;
542 546
543 if (!pin_sb_for_writeback(sb)) { 547 if (!pin_sb_for_writeback(sb)) {
@@ -559,7 +563,6 @@ static void __writeback_inodes_sb(struct super_block *sb,
559{ 563{
560 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 564 WARN_ON(!rwsem_is_locked(&sb->s_umount));
561 565
562 wbc->wb_start = jiffies; /* livelock avoidance */
563 spin_lock(&inode_lock); 566 spin_lock(&inode_lock);
564 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 567 if (!wbc->for_kupdate || list_empty(&wb->b_io))
565 queue_io(wb, wbc->older_than_this); 568 queue_io(wb, wbc->older_than_this);
@@ -580,10 +583,10 @@ static inline bool over_bground_thresh(void)
580{ 583{
581 unsigned long background_thresh, dirty_thresh; 584 unsigned long background_thresh, dirty_thresh;
582 585
583 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 586 global_dirty_limits(&background_thresh, &dirty_thresh);
584 587
585 return (global_page_state(NR_FILE_DIRTY) + 588 return (global_page_state(NR_FILE_DIRTY) +
586 global_page_state(NR_UNSTABLE_NFS) >= background_thresh); 589 global_page_state(NR_UNSTABLE_NFS) > background_thresh);
587} 590}
588 591
589/* 592/*
@@ -625,6 +628,7 @@ static long wb_writeback(struct bdi_writeback *wb,
625 wbc.range_end = LLONG_MAX; 628 wbc.range_end = LLONG_MAX;
626 } 629 }
627 630
631 wbc.wb_start = jiffies; /* livelock avoidance */
628 for (;;) { 632 for (;;) {
629 /* 633 /*
630 * Stop writeback when nr_pages has been consumed 634 * Stop writeback when nr_pages has been consumed
@@ -642,10 +646,14 @@ static long wb_writeback(struct bdi_writeback *wb,
642 wbc.more_io = 0; 646 wbc.more_io = 0;
643 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 647 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
644 wbc.pages_skipped = 0; 648 wbc.pages_skipped = 0;
649
650 trace_wbc_writeback_start(&wbc, wb->bdi);
645 if (work->sb) 651 if (work->sb)
646 __writeback_inodes_sb(work->sb, wb, &wbc); 652 __writeback_inodes_sb(work->sb, wb, &wbc);
647 else 653 else
648 writeback_inodes_wb(wb, &wbc); 654 writeback_inodes_wb(wb, &wbc);
655 trace_wbc_writeback_written(&wbc, wb->bdi);
656
649 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 657 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
650 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 658 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
651 659
@@ -671,8 +679,8 @@ static long wb_writeback(struct bdi_writeback *wb,
671 */ 679 */
672 spin_lock(&inode_lock); 680 spin_lock(&inode_lock);
673 if (!list_empty(&wb->b_more_io)) { 681 if (!list_empty(&wb->b_more_io)) {
674 inode = list_entry(wb->b_more_io.prev, 682 inode = wb_inode(wb->b_more_io.prev);
675 struct inode, i_list); 683 trace_wbc_writeback_wait(&wbc, wb->bdi);
676 inode_wait_for_writeback(inode); 684 inode_wait_for_writeback(inode);
677 } 685 }
678 spin_unlock(&inode_lock); 686 spin_unlock(&inode_lock);
@@ -685,20 +693,31 @@ static long wb_writeback(struct bdi_writeback *wb,
685 * Return the next wb_writeback_work struct that hasn't been processed yet. 693 * Return the next wb_writeback_work struct that hasn't been processed yet.
686 */ 694 */
687static struct wb_writeback_work * 695static struct wb_writeback_work *
688get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) 696get_next_work_item(struct backing_dev_info *bdi)
689{ 697{
690 struct wb_writeback_work *work = NULL; 698 struct wb_writeback_work *work = NULL;
691 699
692 spin_lock(&bdi->wb_lock); 700 spin_lock_bh(&bdi->wb_lock);
693 if (!list_empty(&bdi->work_list)) { 701 if (!list_empty(&bdi->work_list)) {
694 work = list_entry(bdi->work_list.next, 702 work = list_entry(bdi->work_list.next,
695 struct wb_writeback_work, list); 703 struct wb_writeback_work, list);
696 list_del_init(&work->list); 704 list_del_init(&work->list);
697 } 705 }
698 spin_unlock(&bdi->wb_lock); 706 spin_unlock_bh(&bdi->wb_lock);
699 return work; 707 return work;
700} 708}
701 709
710/*
711 * Add in the number of potentially dirty inodes, because each inode
712 * write can dirty pagecache in the underlying blockdev.
713 */
714static unsigned long get_nr_dirty_pages(void)
715{
716 return global_page_state(NR_FILE_DIRTY) +
717 global_page_state(NR_UNSTABLE_NFS) +
718 get_nr_dirty_inodes();
719}
720
702static long wb_check_old_data_flush(struct bdi_writeback *wb) 721static long wb_check_old_data_flush(struct bdi_writeback *wb)
703{ 722{
704 unsigned long expired; 723 unsigned long expired;
@@ -716,9 +735,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
716 return 0; 735 return 0;
717 736
718 wb->last_old_flush = jiffies; 737 wb->last_old_flush = jiffies;
719 nr_pages = global_page_state(NR_FILE_DIRTY) + 738 nr_pages = get_nr_dirty_pages();
720 global_page_state(NR_UNSTABLE_NFS) +
721 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
722 739
723 if (nr_pages) { 740 if (nr_pages) {
724 struct wb_writeback_work work = { 741 struct wb_writeback_work work = {
@@ -743,7 +760,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
743 struct wb_writeback_work *work; 760 struct wb_writeback_work *work;
744 long wrote = 0; 761 long wrote = 0;
745 762
746 while ((work = get_next_work_item(bdi, wb)) != NULL) { 763 set_bit(BDI_writeback_running, &wb->bdi->state);
764 while ((work = get_next_work_item(bdi)) != NULL) {
747 /* 765 /*
748 * Override sync mode, in case we must wait for completion 766 * Override sync mode, in case we must wait for completion
749 * because this thread is exiting now. 767 * because this thread is exiting now.
@@ -751,6 +769,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
751 if (force_wait) 769 if (force_wait)
752 work->sync_mode = WB_SYNC_ALL; 770 work->sync_mode = WB_SYNC_ALL;
753 771
772 trace_writeback_exec(bdi, work);
773
754 wrote += wb_writeback(wb, work); 774 wrote += wb_writeback(wb, work);
755 775
756 /* 776 /*
@@ -767,6 +787,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
767 * Check for periodic writeback, kupdated() style 787 * Check for periodic writeback, kupdated() style
768 */ 788 */
769 wrote += wb_check_old_data_flush(wb); 789 wrote += wb_check_old_data_flush(wb);
790 clear_bit(BDI_writeback_running, &wb->bdi->state);
770 791
771 return wrote; 792 return wrote;
772} 793}
@@ -775,47 +796,66 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
775 * Handle writeback of dirty data for the device backed by this bdi. Also 796 * Handle writeback of dirty data for the device backed by this bdi. Also
776 * wakes up periodically and does kupdated style flushing. 797 * wakes up periodically and does kupdated style flushing.
777 */ 798 */
778int bdi_writeback_task(struct bdi_writeback *wb) 799int bdi_writeback_thread(void *data)
779{ 800{
780 unsigned long last_active = jiffies; 801 struct bdi_writeback *wb = data;
781 unsigned long wait_jiffies = -1UL; 802 struct backing_dev_info *bdi = wb->bdi;
782 long pages_written; 803 long pages_written;
783 804
805 current->flags |= PF_SWAPWRITE;
806 set_freezable();
807 wb->last_active = jiffies;
808
809 /*
810 * Our parent may run at a different priority, just set us to normal
811 */
812 set_user_nice(current, 0);
813
814 trace_writeback_thread_start(bdi);
815
784 while (!kthread_should_stop()) { 816 while (!kthread_should_stop()) {
817 /*
818 * Remove own delayed wake-up timer, since we are already awake
 819 * and we'll take care of the periodic write-back.
820 */
821 del_timer(&wb->wakeup_timer);
822
785 pages_written = wb_do_writeback(wb, 0); 823 pages_written = wb_do_writeback(wb, 0);
786 824
825 trace_writeback_pages_written(pages_written);
826
787 if (pages_written) 827 if (pages_written)
788 last_active = jiffies; 828 wb->last_active = jiffies;
789 else if (wait_jiffies != -1UL) {
790 unsigned long max_idle;
791 829
792 /* 830 set_current_state(TASK_INTERRUPTIBLE);
793 * Longest period of inactivity that we tolerate. If we 831 if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
794 * see dirty data again later, the task will get 832 __set_current_state(TASK_RUNNING);
795 * recreated automatically. 833 continue;
796 */
797 max_idle = max(5UL * 60 * HZ, wait_jiffies);
798 if (time_after(jiffies, max_idle + last_active))
799 break;
800 } 834 }
801 835
802 if (dirty_writeback_interval) { 836 if (wb_has_dirty_io(wb) && dirty_writeback_interval)
803 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); 837 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
804 schedule_timeout_interruptible(wait_jiffies); 838 else {
805 } else { 839 /*
806 set_current_state(TASK_INTERRUPTIBLE); 840 * We have nothing to do, so can go sleep without any
807 if (list_empty_careful(&wb->bdi->work_list) && 841 * timeout and save power. When a work is queued or
808 !kthread_should_stop()) 842 * something is made dirty - we will be woken up.
809 schedule(); 843 */
810 __set_current_state(TASK_RUNNING); 844 schedule();
811 } 845 }
812 846
813 try_to_freeze(); 847 try_to_freeze();
814 } 848 }
815 849
850 /* Flush any work that raced with us exiting */
851 if (!list_empty(&bdi->work_list))
852 wb_do_writeback(wb, 1);
853
854 trace_writeback_thread_stop(bdi);
816 return 0; 855 return 0;
817} 856}
818 857
858
819/* 859/*
820 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 860 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
821 * the whole world. 861 * the whole world.
@@ -890,6 +930,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
890void __mark_inode_dirty(struct inode *inode, int flags) 930void __mark_inode_dirty(struct inode *inode, int flags)
891{ 931{
892 struct super_block *sb = inode->i_sb; 932 struct super_block *sb = inode->i_sb;
933 struct backing_dev_info *bdi = NULL;
934 bool wakeup_bdi = false;
893 935
894 /* 936 /*
895 * Don't do this for I_DIRTY_PAGES - that doesn't actually 937 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -932,10 +974,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
932 * dirty list. Add blockdev inodes as well. 974 * dirty list. Add blockdev inodes as well.
933 */ 975 */
934 if (!S_ISBLK(inode->i_mode)) { 976 if (!S_ISBLK(inode->i_mode)) {
935 if (hlist_unhashed(&inode->i_hash)) 977 if (inode_unhashed(inode))
936 goto out; 978 goto out;
937 } 979 }
938 if (inode->i_state & (I_FREEING|I_CLEAR)) 980 if (inode->i_state & I_FREEING)
939 goto out; 981 goto out;
940 982
941 /* 983 /*
@@ -943,22 +985,31 @@ void __mark_inode_dirty(struct inode *inode, int flags)
943 * reposition it (that would break b_dirty time-ordering). 985 * reposition it (that would break b_dirty time-ordering).
944 */ 986 */
945 if (!was_dirty) { 987 if (!was_dirty) {
946 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 988 bdi = inode_to_bdi(inode);
947 struct backing_dev_info *bdi = wb->bdi; 989
948 990 if (bdi_cap_writeback_dirty(bdi)) {
949 if (bdi_cap_writeback_dirty(bdi) && 991 WARN(!test_bit(BDI_registered, &bdi->state),
950 !test_bit(BDI_registered, &bdi->state)) { 992 "bdi-%s not registered\n", bdi->name);
951 WARN_ON(1); 993
952 printk(KERN_ERR "bdi-%s not registered\n", 994 /*
953 bdi->name); 995 * If this is the first dirty inode for this
996 * bdi, we have to wake-up the corresponding
997 * bdi thread to make sure background
998 * write-back happens later.
999 */
1000 if (!wb_has_dirty_io(&bdi->wb))
1001 wakeup_bdi = true;
954 } 1002 }
955 1003
956 inode->dirtied_when = jiffies; 1004 inode->dirtied_when = jiffies;
957 list_move(&inode->i_list, &wb->b_dirty); 1005 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
958 } 1006 }
959 } 1007 }
960out: 1008out:
961 spin_unlock(&inode_lock); 1009 spin_unlock(&inode_lock);
1010
1011 if (wakeup_bdi)
1012 bdi_wakeup_thread_delayed(bdi);
962} 1013}
963EXPORT_SYMBOL(__mark_inode_dirty); 1014EXPORT_SYMBOL(__mark_inode_dirty);
964 1015
@@ -1001,7 +1052,7 @@ static void wait_sb_inodes(struct super_block *sb)
1001 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1052 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1002 struct address_space *mapping; 1053 struct address_space *mapping;
1003 1054
1004 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 1055 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
1005 continue; 1056 continue;
1006 mapping = inode->i_mapping; 1057 mapping = inode->i_mapping;
1007 if (mapping->nrpages == 0) 1058 if (mapping->nrpages == 0)
@@ -1030,33 +1081,42 @@ static void wait_sb_inodes(struct super_block *sb)
1030} 1081}
1031 1082
1032/** 1083/**
1033 * writeback_inodes_sb - writeback dirty inodes from given super_block 1084 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
1034 * @sb: the superblock 1085 * @sb: the superblock
1086 * @nr: the number of pages to write
1035 * 1087 *
1036 * Start writeback on some inodes on this super_block. No guarantees are made 1088 * Start writeback on some inodes on this super_block. No guarantees are made
1037 * on how many (if any) will be written, and this function does not wait 1089 * on how many (if any) will be written, and this function does not wait
1038 * for IO completion of submitted IO. The number of pages submitted is 1090 * for IO completion of submitted IO.
1039 * returned.
1040 */ 1091 */
1041void writeback_inodes_sb(struct super_block *sb) 1092void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1042{ 1093{
1043 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1044 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1045 DECLARE_COMPLETION_ONSTACK(done); 1094 DECLARE_COMPLETION_ONSTACK(done);
1046 struct wb_writeback_work work = { 1095 struct wb_writeback_work work = {
1047 .sb = sb, 1096 .sb = sb,
1048 .sync_mode = WB_SYNC_NONE, 1097 .sync_mode = WB_SYNC_NONE,
1049 .done = &done, 1098 .done = &done,
1099 .nr_pages = nr,
1050 }; 1100 };
1051 1101
1052 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1102 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1053
1054 work.nr_pages = nr_dirty + nr_unstable +
1055 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1056
1057 bdi_queue_work(sb->s_bdi, &work); 1103 bdi_queue_work(sb->s_bdi, &work);
1058 wait_for_completion(&done); 1104 wait_for_completion(&done);
1059} 1105}
1106EXPORT_SYMBOL(writeback_inodes_sb_nr);
1107
1108/**
1109 * writeback_inodes_sb - writeback dirty inodes from given super_block
1110 * @sb: the superblock
1111 *
1112 * Start writeback on some inodes on this super_block. No guarantees are made
1113 * on how many (if any) will be written, and this function does not wait
1114 * for IO completion of submitted IO.
1115 */
1116void writeback_inodes_sb(struct super_block *sb)
1117{
1118 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
1119}
1060EXPORT_SYMBOL(writeback_inodes_sb); 1120EXPORT_SYMBOL(writeback_inodes_sb);
1061 1121
1062/** 1122/**
@@ -1079,6 +1139,27 @@ int writeback_inodes_sb_if_idle(struct super_block *sb)
1079EXPORT_SYMBOL(writeback_inodes_sb_if_idle); 1139EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1080 1140
1081/** 1141/**
 1142 * writeback_inodes_sb_nr_if_idle - start writeback if none underway
 1143 * @sb: the superblock
 1144 * @nr: the number of pages to write
 1145 *
 1146 * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
1147 * Returns 1 if writeback was started, 0 if not.
1148 */
1149int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1150 unsigned long nr)
1151{
1152 if (!writeback_in_progress(sb->s_bdi)) {
1153 down_read(&sb->s_umount);
1154 writeback_inodes_sb_nr(sb, nr);
1155 up_read(&sb->s_umount);
1156 return 1;
1157 } else
1158 return 0;
1159}
1160EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1161
1162/**
1082 * sync_inodes_sb - sync sb inode pages 1163 * sync_inodes_sb - sync sb inode pages
1083 * @sb: the superblock 1164 * @sb: the superblock
1084 * 1165 *
@@ -1159,3 +1240,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1159 return ret; 1240 return ret;
1160} 1241}
1161EXPORT_SYMBOL(sync_inode); 1242EXPORT_SYMBOL(sync_inode);
1243
1244/**
 1245 * sync_inode_metadata - write an inode to disk
1246 * @inode: the inode to sync
1247 * @wait: wait for I/O to complete.
1248 *
 1249 * Write an inode to disk and adjust its dirty state after completion.
1250 *
1251 * Note: only writes the actual inode, no associated data or other metadata.
1252 */
1253int sync_inode_metadata(struct inode *inode, int wait)
1254{
1255 struct writeback_control wbc = {
1256 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
1257 .nr_to_write = 0, /* metadata-only */
1258 };
1259
1260 return sync_inode(inode, &wbc);
1261}
1262EXPORT_SYMBOL(sync_inode_metadata);
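
The fs-writeback.c changes above export two new entry points. A hedged sketch of how a filesystem might call them; the examplefs_* wrappers and the page count are illustrative, while sync_inode_metadata() and writeback_inodes_sb_nr_if_idle() are the symbols exported by this patch:

#include <linux/fs.h>
#include <linux/writeback.h>

/* write just the inode itself, waiting for the I/O (wait == 1: WB_SYNC_ALL) */
static int examplefs_flush_inode(struct inode *inode)
{
	return sync_inode_metadata(inode, 1);
}

/* opportunistically push ~1024 pages if no writeback is already running */
static void examplefs_nudge_writeback(struct super_block *sb)
{
	writeback_inodes_sb_nr_if_idle(sb, 1024);
}
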
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index eee059052db..ed45a9cf5f3 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
13{ 13{
14 struct path old_root; 14 struct path old_root;
15 15
16 write_lock(&fs->lock); 16 spin_lock(&fs->lock);
17 old_root = fs->root; 17 old_root = fs->root;
18 fs->root = *path; 18 fs->root = *path;
19 path_get(path); 19 path_get(path);
20 write_unlock(&fs->lock); 20 spin_unlock(&fs->lock);
21 if (old_root.dentry) 21 if (old_root.dentry)
22 path_put(&old_root); 22 path_put(&old_root);
23} 23}
@@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
30{ 30{
31 struct path old_pwd; 31 struct path old_pwd;
32 32
33 write_lock(&fs->lock); 33 spin_lock(&fs->lock);
34 old_pwd = fs->pwd; 34 old_pwd = fs->pwd;
35 fs->pwd = *path; 35 fs->pwd = *path;
36 path_get(path); 36 path_get(path);
37 write_unlock(&fs->lock); 37 spin_unlock(&fs->lock);
38 38
39 if (old_pwd.dentry) 39 if (old_pwd.dentry)
40 path_put(&old_pwd); 40 path_put(&old_pwd);
@@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
51 task_lock(p); 51 task_lock(p);
52 fs = p->fs; 52 fs = p->fs;
53 if (fs) { 53 if (fs) {
54 write_lock(&fs->lock); 54 spin_lock(&fs->lock);
55 if (fs->root.dentry == old_root->dentry 55 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) { 56 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root); 57 path_get(new_root);
@@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
64 fs->pwd = *new_root; 64 fs->pwd = *new_root;
65 count++; 65 count++;
66 } 66 }
67 write_unlock(&fs->lock); 67 spin_unlock(&fs->lock);
68 } 68 }
69 task_unlock(p); 69 task_unlock(p);
70 } while_each_thread(g, p); 70 } while_each_thread(g, p);
@@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk)
87 if (fs) { 87 if (fs) {
88 int kill; 88 int kill;
89 task_lock(tsk); 89 task_lock(tsk);
90 write_lock(&fs->lock); 90 spin_lock(&fs->lock);
91 tsk->fs = NULL; 91 tsk->fs = NULL;
92 kill = !--fs->users; 92 kill = !--fs->users;
93 write_unlock(&fs->lock); 93 spin_unlock(&fs->lock);
94 task_unlock(tsk); 94 task_unlock(tsk);
95 if (kill) 95 if (kill)
96 free_fs_struct(fs); 96 free_fs_struct(fs);
@@ -104,14 +104,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
104 if (fs) { 104 if (fs) {
105 fs->users = 1; 105 fs->users = 1;
106 fs->in_exec = 0; 106 fs->in_exec = 0;
107 rwlock_init(&fs->lock); 107 spin_lock_init(&fs->lock);
108 fs->umask = old->umask; 108 fs->umask = old->umask;
109 read_lock(&old->lock); 109 get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
110 fs->root = old->root;
111 path_get(&old->root);
112 fs->pwd = old->pwd;
113 path_get(&old->pwd);
114 read_unlock(&old->lock);
115 } 110 }
116 return fs; 111 return fs;
117} 112}
@@ -126,10 +121,10 @@ int unshare_fs_struct(void)
126 return -ENOMEM; 121 return -ENOMEM;
127 122
128 task_lock(current); 123 task_lock(current);
129 write_lock(&fs->lock); 124 spin_lock(&fs->lock);
130 kill = !--fs->users; 125 kill = !--fs->users;
131 current->fs = new_fs; 126 current->fs = new_fs;
132 write_unlock(&fs->lock); 127 spin_unlock(&fs->lock);
133 task_unlock(current); 128 task_unlock(current);
134 129
135 if (kill) 130 if (kill)
@@ -148,7 +143,7 @@ EXPORT_SYMBOL(current_umask);
148/* to be mentioned only in INIT_TASK */ 143/* to be mentioned only in INIT_TASK */
149struct fs_struct init_fs = { 144struct fs_struct init_fs = {
150 .users = 1, 145 .users = 1,
151 .lock = __RW_LOCK_UNLOCKED(init_fs.lock), 146 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
152 .umask = 0022, 147 .umask = 0022,
153}; 148};
154 149
@@ -161,14 +156,14 @@ void daemonize_fs_struct(void)
161 156
162 task_lock(current); 157 task_lock(current);
163 158
164 write_lock(&init_fs.lock); 159 spin_lock(&init_fs.lock);
165 init_fs.users++; 160 init_fs.users++;
166 write_unlock(&init_fs.lock); 161 spin_unlock(&init_fs.lock);
167 162
168 write_lock(&fs->lock); 163 spin_lock(&fs->lock);
169 current->fs = &init_fs; 164 current->fs = &init_fs;
170 kill = !--fs->users; 165 kill = !--fs->users;
171 write_unlock(&fs->lock); 166 spin_unlock(&fs->lock);
172 167
173 task_unlock(current); 168 task_unlock(current);
174 if (kill) 169 if (kill)
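
The fs_struct.c conversion replaces the rwlock with a plain spinlock, so readers now copy what they need inside a short critical section and take their own path reference. A sketch of that reader-side pattern, with example_get_root() as an illustrative name (the kernel's own helper for this is assumed to be get_fs_root()):

#include <linux/fs_struct.h>
#include <linux/path.h>
#include <linux/spinlock.h>

static void example_get_root(struct fs_struct *fs, struct path *root)
{
	spin_lock(&fs->lock);
	*root = fs->root;
	path_get(root);			/* pin the copy before unlocking */
	spin_unlock(&fs->lock);
}
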
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index cc94bb9563f..3f6dfa98988 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
1 1
2config FSCACHE 2config FSCACHE
3 tristate "General filesystem local caching manager" 3 tristate "General filesystem local caching manager"
4 select SLOW_WORK
5 help 4 help
6 This option enables a generic filesystem caching manager that can be 5 This option enables a generic filesystem caching manager that can be
7 used by various network and other filesystems to cache data locally. 6 used by various network and other filesystems to cache data locally.
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index edd7434ab6e..f6aad48d38a 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -82,6 +82,14 @@ extern unsigned fscache_defer_lookup;
82extern unsigned fscache_defer_create; 82extern unsigned fscache_defer_create;
83extern unsigned fscache_debug; 83extern unsigned fscache_debug;
84extern struct kobject *fscache_root; 84extern struct kobject *fscache_root;
85extern struct workqueue_struct *fscache_object_wq;
86extern struct workqueue_struct *fscache_op_wq;
87DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
88
89static inline bool fscache_object_congested(void)
90{
91 return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
92}
85 93
86extern int fscache_wait_bit(void *); 94extern int fscache_wait_bit(void *);
87extern int fscache_wait_bit_interruptible(void *); 95extern int fscache_wait_bit_interruptible(void *);
@@ -313,17 +321,11 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
313#define dbgprintk(FMT, ...) \ 321#define dbgprintk(FMT, ...) \
314 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) 322 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
315 323
316/* make sure we maintain the format strings, even when debugging is disabled */
317static inline __attribute__((format(printf, 1, 2)))
318void _dbprintk(const char *fmt, ...)
319{
320}
321
322#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 324#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
323#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 325#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
324#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) 326#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
325 327
326#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) 328#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
327 329
328#ifdef __KDEBUG 330#ifdef __KDEBUG
329#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) 331#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
@@ -350,9 +352,9 @@ do { \
350} while (0) 352} while (0)
351 353
352#else 354#else
353#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 355#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
354#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 356#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
355#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) 357#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
356#endif 358#endif
357 359
358/* 360/*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index add6bdb53f0..f9d856773f7 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/completion.h> 16#include <linux/completion.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/seq_file.h>
18#include "internal.h" 19#include "internal.h"
19 20
20MODULE_DESCRIPTION("FS Cache Manager"); 21MODULE_DESCRIPTION("FS Cache Manager");
@@ -40,22 +41,105 @@ MODULE_PARM_DESC(fscache_debug,
40 "FS-Cache debugging mask"); 41 "FS-Cache debugging mask");
41 42
42struct kobject *fscache_root; 43struct kobject *fscache_root;
44struct workqueue_struct *fscache_object_wq;
45struct workqueue_struct *fscache_op_wq;
46
47DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
48
49/* these values serve as lower bounds, will be adjusted in fscache_init() */
50static unsigned fscache_object_max_active = 4;
51static unsigned fscache_op_max_active = 2;
52
53#ifdef CONFIG_SYSCTL
54static struct ctl_table_header *fscache_sysctl_header;
55
56static int fscache_max_active_sysctl(struct ctl_table *table, int write,
57 void __user *buffer,
58 size_t *lenp, loff_t *ppos)
59{
60 struct workqueue_struct **wqp = table->extra1;
61 unsigned int *datap = table->data;
62 int ret;
63
64 ret = proc_dointvec(table, write, buffer, lenp, ppos);
65 if (ret == 0)
66 workqueue_set_max_active(*wqp, *datap);
67 return ret;
68}
69
70ctl_table fscache_sysctls[] = {
71 {
72 .procname = "object_max_active",
73 .data = &fscache_object_max_active,
74 .maxlen = sizeof(unsigned),
75 .mode = 0644,
76 .proc_handler = fscache_max_active_sysctl,
77 .extra1 = &fscache_object_wq,
78 },
79 {
80 .procname = "operation_max_active",
81 .data = &fscache_op_max_active,
82 .maxlen = sizeof(unsigned),
83 .mode = 0644,
84 .proc_handler = fscache_max_active_sysctl,
85 .extra1 = &fscache_op_wq,
86 },
87 {}
88};
89
90ctl_table fscache_sysctls_root[] = {
91 {
92 .procname = "fscache",
93 .mode = 0555,
94 .child = fscache_sysctls,
95 },
96 {}
97};
98#endif
43 99
44/* 100/*
45 * initialise the fs caching module 101 * initialise the fs caching module
46 */ 102 */
47static int __init fscache_init(void) 103static int __init fscache_init(void)
48{ 104{
105 unsigned int nr_cpus = num_possible_cpus();
106 unsigned int cpu;
49 int ret; 107 int ret;
50 108
51 ret = slow_work_register_user(THIS_MODULE); 109 fscache_object_max_active =
52 if (ret < 0) 110 clamp_val(nr_cpus,
53 goto error_slow_work; 111 fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
112
113 ret = -ENOMEM;
114 fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND,
115 fscache_object_max_active);
116 if (!fscache_object_wq)
117 goto error_object_wq;
118
119 fscache_op_max_active =
120 clamp_val(fscache_object_max_active / 2,
121 fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE);
122
123 ret = -ENOMEM;
124 fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND,
125 fscache_op_max_active);
126 if (!fscache_op_wq)
127 goto error_op_wq;
128
129 for_each_possible_cpu(cpu)
130 init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
54 131
55 ret = fscache_proc_init(); 132 ret = fscache_proc_init();
56 if (ret < 0) 133 if (ret < 0)
57 goto error_proc; 134 goto error_proc;
58 135
136#ifdef CONFIG_SYSCTL
137 ret = -ENOMEM;
138 fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root);
139 if (!fscache_sysctl_header)
140 goto error_sysctl;
141#endif
142
59 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", 143 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
60 sizeof(struct fscache_cookie), 144 sizeof(struct fscache_cookie),
61 0, 145 0,
@@ -78,10 +162,16 @@ static int __init fscache_init(void)
78error_kobj: 162error_kobj:
79 kmem_cache_destroy(fscache_cookie_jar); 163 kmem_cache_destroy(fscache_cookie_jar);
80error_cookie_jar: 164error_cookie_jar:
165#ifdef CONFIG_SYSCTL
166 unregister_sysctl_table(fscache_sysctl_header);
167error_sysctl:
168#endif
81 fscache_proc_cleanup(); 169 fscache_proc_cleanup();
82error_proc: 170error_proc:
83 slow_work_unregister_user(THIS_MODULE); 171 destroy_workqueue(fscache_op_wq);
84error_slow_work: 172error_op_wq:
173 destroy_workqueue(fscache_object_wq);
174error_object_wq:
85 return ret; 175 return ret;
86} 176}
87 177
@@ -96,8 +186,12 @@ static void __exit fscache_exit(void)
96 186
97 kobject_put(fscache_root); 187 kobject_put(fscache_root);
98 kmem_cache_destroy(fscache_cookie_jar); 188 kmem_cache_destroy(fscache_cookie_jar);
189#ifdef CONFIG_SYSCTL
190 unregister_sysctl_table(fscache_sysctl_header);
191#endif
99 fscache_proc_cleanup(); 192 fscache_proc_cleanup();
100 slow_work_unregister_user(THIS_MODULE); 193 destroy_workqueue(fscache_op_wq);
194 destroy_workqueue(fscache_object_wq);
101 printk(KERN_NOTICE "FS-Cache: Unloaded\n"); 195 printk(KERN_NOTICE "FS-Cache: Unloaded\n");
102} 196}
103 197
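
The fscache/main.c changes replace slow-work with dedicated unbound workqueues whose concurrency is exposed through sysctl. A hedged sketch of the underlying pattern; the example_* names are illustrative, while alloc_workqueue(), clamp_val() and workqueue_set_max_active() are the APIs the patch actually uses:

#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/errno.h>

static struct workqueue_struct *example_wq;

static int __init example_wq_init(void)
{
	/* scale concurrency with CPU count, bounded as in fscache_init() */
	unsigned int max = clamp_val(num_possible_cpus(), 4,
				     WQ_UNBOUND_MAX_ACTIVE);

	example_wq = alloc_workqueue("example", WQ_UNBOUND, max);
	return example_wq ? 0 : -ENOMEM;
}

/* what the sysctl handler above does after proc_dointvec() succeeds */
static void example_wq_retune(unsigned int max_active)
{
	workqueue_set_max_active(example_wq, max_active);
}
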
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 4a8eb31c533..ebe29c58138 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -34,8 +34,8 @@ struct fscache_objlist_data {
34#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ 34#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */
35#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ 35#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */
36#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ 36#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */
37#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */ 37#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */
38#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */ 38#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */
39 39
40 u8 buf[512]; /* key and aux data buffer */ 40 u8 buf[512]; /* key and aux data buffer */
41}; 41};
@@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
231 READS, NOREADS); 231 READS, NOREADS);
232 FILTER(obj->events & obj->event_mask, 232 FILTER(obj->events & obj->event_mask,
233 EVENTS, NOEVENTS); 233 EVENTS, NOEVENTS);
234 FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW), 234 FILTER(work_busy(&obj->work), WORK, NOWORK);
235 WORK, NOWORK);
236 } 235 }
237 236
238 seq_printf(m, 237 seq_printf(m,
239 "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ", 238 "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
240 obj->debug_id, 239 obj->debug_id,
241 obj->parent ? obj->parent->debug_id : -1, 240 obj->parent ? obj->parent->debug_id : -1,
242 fscache_object_states_short[obj->state], 241 fscache_object_states_short[obj->state],
@@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
249 obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, 248 obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
250 obj->events, 249 obj->events,
251 obj->flags, 250 obj->flags,
252 obj->work.flags); 251 work_busy(&obj->work));
253 252
254 no_cookie = true; 253 no_cookie = true;
255 keylen = auxlen = 0; 254 keylen = auxlen = 0;
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 0b589a9b4ff..b6b897c550a 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,7 +14,6 @@
14 14
15#define FSCACHE_DEBUG_LEVEL COOKIE 15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/seq_file.h>
18#include "internal.h" 17#include "internal.h"
19 18
20const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { 19const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
50 [FSCACHE_OBJECT_DEAD] = "DEAD", 49 [FSCACHE_OBJECT_DEAD] = "DEAD",
51}; 50};
52 51
53static void fscache_object_slow_work_put_ref(struct slow_work *); 52static int fscache_get_object(struct fscache_object *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 53static void fscache_put_object(struct fscache_object *);
55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif
59static void fscache_initialise_object(struct fscache_object *); 54static void fscache_initialise_object(struct fscache_object *);
60static void fscache_lookup_object(struct fscache_object *); 55static void fscache_lookup_object(struct fscache_object *);
61static void fscache_object_available(struct fscache_object *); 56static void fscache_object_available(struct fscache_object *);
@@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *);
64static void fscache_enqueue_dependents(struct fscache_object *); 59static void fscache_enqueue_dependents(struct fscache_object *);
65static void fscache_dequeue_object(struct fscache_object *); 60static void fscache_dequeue_object(struct fscache_object *);
66 61
67const struct slow_work_ops fscache_object_slow_work_ops = {
68 .owner = THIS_MODULE,
69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc,
74#endif
75};
76EXPORT_SYMBOL(fscache_object_slow_work_ops);
77
78/* 62/*
79 * we need to notify the parent when an op completes that we had outstanding 63 * we need to notify the parent when an op completes that we had outstanding
80 * upon it 64 * upon it
@@ -345,7 +329,7 @@ unsupported_event:
345/* 329/*
346 * execute an object 330 * execute an object
347 */ 331 */
348static void fscache_object_slow_work_execute(struct slow_work *work) 332void fscache_object_work_func(struct work_struct *work)
349{ 333{
350 struct fscache_object *object = 334 struct fscache_object *object =
351 container_of(work, struct fscache_object, work); 335 container_of(work, struct fscache_object, work);
@@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
359 if (object->events & object->event_mask) 343 if (object->events & object->event_mask)
360 fscache_enqueue_object(object); 344 fscache_enqueue_object(object);
361 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); 345 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
346 fscache_put_object(object);
362} 347}
363 348EXPORT_SYMBOL(fscache_object_work_func);
364/*
365 * describe an object for slow-work debugging
366 */
367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m)
370{
371 struct fscache_object *object =
372 container_of(work, struct fscache_object, work);
373
374 seq_printf(m, "FSC: OBJ%x: %s",
375 object->debug_id,
376 fscache_object_states_short[object->state]);
377}
378#endif
379 349
380/* 350/*
381 * initialise an object 351 * initialise an object
@@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object)
393 _enter(""); 363 _enter("");
394 ASSERT(object->cookie != NULL); 364 ASSERT(object->cookie != NULL);
395 ASSERT(object->cookie->parent != NULL); 365 ASSERT(object->cookie->parent != NULL);
396 ASSERT(list_empty(&object->work.link));
397 366
398 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | 367 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
399 (1 << FSCACHE_OBJECT_EV_RELEASE) | 368 (1 << FSCACHE_OBJECT_EV_RELEASE) |
@@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object)
671 object->parent = NULL; 640 object->parent = NULL;
672 } 641 }
673 642
674 /* this just shifts the object release to the slow work processor */ 643 /* this just shifts the object release to the work processor */
675 fscache_stat(&fscache_n_cop_put_object); 644 fscache_put_object(object);
676 object->cache->ops->put_object(object);
677 fscache_stat_d(&fscache_n_cop_put_object);
678 645
679 _leave(""); 646 _leave("");
680} 647}
@@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache,
758} 725}
759 726
760/* 727/*
761 * allow the slow work item processor to get a ref on an object 728 * get a ref on an object
762 */ 729 */
763static int fscache_object_slow_work_get_ref(struct slow_work *work) 730static int fscache_get_object(struct fscache_object *object)
764{ 731{
765 struct fscache_object *object =
766 container_of(work, struct fscache_object, work);
767 int ret; 732 int ret;
768 733
769 fscache_stat(&fscache_n_cop_grab_object); 734 fscache_stat(&fscache_n_cop_grab_object);
@@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
773} 738}
774 739
775/* 740/*
776 * allow the slow work item processor to discard a ref on a work item 741 * discard a ref on a work item
777 */ 742 */
778static void fscache_object_slow_work_put_ref(struct slow_work *work) 743static void fscache_put_object(struct fscache_object *object)
779{ 744{
780 struct fscache_object *object =
781 container_of(work, struct fscache_object, work);
782
783 fscache_stat(&fscache_n_cop_put_object); 745 fscache_stat(&fscache_n_cop_put_object);
784 object->cache->ops->put_object(object); 746 object->cache->ops->put_object(object);
785 fscache_stat_d(&fscache_n_cop_put_object); 747 fscache_stat_d(&fscache_n_cop_put_object);
@@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object)
792{ 754{
793 _enter("{OBJ%x}", object->debug_id); 755 _enter("{OBJ%x}", object->debug_id);
794 756
795 slow_work_enqueue(&object->work); 757 if (fscache_get_object(object) >= 0) {
758 wait_queue_head_t *cong_wq =
759 &get_cpu_var(fscache_object_cong_wait);
760
761 if (queue_work(fscache_object_wq, &object->work)) {
762 if (fscache_object_congested())
763 wake_up(cong_wq);
764 } else
765 fscache_put_object(object);
766
767 put_cpu_var(fscache_object_cong_wait);
768 }
769}
770
771/**
772 * fscache_object_sleep_till_congested - Sleep until object wq is congested
 773 * @timeoutp: Scheduler sleep timeout
774 *
775 * Allow an object handler to sleep until the object workqueue is congested.
776 *
777 * The caller must set up a wake up event before calling this and must have set
778 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
779 * condition before calling this function as no test is made here.
780 *
781 * %true is returned if the object wq is congested, %false otherwise.
782 */
783bool fscache_object_sleep_till_congested(signed long *timeoutp)
784{
785 wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait);
786 DEFINE_WAIT(wait);
787
788 if (fscache_object_congested())
789 return true;
790
791 add_wait_queue_exclusive(cong_wq, &wait);
792 if (!fscache_object_congested())
793 *timeoutp = schedule_timeout(*timeoutp);
794 finish_wait(cong_wq, &wait);
795
796 return fscache_object_congested();
796} 797}
798EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
797 799
798/* 800/*
799 * enqueue the dependents of an object for metadata-type processing 801 * enqueue the dependents of an object for metadata-type processing
@@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
819 821
820 /* sort onto appropriate lists */ 822 /* sort onto appropriate lists */
821 fscache_enqueue_object(dep); 823 fscache_enqueue_object(dep);
822 fscache_stat(&fscache_n_cop_put_object); 824 fscache_put_object(dep);
823 dep->cache->ops->put_object(dep);
824 fscache_stat_d(&fscache_n_cop_put_object);
825 825
826 if (!list_empty(&object->dependents)) 826 if (!list_empty(&object->dependents))
827 cond_resched_lock(&object->lock); 827 cond_resched_lock(&object->lock);
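
Per the kernel-doc for fscache_object_sleep_till_congested() above, the caller sets the sleep state and tests its own wake condition; the helper itself performs no such test. A hedged usage sketch from a hypothetical cache backend, where example_backend_has_work() stands in for the backend's own condition:

#include <linux/fscache-cache.h>
#include <linux/sched.h>
#include <linux/types.h>

static bool example_backend_has_work(void)
{
	return false;			/* hypothetical wake condition */
}

static void example_wait_till_congested(signed long timeout)
{
	while (timeout > 0 && !example_backend_has_work()) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (fscache_object_sleep_till_congested(&timeout))
			break;		/* object wq congested: back off */
	}
	__set_current_state(TASK_RUNNING);
}
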
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index f17cecafae4..b9f34eaede0 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op)
42 42
43 fscache_stat(&fscache_n_op_enqueue); 43 fscache_stat(&fscache_n_op_enqueue);
44 switch (op->flags & FSCACHE_OP_TYPE) { 44 switch (op->flags & FSCACHE_OP_TYPE) {
45 case FSCACHE_OP_FAST: 45 case FSCACHE_OP_ASYNC:
46 _debug("queue fast"); 46 _debug("queue async");
47 atomic_inc(&op->usage); 47 atomic_inc(&op->usage);
48 if (!schedule_work(&op->fast_work)) 48 if (!queue_work(fscache_op_wq, &op->work))
49 fscache_put_operation(op); 49 fscache_put_operation(op);
50 break; 50 break;
51 case FSCACHE_OP_SLOW:
52 _debug("queue slow");
53 slow_work_enqueue(&op->slow_work);
54 break;
55 case FSCACHE_OP_MYTHREAD: 51 case FSCACHE_OP_MYTHREAD:
56 _debug("queue for caller's attention"); 52 _debug("queue for caller's attention");
57 break; 53 break;
@@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work)
455} 451}
456 452
457/* 453/*
458 * allow the slow work item processor to get a ref on an operation 454 * execute an operation using fs_op_wq to provide processing context -
459 */ 455 * the caller holds a ref to this object, so we don't need to hold one
460static int fscache_op_get_ref(struct slow_work *work)
461{
462 struct fscache_operation *op =
463 container_of(work, struct fscache_operation, slow_work);
464
465 atomic_inc(&op->usage);
466 return 0;
467}
468
469/*
470 * allow the slow work item processor to discard a ref on an operation
471 */
472static void fscache_op_put_ref(struct slow_work *work)
473{
474 struct fscache_operation *op =
475 container_of(work, struct fscache_operation, slow_work);
476
477 fscache_put_operation(op);
478}
479
480/*
481 * execute an operation using the slow thread pool to provide processing context
482 * - the caller holds a ref to this object, so we don't need to hold one
483 */ 456 */
484static void fscache_op_execute(struct slow_work *work) 457void fscache_op_work_func(struct work_struct *work)
485{ 458{
486 struct fscache_operation *op = 459 struct fscache_operation *op =
487 container_of(work, struct fscache_operation, slow_work); 460 container_of(work, struct fscache_operation, work);
488 unsigned long start; 461 unsigned long start;
489 462
490 _enter("{OBJ%x OP%x,%d}", 463 _enter("{OBJ%x OP%x,%d}",
@@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work)
494 start = jiffies; 467 start = jiffies;
495 op->processor(op); 468 op->processor(op);
496 fscache_hist(fscache_ops_histogram, start); 469 fscache_hist(fscache_ops_histogram, start);
470 fscache_put_operation(op);
497 471
498 _leave(""); 472 _leave("");
499} 473}
500
501/*
502 * describe an operation for slow-work debugging
503 */
504#ifdef CONFIG_SLOW_WORK_DEBUG
505static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
506{
507 struct fscache_operation *op =
508 container_of(work, struct fscache_operation, slow_work);
509
510 seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
511 op->object->debug_id, op->debug_id,
512 op->name, op->state, op->flags);
513}
514#endif
515
516const struct slow_work_ops fscache_op_slow_work_ops = {
517 .owner = THIS_MODULE,
518 .get_ref = fscache_op_get_ref,
519 .put_ref = fscache_op_put_ref,
520 .execute = fscache_op_execute,
521#ifdef CONFIG_SLOW_WORK_DEBUG
522 .desc = fscache_op_desc,
523#endif
524};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 723b889fd21..41c441c2058 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
105 105
106page_busy: 106page_busy:
107 /* we might want to wait here, but that could deadlock the allocator as 107 /* we might want to wait here, but that could deadlock the allocator as
108 * the slow-work threads writing to the cache may all end up sleeping 108 * the work threads writing to the cache may all end up sleeping
109 * on memory allocation */ 109 * on memory allocation */
110 fscache_stat(&fscache_n_store_vmscan_busy); 110 fscache_stat(&fscache_n_store_vmscan_busy);
111 return false; 111 return false;
@@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
188 return -ENOMEM; 188 return -ENOMEM;
189 } 189 }
190 190
191 fscache_operation_init(op, NULL); 191 fscache_operation_init(op, fscache_attr_changed_op, NULL);
192 fscache_operation_init_slow(op, fscache_attr_changed_op); 192 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
193 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
194 fscache_set_op_name(op, "Attr"); 193 fscache_set_op_name(op, "Attr");
195 194
196 spin_lock(&cookie->lock); 195 spin_lock(&cookie->lock);
@@ -218,24 +217,6 @@ nobufs:
218EXPORT_SYMBOL(__fscache_attr_changed); 217EXPORT_SYMBOL(__fscache_attr_changed);
219 218
220/* 219/*
221 * handle secondary execution given to a retrieval op on behalf of the
222 * cache
223 */
224static void fscache_retrieval_work(struct work_struct *work)
225{
226 struct fscache_retrieval *op =
227 container_of(work, struct fscache_retrieval, op.fast_work);
228 unsigned long start;
229
230 _enter("{OP%x}", op->op.debug_id);
231
232 start = jiffies;
233 op->op.processor(&op->op);
234 fscache_hist(fscache_ops_histogram, start);
235 fscache_put_operation(&op->op);
236}
237
238/*
239 * release a retrieval op reference 220 * release a retrieval op reference
240 */ 221 */
241static void fscache_release_retrieval_op(struct fscache_operation *_op) 222static void fscache_release_retrieval_op(struct fscache_operation *_op)
@@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
269 return NULL; 250 return NULL;
270 } 251 }
271 252
272 fscache_operation_init(&op->op, fscache_release_retrieval_op); 253 fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
273 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); 254 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
274 op->mapping = mapping; 255 op->mapping = mapping;
275 op->end_io_func = end_io_func; 256 op->end_io_func = end_io_func;
276 op->context = context; 257 op->context = context;
277 op->start_time = jiffies; 258 op->start_time = jiffies;
278 INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
279 INIT_LIST_HEAD(&op->to_do); 259 INIT_LIST_HEAD(&op->to_do);
280 fscache_set_op_name(&op->op, "Retr"); 260 fscache_set_op_name(&op->op, "Retr");
281 return op; 261 return op;
@@ -795,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
795 if (!op) 775 if (!op)
796 goto nomem; 776 goto nomem;
797 777
798 fscache_operation_init(&op->op, fscache_release_write_op); 778 fscache_operation_init(&op->op, fscache_write_op,
799 fscache_operation_init_slow(&op->op, fscache_write_op); 779 fscache_release_write_op);
800 op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); 780 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
801 fscache_set_op_name(&op->op, "Write1"); 781 fscache_set_op_name(&op->op, "Write1");
802 782
803 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 783 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
@@ -852,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
852 fscache_stat(&fscache_n_store_ops); 832 fscache_stat(&fscache_n_store_ops);
853 fscache_stat(&fscache_n_stores_ok); 833 fscache_stat(&fscache_n_stores_ok);
854 834
855 /* the slow work queue now carries its own ref on the object */ 835 /* the work queue now carries its own ref on the object */
856 fscache_put_operation(&op->op); 836 fscache_put_operation(&op->op);
857 _leave(" = 0"); 837 _leave(" = 0");
858 return 0; 838 return 0;
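
Both converted call sites above use the consolidated initializer: fscache_operation_init() now takes the processor callback directly, fscache_operation_init_slow() is gone, and the FSCACHE_OP_SLOW flag becomes FSCACHE_OP_ASYNC. A sketch of the new setup, with hypothetical callback names:

#include <linux/fscache-cache.h>

static void example_processor(struct fscache_operation *op)
{
	/* the deferred work, run from the operation workqueue */
}

static void example_release(struct fscache_operation *op)
{
	/* free anything the operation owns */
}

static void example_init(struct fscache_operation *op)
{
	fscache_operation_init(op, example_processor, example_release);
	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
}
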
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 3773fd63d2f..85542a7daf4 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -179,23 +179,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
179static const struct file_operations fuse_ctl_abort_ops = { 179static const struct file_operations fuse_ctl_abort_ops = {
180 .open = nonseekable_open, 180 .open = nonseekable_open,
181 .write = fuse_conn_abort_write, 181 .write = fuse_conn_abort_write,
182 .llseek = no_llseek,
182}; 183};
183 184
184static const struct file_operations fuse_ctl_waiting_ops = { 185static const struct file_operations fuse_ctl_waiting_ops = {
185 .open = nonseekable_open, 186 .open = nonseekable_open,
186 .read = fuse_conn_waiting_read, 187 .read = fuse_conn_waiting_read,
188 .llseek = no_llseek,
187}; 189};
188 190
189static const struct file_operations fuse_conn_max_background_ops = { 191static const struct file_operations fuse_conn_max_background_ops = {
190 .open = nonseekable_open, 192 .open = nonseekable_open,
191 .read = fuse_conn_max_background_read, 193 .read = fuse_conn_max_background_read,
192 .write = fuse_conn_max_background_write, 194 .write = fuse_conn_max_background_write,
195 .llseek = no_llseek,
193}; 196};
194 197
195static const struct file_operations fuse_conn_congestion_threshold_ops = { 198static const struct file_operations fuse_conn_congestion_threshold_ops = {
196 .open = nonseekable_open, 199 .open = nonseekable_open,
197 .read = fuse_conn_congestion_threshold_read, 200 .read = fuse_conn_congestion_threshold_read,
198 .write = fuse_conn_congestion_threshold_write, 201 .write = fuse_conn_congestion_threshold_write,
202 .llseek = no_llseek,
199}; 203};
200 204
201static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, 205static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
@@ -218,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
218 if (!inode) 222 if (!inode)
219 return NULL; 223 return NULL;
220 224
225 inode->i_ino = get_next_ino();
221 inode->i_mode = mode; 226 inode->i_mode = mode;
222 inode->i_uid = fc->user_id; 227 inode->i_uid = fc->user_id;
223 inode->i_gid = fc->group_id; 228 inode->i_gid = fc->group_id;
@@ -317,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
317 return 0; 322 return 0;
318} 323}
319 324
320static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags, 325static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
321 const char *dev_name, void *raw_data, 326 int flags, const char *dev_name, void *raw_data)
322 struct vfsmount *mnt)
323{ 327{
324 return get_sb_single(fs_type, flags, raw_data, 328 return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
325 fuse_ctl_fill_super, mnt);
326} 329}
327 330
328static void fuse_ctl_kill_sb(struct super_block *sb) 331static void fuse_ctl_kill_sb(struct super_block *sb)
@@ -341,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
341static struct file_system_type fuse_ctl_fs_type = { 344static struct file_system_type fuse_ctl_fs_type = {
342 .owner = THIS_MODULE, 345 .owner = THIS_MODULE,
343 .name = "fusectl", 346 .name = "fusectl",
344 .get_sb = fuse_ctl_get_sb, 347 .mount = fuse_ctl_mount,
345 .kill_sb = fuse_ctl_kill_sb, 348 .kill_sb = fuse_ctl_kill_sb,
346}; 349};
347 350
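
This is the standard .get_sb to .mount conversion: the new hook returns the root dentry instead of filling in a struct vfsmount, and get_sb_single() becomes mount_single(). A self-contained sketch of the shape (filesystem name and callbacks are illustrative):

#include <linux/module.h>
#include <linux/fs.h>

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	/* allocate the root inode and set sb->s_root here */
	return 0;
}

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name,
				    void *raw_data)
{
	return mount_single(fs_type, flags, raw_data, example_fill_super);
}

static struct file_system_type example_fs_type = {
	.owner   = THIS_MODULE,
	.name    = "example",
	.mount   = example_mount,
	.kill_sb = kill_anon_super,
};
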
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e1f8171278b..3e87cce5837 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -182,6 +182,7 @@ static const struct file_operations cuse_frontend_fops = {
182 .unlocked_ioctl = cuse_file_ioctl, 182 .unlocked_ioctl = cuse_file_ioctl,
183 .compat_ioctl = cuse_file_compat_ioctl, 183 .compat_ioctl = cuse_file_compat_ioctl,
184 .poll = fuse_file_poll, 184 .poll = fuse_file_poll,
185 .llseek = noop_llseek,
185}; 186};
186 187
187 188
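
The .llseek additions in the two files above make seek semantics explicit now that the VFS default is no longer relied upon: the fusectl files refuse to seek, while the CUSE frontend accepts and ignores seeks. A sketch of the distinction (struct name is illustrative):

#include <linux/fs.h>

/* no_llseek fails lseek() with -ESPIPE; noop_llseek returns success
 * without moving f_pos. Pick whichever matches the file's contract. */
static const struct file_operations example_ctl_fops = {
	.open   = nonseekable_open,
	.llseek = no_llseek,	/* or noop_llseek for device-like files */
};
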
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9424796d663..6e07696308d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -239,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc)
239 239
240static void queue_request(struct fuse_conn *fc, struct fuse_req *req) 240static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
241{ 241{
242 req->in.h.unique = fuse_get_unique(fc);
243 req->in.h.len = sizeof(struct fuse_in_header) + 242 req->in.h.len = sizeof(struct fuse_in_header) +
244 len_args(req->in.numargs, (struct fuse_arg *) req->in.args); 243 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
245 list_add_tail(&req->list, &fc->pending); 244 list_add_tail(&req->list, &fc->pending);
@@ -261,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
261 req = list_entry(fc->bg_queue.next, struct fuse_req, list); 260 req = list_entry(fc->bg_queue.next, struct fuse_req, list);
262 list_del(&req->list); 261 list_del(&req->list);
263 fc->active_background++; 262 fc->active_background++;
263 req->in.h.unique = fuse_get_unique(fc);
264 queue_request(fc, req); 264 queue_request(fc, req);
265 } 265 }
266} 266}
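
Moving the fuse_get_unique() call out of queue_request() means a request is stamped with its ID only when it actually reaches the pending list: background requests parked on bg_queue get theirs in flush_bg_queue(), and the notify-reply path added further down can install a kernel-chosen value instead. A sketch of the synchronous path after the move (helper name is hypothetical; same-file context assumed):

static void example_send(struct fuse_conn *fc, struct fuse_req *req)
{
	spin_lock(&fc->lock);
	req->in.h.unique = fuse_get_unique(fc);	/* no longer done in queue_request() */
	queue_request(fc, req);
	spin_unlock(&fc->lock);
}
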
@@ -276,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
276 * Called with fc->lock, unlocks it 276 * Called with fc->lock, unlocks it
277 */ 277 */
278static void request_end(struct fuse_conn *fc, struct fuse_req *req) 278static void request_end(struct fuse_conn *fc, struct fuse_req *req)
279__releases(&fc->lock) 279__releases(fc->lock)
280{ 280{
281 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; 281 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
282 req->end = NULL; 282 req->end = NULL;
@@ -306,8 +306,8 @@ __releases(&fc->lock)
306 306
307static void wait_answer_interruptible(struct fuse_conn *fc, 307static void wait_answer_interruptible(struct fuse_conn *fc,
308 struct fuse_req *req) 308 struct fuse_req *req)
309__releases(&fc->lock) 309__releases(fc->lock)
310__acquires(&fc->lock) 310__acquires(fc->lock)
311{ 311{
312 if (signal_pending(current)) 312 if (signal_pending(current))
313 return; 313 return;
@@ -325,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
325} 325}
326 326
327static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) 327static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
328__releases(&fc->lock) 328__releases(fc->lock)
329__acquires(&fc->lock) 329__acquires(fc->lock)
330{ 330{
331 if (!fc->no_interrupt) { 331 if (!fc->no_interrupt) {
332 /* Any signal may interrupt this */ 332 /* Any signal may interrupt this */
@@ -398,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
398 else if (fc->conn_error) 398 else if (fc->conn_error)
399 req->out.h.error = -ECONNREFUSED; 399 req->out.h.error = -ECONNREFUSED;
400 else { 400 else {
401 req->in.h.unique = fuse_get_unique(fc);
401 queue_request(fc, req); 402 queue_request(fc, req);
402 /* acquire extra reference, since request is still needed 403 /* acquire extra reference, since request is still needed
403 after request_end() */ 404 after request_end() */
@@ -450,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
450} 451}
451EXPORT_SYMBOL_GPL(fuse_request_send_background); 452EXPORT_SYMBOL_GPL(fuse_request_send_background);
452 453
454static int fuse_request_send_notify_reply(struct fuse_conn *fc,
455 struct fuse_req *req, u64 unique)
456{
457 int err = -ENODEV;
458
459 req->isreply = 0;
460 req->in.h.unique = unique;
461 spin_lock(&fc->lock);
462 if (fc->connected) {
463 queue_request(fc, req);
464 err = 0;
465 }
466 spin_unlock(&fc->lock);
467
468 return err;
469}
470
453/* 471/*
454 * Called under fc->lock 472 * Called under fc->lock
455 * 473 *
@@ -535,13 +553,13 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
535 if (!cs->write) { 553 if (!cs->write) {
536 buf->ops->unmap(cs->pipe, buf, cs->mapaddr); 554 buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
537 } else { 555 } else {
538 kunmap_atomic(cs->mapaddr, KM_USER0); 556 kunmap(buf->page);
539 buf->len = PAGE_SIZE - cs->len; 557 buf->len = PAGE_SIZE - cs->len;
540 } 558 }
541 cs->currbuf = NULL; 559 cs->currbuf = NULL;
542 cs->mapaddr = NULL; 560 cs->mapaddr = NULL;
543 } else if (cs->mapaddr) { 561 } else if (cs->mapaddr) {
544 kunmap_atomic(cs->mapaddr, KM_USER0); 562 kunmap(cs->pg);
545 if (cs->write) { 563 if (cs->write) {
546 flush_dcache_page(cs->pg); 564 flush_dcache_page(cs->pg);
547 set_page_dirty_lock(cs->pg); 565 set_page_dirty_lock(cs->pg);
@@ -572,7 +590,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
572 590
573 BUG_ON(!cs->nr_segs); 591 BUG_ON(!cs->nr_segs);
574 cs->currbuf = buf; 592 cs->currbuf = buf;
575 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); 593 cs->mapaddr = buf->ops->map(cs->pipe, buf, 0);
576 cs->len = buf->len; 594 cs->len = buf->len;
577 cs->buf = cs->mapaddr + buf->offset; 595 cs->buf = cs->mapaddr + buf->offset;
578 cs->pipebufs++; 596 cs->pipebufs++;
@@ -592,7 +610,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
592 buf->len = 0; 610 buf->len = 0;
593 611
594 cs->currbuf = buf; 612 cs->currbuf = buf;
595 cs->mapaddr = kmap_atomic(page, KM_USER0); 613 cs->mapaddr = kmap(page);
596 cs->buf = cs->mapaddr; 614 cs->buf = cs->mapaddr;
597 cs->len = PAGE_SIZE; 615 cs->len = PAGE_SIZE;
598 cs->pipebufs++; 616 cs->pipebufs++;
@@ -611,7 +629,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
611 return err; 629 return err;
612 BUG_ON(err != 1); 630 BUG_ON(err != 1);
613 offset = cs->addr % PAGE_SIZE; 631 offset = cs->addr % PAGE_SIZE;
614 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); 632 cs->mapaddr = kmap(cs->pg);
615 cs->buf = cs->mapaddr + offset; 633 cs->buf = cs->mapaddr + offset;
616 cs->len = min(PAGE_SIZE - offset, cs->seglen); 634 cs->len = min(PAGE_SIZE - offset, cs->seglen);
617 cs->seglen -= cs->len; 635 cs->seglen -= cs->len;
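
The kmap_atomic(..., KM_USER0) to kmap() conversions in this copy path matter because the copy can fault in user pages and therefore sleep, which an atomic kmap forbids; for the same reason the pipe buffer map() call now passes atomic == 0. A sketch of the constraint (helper name is hypothetical):

#include <linux/highmem.h>
#include <linux/uaccess.h>

/* A sleepable copy into a (possibly) highmem page: kmap() may block,
 * and so may the user-space fault taken inside copy_from_user(). */
static int example_copy_from_user_page(struct page *page,
				       const void __user *from, size_t n)
{
	void *addr = kmap(page);
	int ret = copy_from_user(addr, from, n) ? -EFAULT : 0;

	kunmap(page);
	return ret;
}
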
@@ -791,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
791 int err; 809 int err;
792 struct page *page = *pagep; 810 struct page *page = *pagep;
793 811
794 if (page && zeroing && count < PAGE_SIZE) { 812 if (page && zeroing && count < PAGE_SIZE)
795 void *mapaddr = kmap_atomic(page, KM_USER1); 813 clear_highpage(page);
796 memset(mapaddr, 0, PAGE_SIZE); 814
797 kunmap_atomic(mapaddr, KM_USER1);
798 }
799 while (count) { 815 while (count) {
800 if (cs->write && cs->pipebufs && page) { 816 if (cs->write && cs->pipebufs && page) {
801 return fuse_ref_page(cs, page, offset, count); 817 return fuse_ref_page(cs, page, offset, count);
@@ -812,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
812 } 828 }
813 } 829 }
814 if (page) { 830 if (page) {
815 void *mapaddr = kmap_atomic(page, KM_USER1); 831 void *mapaddr = kmap_atomic(page, KM_USER0);
816 void *buf = mapaddr + offset; 832 void *buf = mapaddr + offset;
817 offset += fuse_copy_do(cs, &buf, &count); 833 offset += fuse_copy_do(cs, &buf, &count);
818 kunmap_atomic(mapaddr, KM_USER1); 834 kunmap_atomic(mapaddr, KM_USER0);
819 } else 835 } else
820 offset += fuse_copy_do(cs, NULL, &count); 836 offset += fuse_copy_do(cs, NULL, &count);
821 } 837 }
@@ -887,8 +903,8 @@ static int request_pending(struct fuse_conn *fc)
887 903
888/* Wait until a request is available on the pending list */ 904/* Wait until a request is available on the pending list */
889static void request_wait(struct fuse_conn *fc) 905static void request_wait(struct fuse_conn *fc)
890__releases(&fc->lock) 906__releases(fc->lock)
891__acquires(&fc->lock) 907__acquires(fc->lock)
892{ 908{
893 DECLARE_WAITQUEUE(wait, current); 909 DECLARE_WAITQUEUE(wait, current);
894 910
@@ -916,7 +932,7 @@ __acquires(&fc->lock)
916 */ 932 */
917static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, 933static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
918 size_t nbytes, struct fuse_req *req) 934 size_t nbytes, struct fuse_req *req)
919__releases(&fc->lock) 935__releases(fc->lock)
920{ 936{
921 struct fuse_in_header ih; 937 struct fuse_in_header ih;
922 struct fuse_interrupt_in arg; 938 struct fuse_interrupt_in arg;
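
The annotation churn through this file is for sparse only: the context expression is normalized from &fc->lock to fc->lock so that the __releases()/__acquires() pairs agree on how the lock is named. The idiom, sketched on a hypothetical function that temporarily drops a spinlock:

#include <linux/spinlock.h>

static void example_drop_and_reacquire(spinlock_t *lock)
__releases(lock)
__acquires(lock)
{
	spin_unlock(lock);
	/* sleepable work goes here */
	spin_lock(lock);
}
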
@@ -1231,6 +1247,194 @@ err:
1231 return err; 1247 return err;
1232} 1248}
1233 1249
1250static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
1251 struct fuse_copy_state *cs)
1252{
1253 struct fuse_notify_store_out outarg;
1254 struct inode *inode;
1255 struct address_space *mapping;
1256 u64 nodeid;
1257 int err;
1258 pgoff_t index;
1259 unsigned int offset;
1260 unsigned int num;
1261 loff_t file_size;
1262 loff_t end;
1263
1264 err = -EINVAL;
1265 if (size < sizeof(outarg))
1266 goto out_finish;
1267
1268 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1269 if (err)
1270 goto out_finish;
1271
1272 err = -EINVAL;
1273 if (size - sizeof(outarg) != outarg.size)
1274 goto out_finish;
1275
1276 nodeid = outarg.nodeid;
1277
1278 down_read(&fc->killsb);
1279
1280 err = -ENOENT;
1281 if (!fc->sb)
1282 goto out_up_killsb;
1283
1284 inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
1285 if (!inode)
1286 goto out_up_killsb;
1287
1288 mapping = inode->i_mapping;
1289 index = outarg.offset >> PAGE_CACHE_SHIFT;
1290 offset = outarg.offset & ~PAGE_CACHE_MASK;
1291 file_size = i_size_read(inode);
1292 end = outarg.offset + outarg.size;
1293 if (end > file_size) {
1294 file_size = end;
1295 fuse_write_update_size(inode, file_size);
1296 }
1297
1298 num = outarg.size;
1299 while (num) {
1300 struct page *page;
1301 unsigned int this_num;
1302
1303 err = -ENOMEM;
1304 page = find_or_create_page(mapping, index,
1305 mapping_gfp_mask(mapping));
1306 if (!page)
1307 goto out_iput;
1308
1309 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1310 err = fuse_copy_page(cs, &page, offset, this_num, 0);
1311 if (!err && offset == 0 && (num != 0 || file_size == end))
1312 SetPageUptodate(page);
1313 unlock_page(page);
1314 page_cache_release(page);
1315
1316 if (err)
1317 goto out_iput;
1318
1319 num -= this_num;
1320 offset = 0;
1321 index++;
1322 }
1323
1324 err = 0;
1325
1326out_iput:
1327 iput(inode);
1328out_up_killsb:
1329 up_read(&fc->killsb);
1330out_finish:
1331 fuse_copy_finish(cs);
1332 return err;
1333}
1334
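
fuse_notify_store() expects an unsolicited message (unique == 0) whose payload is a struct fuse_notify_store_out followed by exactly outarg.size bytes of data; the notify code travels in the error field of the out header. A hypothetical user-space sketch of the sender side, under those assumptions (the helper name and the 4 KiB cap are illustrative):

#include <linux/fuse.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

static int example_notify_store(int fuse_fd, uint64_t nodeid, uint64_t offset,
				const void *data, uint32_t size)
{
	char buf[sizeof(struct fuse_out_header) +
		 sizeof(struct fuse_notify_store_out) + 4096];
	struct fuse_out_header *oh = (struct fuse_out_header *)buf;
	struct fuse_notify_store_out *arg =
		(struct fuse_notify_store_out *)(oh + 1);

	if (size > 4096)
		return -1;
	oh->unique = 0;				/* notification, not a reply */
	oh->error  = FUSE_NOTIFY_STORE;		/* notify code rides in 'error' */
	oh->len    = sizeof(*oh) + sizeof(*arg) + size;
	arg->nodeid  = nodeid;
	arg->offset  = offset;
	arg->size    = size;
	arg->padding = 0;
	memcpy(arg + 1, data, size);		/* payload follows the struct */
	return write(fuse_fd, buf, oh->len) == (ssize_t)oh->len ? 0 : -1;
}
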
1335static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
1336{
1337 release_pages(req->pages, req->num_pages, 0);
1338}
1339
1340static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1341 struct fuse_notify_retrieve_out *outarg)
1342{
1343 int err;
1344 struct address_space *mapping = inode->i_mapping;
1345 struct fuse_req *req;
1346 pgoff_t index;
1347 loff_t file_size;
1348 unsigned int num;
1349 unsigned int offset;
1350 size_t total_len = 0;
1351
1352 req = fuse_get_req(fc);
1353 if (IS_ERR(req))
1354 return PTR_ERR(req);
1355
1356 offset = outarg->offset & ~PAGE_CACHE_MASK;
1357
1358 req->in.h.opcode = FUSE_NOTIFY_REPLY;
1359 req->in.h.nodeid = outarg->nodeid;
1360 req->in.numargs = 2;
1361 req->in.argpages = 1;
1362 req->page_offset = offset;
1363 req->end = fuse_retrieve_end;
1364
1365 index = outarg->offset >> PAGE_CACHE_SHIFT;
1366 file_size = i_size_read(inode);
1367 num = outarg->size;
1368 if (outarg->offset > file_size)
1369 num = 0;
1370 else if (outarg->offset + num > file_size)
1371 num = file_size - outarg->offset;
1372
1373 while (num) {
1374 struct page *page;
1375 unsigned int this_num;
1376
1377 page = find_get_page(mapping, index);
1378 if (!page)
1379 break;
1380
1381 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1382 req->pages[req->num_pages] = page;
1383 req->num_pages++;
1384
1385 num -= this_num;
1386 total_len += this_num;
1387 }
1388 req->misc.retrieve_in.offset = outarg->offset;
1389 req->misc.retrieve_in.size = total_len;
1390 req->in.args[0].size = sizeof(req->misc.retrieve_in);
1391 req->in.args[0].value = &req->misc.retrieve_in;
1392 req->in.args[1].size = total_len;
1393
1394 err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
1395 if (err)
1396 fuse_retrieve_end(fc, req);
1397
1398 return err;
1399}
1400
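
fuse_retrieve() collects page-cache pages to hand back to user space, but as committed the loop neither advances index nor resets offset after the first page, so it keeps re-grabbing the same page; later kernels do both. A sketch of that corrected walk, offered as an assumption rather than as part of this patch (caller must guarantee pages[] is large enough):

#include <linux/kernel.h>
#include <linux/pagemap.h>

static unsigned int example_collect_pages(struct address_space *mapping,
					  pgoff_t index, unsigned int offset,
					  unsigned int num, struct page **pages)
{
	unsigned int n = 0, total = 0;

	while (num) {
		struct page *page = find_get_page(mapping, index);
		unsigned int this_num;

		if (!page)
			break;
		this_num = min_t(unsigned int, num, PAGE_CACHE_SIZE - offset);
		pages[n++] = page;
		total += this_num;
		num -= this_num;
		offset = 0;	/* only the first page can be partial */
		index++;	/* move on to the next page */
	}
	return total;
}
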
1401static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
1402 struct fuse_copy_state *cs)
1403{
1404 struct fuse_notify_retrieve_out outarg;
1405 struct inode *inode;
1406 int err;
1407
1408 err = -EINVAL;
1409 if (size != sizeof(outarg))
1410 goto copy_finish;
1411
1412 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1413 if (err)
1414 goto copy_finish;
1415
1416 fuse_copy_finish(cs);
1417
1418 down_read(&fc->killsb);
1419 err = -ENOENT;
1420 if (fc->sb) {
1421 u64 nodeid = outarg.nodeid;
1422
1423 inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
1424 if (inode) {
1425 err = fuse_retrieve(fc, inode, &outarg);
1426 iput(inode);
1427 }
1428 }
1429 up_read(&fc->killsb);
1430
1431 return err;
1432
1433copy_finish:
1434 fuse_copy_finish(cs);
1435 return err;
1436}
1437
1234static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, 1438static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1235 unsigned int size, struct fuse_copy_state *cs) 1439 unsigned int size, struct fuse_copy_state *cs)
1236{ 1440{
@@ -1244,6 +1448,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1244 case FUSE_NOTIFY_INVAL_ENTRY: 1448 case FUSE_NOTIFY_INVAL_ENTRY:
1245 return fuse_notify_inval_entry(fc, size, cs); 1449 return fuse_notify_inval_entry(fc, size, cs);
1246 1450
1451 case FUSE_NOTIFY_STORE:
1452 return fuse_notify_store(fc, size, cs);
1453
1454 case FUSE_NOTIFY_RETRIEVE:
1455 return fuse_notify_retrieve(fc, size, cs);
1456
1247 default: 1457 default:
1248 fuse_copy_finish(cs); 1458 fuse_copy_finish(cs);
1249 return -EINVAL; 1459 return -EINVAL;
@@ -1503,8 +1713,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1503 * This function releases and reacquires fc->lock 1713 * This function releases and reacquires fc->lock
1504 */ 1714 */
1505static void end_requests(struct fuse_conn *fc, struct list_head *head) 1715static void end_requests(struct fuse_conn *fc, struct list_head *head)
1506__releases(&fc->lock) 1716__releases(fc->lock)
1507__acquires(&fc->lock) 1717__acquires(fc->lock)
1508{ 1718{
1509 while (!list_empty(head)) { 1719 while (!list_empty(head)) {
1510 struct fuse_req *req; 1720 struct fuse_req *req;
@@ -1527,8 +1737,8 @@ __acquires(&fc->lock)
1527 * locked). 1737 * locked).
1528 */ 1738 */
1529static void end_io_requests(struct fuse_conn *fc) 1739static void end_io_requests(struct fuse_conn *fc)
1530__releases(&fc->lock) 1740__releases(fc->lock)
1531__acquires(&fc->lock) 1741__acquires(fc->lock)
1532{ 1742{
1533 while (!list_empty(&fc->io)) { 1743 while (!list_empty(&fc->io)) {
1534 struct fuse_req *req = 1744 struct fuse_req *req =
@@ -1552,6 +1762,16 @@ __acquires(&fc->lock)
1552 } 1762 }
1553} 1763}
1554 1764
1765static void end_queued_requests(struct fuse_conn *fc)
1766__releases(fc->lock)
1767__acquires(fc->lock)
1768{
1769 fc->max_background = UINT_MAX;
1770 flush_bg_queue(fc);
1771 end_requests(fc, &fc->pending);
1772 end_requests(fc, &fc->processing);
1773}
1774
1555/* 1775/*
1556 * Abort all requests. 1776 * Abort all requests.
1557 * 1777 *
@@ -1578,8 +1798,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
1578 fc->connected = 0; 1798 fc->connected = 0;
1579 fc->blocked = 0; 1799 fc->blocked = 0;
1580 end_io_requests(fc); 1800 end_io_requests(fc);
1581 end_requests(fc, &fc->pending); 1801 end_queued_requests(fc);
1582 end_requests(fc, &fc->processing);
1583 wake_up_all(&fc->waitq); 1802 wake_up_all(&fc->waitq);
1584 wake_up_all(&fc->blocked_waitq); 1803 wake_up_all(&fc->blocked_waitq);
1585 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 1804 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1594,8 +1813,9 @@ int fuse_dev_release(struct inode *inode, struct file *file)
1594 if (fc) { 1813 if (fc) {
1595 spin_lock(&fc->lock); 1814 spin_lock(&fc->lock);
1596 fc->connected = 0; 1815 fc->connected = 0;
1597 end_requests(fc, &fc->pending); 1816 fc->blocked = 0;
1598 end_requests(fc, &fc->processing); 1817 end_queued_requests(fc);
1818 wake_up_all(&fc->blocked_waitq);
1599 spin_unlock(&fc->lock); 1819 spin_unlock(&fc->lock);
1600 fuse_conn_put(fc); 1820 fuse_conn_put(fc);
1601 } 1821 }
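
The two teardown paths above now share end_queued_requests(): raising max_background to UINT_MAX lets flush_bg_queue() push the whole backlog onto the pending list (stamping unique IDs on the way), after which one pass over pending and processing fails everything; the release path additionally wakes fc->blocked_waitq so throttled writers can observe the dead connection. A sketch of that shared shape (helper name is hypothetical; same-file context assumed):

static void example_shutdown(struct fuse_conn *fc)
{
	spin_lock(&fc->lock);
	fc->connected = 0;
	fc->blocked = 0;
	end_queued_requests(fc);	/* bg_queue, pending and processing */
	wake_up_all(&fc->blocked_waitq);
	spin_unlock(&fc->lock);
}
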
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3cdc5f78a40..c9627c95482 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask)
1016 exist. So if permissions are revoked this won't be 1016 exist. So if permissions are revoked this won't be
1017 noticed immediately, only after the attribute 1017 noticed immediately, only after the attribute
1018 timeout has expired */ 1018 timeout has expired */
1019 } else if (mask & MAY_ACCESS) { 1019 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
1020 err = fuse_access(inode, mask); 1020 err = fuse_access(inode, mask);
1021 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { 1021 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
1022 if (!(inode->i_mode & S_IXUGO)) { 1022 if (!(inode->i_mode & S_IXUGO)) {
@@ -1270,21 +1270,18 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1270 if (!fuse_allow_task(fc, current)) 1270 if (!fuse_allow_task(fc, current))
1271 return -EACCES; 1271 return -EACCES;
1272 1272
1273 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { 1273 if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
1274 err = inode_change_ok(inode, attr); 1274 attr->ia_valid |= ATTR_FORCE;
1275 if (err) 1275
1276 return err; 1276 err = inode_change_ok(inode, attr);
1277 } 1277 if (err)
1278 return err;
1278 1279
1279 if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) 1280 if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc)
1280 return 0; 1281 return 0;
1281 1282
1282 if (attr->ia_valid & ATTR_SIZE) { 1283 if (attr->ia_valid & ATTR_SIZE)
1283 err = inode_newsize_ok(inode, attr->ia_size);
1284 if (err)
1285 return err;
1286 is_truncate = true; 1284 is_truncate = true;
1287 }
1288 1285
1289 req = fuse_get_req(fc); 1286 req = fuse_get_req(fc);
1290 if (IS_ERR(req)) 1287 if (IS_ERR(req))
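
Dropping the separate inode_newsize_ok() call and the conditional inode_change_ok() works because inode_change_ok() now performs the ATTR_SIZE limit check itself before honouring ATTR_FORCE, which skips only the ownership and mode permission tests. A sketch of the check, with an illustrative helper and flag parameter:

#include <linux/fs.h>

static int example_setattr_check(struct inode *inode, struct iattr *attr,
				 bool fs_does_permission_checks)
{
	if (fs_does_permission_checks)
		attr->ia_valid |= ATTR_FORCE;	/* skip VFS permission tests */

	return inode_change_ok(inode, attr);	/* still validates ATTR_SIZE */
}
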
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ada0adeb3bb..c8224587123 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -706,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
706 return 0; 706 return 0;
707} 707}
708 708
709static void fuse_write_update_size(struct inode *inode, loff_t pos) 709void fuse_write_update_size(struct inode *inode, loff_t pos)
710{ 710{
711 struct fuse_conn *fc = get_fuse_conn(inode); 711 struct fuse_conn *fc = get_fuse_conn(inode);
712 struct fuse_inode *fi = get_fuse_inode(inode); 712 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1144,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1144 1144
1145/* Called under fc->lock, may release and reacquire it */ 1145/* Called under fc->lock, may release and reacquire it */
1146static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) 1146static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1147__releases(&fc->lock) 1147__releases(fc->lock)
1148__acquires(&fc->lock) 1148__acquires(fc->lock)
1149{ 1149{
1150 struct fuse_inode *fi = get_fuse_inode(req->inode); 1150 struct fuse_inode *fi = get_fuse_inode(req->inode);
1151 loff_t size = i_size_read(req->inode); 1151 loff_t size = i_size_read(req->inode);
@@ -1183,8 +1183,8 @@ __acquires(&fc->lock)
1183 * Called with fc->lock 1183 * Called with fc->lock
1184 */ 1184 */
1185void fuse_flush_writepages(struct inode *inode) 1185void fuse_flush_writepages(struct inode *inode)
1186__releases(&fc->lock) 1186__releases(fc->lock)
1187__acquires(&fc->lock) 1187__acquires(fc->lock)
1188{ 1188{
1189 struct fuse_conn *fc = get_fuse_conn(inode); 1189 struct fuse_conn *fc = get_fuse_conn(inode);
1190 struct fuse_inode *fi = get_fuse_inode(inode); 1190 struct fuse_inode *fi = get_fuse_inode(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 8f309f04064..57d4a3a0f10 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,6 +272,7 @@ struct fuse_req {
272 struct fuse_write_in in; 272 struct fuse_write_in in;
273 struct fuse_write_out out; 273 struct fuse_write_out out;
274 } write; 274 } write;
275 struct fuse_notify_retrieve_in retrieve_in;
275 struct fuse_lk_in lk_in; 276 struct fuse_lk_in lk_in;
276 } misc; 277 } misc;
277 278
@@ -748,4 +749,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
748unsigned fuse_file_poll(struct file *file, poll_table *wait); 749unsigned fuse_file_poll(struct file *file, poll_table *wait);
749int fuse_dev_release(struct inode *inode, struct file *file); 750int fuse_dev_release(struct inode *inode, struct file *file);
750 751
752void fuse_write_update_size(struct inode *inode, loff_t pos);
753
751#endif /* _FS_FUSE_I_H */ 754#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ec14d19ce50..cfce3ad86a9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -122,8 +122,10 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
122 fuse_request_send_noreply(fc, req); 122 fuse_request_send_noreply(fc, req);
123} 123}
124 124
125static void fuse_clear_inode(struct inode *inode) 125static void fuse_evict_inode(struct inode *inode)
126{ 126{
127 truncate_inode_pages(&inode->i_data, 0);
128 end_writeback(inode);
127 if (inode->i_sb->s_flags & MS_ACTIVE) { 129 if (inode->i_sb->s_flags & MS_ACTIVE) {
128 struct fuse_conn *fc = get_fuse_conn(inode); 130 struct fuse_conn *fc = get_fuse_conn(inode);
129 struct fuse_inode *fi = get_fuse_inode(inode); 131 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -736,7 +738,7 @@ static const struct export_operations fuse_export_operations = {
736static const struct super_operations fuse_super_operations = { 738static const struct super_operations fuse_super_operations = {
737 .alloc_inode = fuse_alloc_inode, 739 .alloc_inode = fuse_alloc_inode,
738 .destroy_inode = fuse_destroy_inode, 740 .destroy_inode = fuse_destroy_inode,
739 .clear_inode = fuse_clear_inode, 741 .evict_inode = fuse_evict_inode,
740 .drop_inode = generic_delete_inode, 742 .drop_inode = generic_delete_inode,
741 .remount_fs = fuse_remount_fs, 743 .remount_fs = fuse_remount_fs,
742 .put_super = fuse_put_super, 744 .put_super = fuse_put_super,
@@ -1039,11 +1041,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1039 return err; 1041 return err;
1040} 1042}
1041 1043
1042static int fuse_get_sb(struct file_system_type *fs_type, 1044static struct dentry *fuse_mount(struct file_system_type *fs_type,
1043 int flags, const char *dev_name, 1045 int flags, const char *dev_name,
1044 void *raw_data, struct vfsmount *mnt) 1046 void *raw_data)
1045{ 1047{
1046 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); 1048 return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
1047} 1049}
1048 1050
1049static void fuse_kill_sb_anon(struct super_block *sb) 1051static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1063,17 +1065,16 @@ static struct file_system_type fuse_fs_type = {
1063 .owner = THIS_MODULE, 1065 .owner = THIS_MODULE,
1064 .name = "fuse", 1066 .name = "fuse",
1065 .fs_flags = FS_HAS_SUBTYPE, 1067 .fs_flags = FS_HAS_SUBTYPE,
1066 .get_sb = fuse_get_sb, 1068 .mount = fuse_mount,
1067 .kill_sb = fuse_kill_sb_anon, 1069 .kill_sb = fuse_kill_sb_anon,
1068}; 1070};
1069 1071
1070#ifdef CONFIG_BLOCK 1072#ifdef CONFIG_BLOCK
1071static int fuse_get_sb_blk(struct file_system_type *fs_type, 1073static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
1072 int flags, const char *dev_name, 1074 int flags, const char *dev_name,
1073 void *raw_data, struct vfsmount *mnt) 1075 void *raw_data)
1074{ 1076{
1075 return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super, 1077 return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
1076 mnt);
1077} 1078}
1078 1079
1079static void fuse_kill_sb_blk(struct super_block *sb) 1080static void fuse_kill_sb_blk(struct super_block *sb)
@@ -1092,7 +1093,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
1092static struct file_system_type fuseblk_fs_type = { 1093static struct file_system_type fuseblk_fs_type = {
1093 .owner = THIS_MODULE, 1094 .owner = THIS_MODULE,
1094 .name = "fuseblk", 1095 .name = "fuseblk",
1095 .get_sb = fuse_get_sb_blk, 1096 .mount = fuse_mount_blk,
1096 .kill_sb = fuse_kill_sb_blk, 1097 .kill_sb = fuse_kill_sb_blk,
1097 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, 1098 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
1098}; 1099};
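
The .clear_inode to .evict_inode conversion moves two responsibilities into the filesystem: truncating the inode's page cache and ending its writeback state. The canonical shape, sketched with a hypothetical name, is exactly what the fuse hunk above instantiates:

#include <linux/fs.h>
#include <linux/mm.h>

static void example_evict_inode(struct inode *inode)
{
	truncate_inode_pages(&inode->i_data, 0);	/* VFS no longer does this */
	end_writeback(inode);				/* was implicit with ->clear_inode() */
	/* filesystem-specific teardown follows */
}
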
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 99800e56415..6bc9e3a5a69 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
94 if (error < 0) 94 if (error < 0)
95 goto failed; 95 goto failed;
96 inode->i_mode = mode; 96 inode->i_mode = mode;
97 inode->i_ctime = CURRENT_TIME;
97 if (error == 0) { 98 if (error == 0) {
98 posix_acl_release(acl); 99 posix_acl_release(acl);
99 acl = NULL; 100 acl = NULL;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index a47b4310711..c465ae066c6 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,13 +1,12 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBDAF) 3 depends on (64BIT || LBDAF)
4 select DLM if GFS2_FS_LOCKING_DLM 4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM 5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM 6 select SYSFS if GFS2_FS_LOCKING_DLM
7 select IP_SCTP if DLM_SCTP 7 select IP_SCTP if DLM_SCTP
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK
11 select QUOTACTL 10 select QUOTACTL
12 help 11 help
13 A cluster filesystem. 12 A cluster filesystem.
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9f8b52500d6..4f36f8832b9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
36#include "glops.h" 36#include "glops.h"
37 37
38 38
39static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, 39void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
40 unsigned int from, unsigned int to) 40 unsigned int from, unsigned int to)
41{ 41{
42 struct buffer_head *head = page_buffers(page); 42 struct buffer_head *head = page_buffers(page);
43 unsigned int bsize = head->b_size; 43 unsigned int bsize = head->b_size;
@@ -136,10 +136,7 @@ static int gfs2_writeback_writepage(struct page *page,
136 if (ret <= 0) 136 if (ret <= 0)
137 return ret; 137 return ret;
138 138
139 ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); 139 return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
140 if (ret == -EAGAIN)
141 ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
142 return ret;
143} 140}
144 141
145/** 142/**
@@ -618,10 +615,9 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
618 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
619 int alloc_required; 616 int alloc_required;
620 int error = 0; 617 int error = 0;
621 struct gfs2_alloc *al; 618 struct gfs2_alloc *al = NULL;
622 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 619 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
623 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 620 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
624 unsigned to = from + len;
625 struct page *page; 621 struct page *page;
626 622
627 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 623 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -637,9 +633,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
637 } 633 }
638 } 634 }
639 635
640 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 636 alloc_required = gfs2_write_alloc_required(ip, pos, len);
641 if (error)
642 goto out_unlock;
643 637
644 if (alloc_required || gfs2_is_jdata(ip)) 638 if (alloc_required || gfs2_is_jdata(ip))
645 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); 639 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
@@ -668,6 +662,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
668 rblocks += RES_STATFS + RES_QUOTA; 662 rblocks += RES_STATFS + RES_QUOTA;
669 if (&ip->i_inode == sdp->sd_rindex) 663 if (&ip->i_inode == sdp->sd_rindex)
670 rblocks += 2 * RES_STATFS; 664 rblocks += 2 * RES_STATFS;
665 if (alloc_required)
666 rblocks += gfs2_rg_blocks(al);
671 667
672 error = gfs2_trans_begin(sdp, rblocks, 668 error = gfs2_trans_begin(sdp, rblocks,
673 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 669 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -694,20 +690,18 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
694 } 690 }
695 691
696prepare_write: 692prepare_write:
697 error = block_prepare_write(page, from, to, gfs2_block_map); 693 error = __block_write_begin(page, from, len, gfs2_block_map);
698out: 694out:
699 if (error == 0) 695 if (error == 0)
700 return 0; 696 return 0;
701 697
702 page_cache_release(page); 698 page_cache_release(page);
703 699
704 /* 700 gfs2_trans_end(sdp);
705 * XXX(hch): the call below should probably be replaced with
706 * a call to the gfs2-specific truncate blocks helper to actually
707 * release disk blocks..
708 */
709 if (pos + len > ip->i_inode.i_size) 701 if (pos + len > ip->i_inode.i_size)
710 simple_setsize(&ip->i_inode, ip->i_inode.i_size); 702 gfs2_trim_blocks(&ip->i_inode);
703 goto out_trans_fail;
704
711out_endtrans: 705out_endtrans:
712 gfs2_trans_end(sdp); 706 gfs2_trans_end(sdp);
713out_trans_fail: 707out_trans_fail:
@@ -807,10 +801,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
807 page_cache_release(page); 801 page_cache_release(page);
808 802
809 if (copied) { 803 if (copied) {
810 if (inode->i_size < to) { 804 if (inode->i_size < to)
811 i_size_write(inode, to); 805 i_size_write(inode, to);
812 ip->i_disksize = inode->i_size;
813 }
814 gfs2_dinode_out(ip, di); 806 gfs2_dinode_out(ip, di);
815 mark_inode_dirty(inode); 807 mark_inode_dirty(inode);
816 } 808 }
@@ -881,8 +873,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
881 873
882 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 874 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
883 if (ret > 0) { 875 if (ret > 0) {
884 if (inode->i_size > ip->i_disksize)
885 ip->i_disksize = inode->i_size;
886 gfs2_dinode_out(ip, dibh->b_data); 876 gfs2_dinode_out(ip, dibh->b_data);
887 mark_inode_dirty(inode); 877 mark_inode_dirty(inode);
888 } 878 }
@@ -1047,9 +1037,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1047 if (rv != 1) 1037 if (rv != 1)
1048 goto out; /* dio not valid, fall back to buffered i/o */ 1038 goto out; /* dio not valid, fall back to buffered i/o */
1049 1039
1050 rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, 1040 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1051 iov, offset, nr_segs, 1041 offset, nr_segs, gfs2_get_block_direct,
1052 gfs2_get_block_direct, NULL); 1042 NULL, NULL, 0);
1053out: 1043out:
1054 gfs2_glock_dq_m(1, &gh); 1044 gfs2_glock_dq_m(1, &gh);
1055 gfs2_holder_uninit(&gh); 1045 gfs2_holder_uninit(&gh);
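
block_prepare_write(page, from, to, get_block) gives way to __block_write_begin(page, pos, len, get_block), which takes a byte position and length and masks out the in-page offset itself. A minimal ->write_begin()-style sketch around the new call; the helper name is illustrative and get_block is passed in only to keep the sketch self-contained:

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/pagemap.h>

static int example_write_begin(struct address_space *mapping, loff_t pos,
			       unsigned len, unsigned flags,
			       struct page **pagep, get_block_t *get_block)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page = grab_cache_page_write_begin(mapping, index, flags);

	if (!page)
		return -ENOMEM;
	*pagep = page;
	return __block_write_begin(page, pos, len, get_block);
}
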
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 84da64b551b..5476c066d4e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
50 * @ip: the inode 50 * @ip: the inode
51 * @dibh: the dinode buffer 51 * @dibh: the dinode buffer
52 * @block: the block number that was allocated 52 * @block: the block number that was allocated
53 * @private: any locked page held by the caller process 53 * @page: The (optional) page. This is looked up if @page is NULL
54 * 54 *
55 * Returns: errno 55 * Returns: errno
56 */ 56 */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
109/** 109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big 110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff 111 * @ip: The GFS2 inode to unstuff
112 * @unstuffer: the routine that handles unstuffing a non-zero length file 112 * @page: The (optional) page. This is looked up if @page is NULL
113 * @private: private data for the unstuffer
114 * 113 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such 114 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way. 115 * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 131 if (error)
133 goto out; 132 goto out;
134 133
135 if (ip->i_disksize) { 134 if (i_size_read(&ip->i_inode)) {
136 /* Get a free block, fill it with the stuffed data, 135 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 136 and write it out to disk */
138 137
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
161 di = (struct gfs2_dinode *)dibh->b_data; 160 di = (struct gfs2_dinode *)dibh->b_data;
162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
163 162
164 if (ip->i_disksize) { 163 if (i_size_read(&ip->i_inode)) {
165 *(__be64 *)(di + 1) = cpu_to_be64(block); 164 *(__be64 *)(di + 1) = cpu_to_be64(block);
166 gfs2_add_inode_blocks(&ip->i_inode, 1); 165 gfs2_add_inode_blocks(&ip->i_inode, 1);
167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 166 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -885,83 +884,14 @@ out:
885} 884}
886 885
887/** 886/**
888 * do_grow - Make a file look bigger than it is
889 * @ip: the inode
890 * @size: the size to set the file to
891 *
892 * Called with an exclusive lock on @ip.
893 *
894 * Returns: errno
895 */
896
897static int do_grow(struct gfs2_inode *ip, u64 size)
898{
899 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 struct gfs2_alloc *al;
901 struct buffer_head *dibh;
902 int error;
903
904 al = gfs2_alloc_get(ip);
905 if (!al)
906 return -ENOMEM;
907
908 error = gfs2_quota_lock_check(ip);
909 if (error)
910 goto out;
911
912 al->al_requested = sdp->sd_max_height + RES_DATA;
913
914 error = gfs2_inplace_reserve(ip);
915 if (error)
916 goto out_gunlock_q;
917
918 error = gfs2_trans_begin(sdp,
919 sdp->sd_max_height + al->al_rgd->rd_length +
920 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
921 if (error)
922 goto out_ipres;
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out_end_trans;
927
928 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
929 if (gfs2_is_stuffed(ip)) {
930 error = gfs2_unstuff_dinode(ip, NULL);
931 if (error)
932 goto out_brelse;
933 }
934 }
935
936 ip->i_disksize = size;
937 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
938 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
939 gfs2_dinode_out(ip, dibh->b_data);
940
941out_brelse:
942 brelse(dibh);
943out_end_trans:
944 gfs2_trans_end(sdp);
945out_ipres:
946 gfs2_inplace_release(ip);
947out_gunlock_q:
948 gfs2_quota_unlock(ip);
949out:
950 gfs2_alloc_put(ip);
951 return error;
952}
953
954
955/**
956 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 887 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
957 * 888 *
958 * This is partly borrowed from ext3. 889 * This is partly borrowed from ext3.
959 */ 890 */
960static int gfs2_block_truncate_page(struct address_space *mapping) 891static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
961{ 892{
962 struct inode *inode = mapping->host; 893 struct inode *inode = mapping->host;
963 struct gfs2_inode *ip = GFS2_I(inode); 894 struct gfs2_inode *ip = GFS2_I(inode);
964 loff_t from = inode->i_size;
965 unsigned long index = from >> PAGE_CACHE_SHIFT; 895 unsigned long index = from >> PAGE_CACHE_SHIFT;
966 unsigned offset = from & (PAGE_CACHE_SIZE-1); 896 unsigned offset = from & (PAGE_CACHE_SIZE-1);
967 unsigned blocksize, iblock, length, pos; 897 unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
1023 return err; 953 return err;
1024} 954}
1025 955
1026static int trunc_start(struct gfs2_inode *ip, u64 size) 956static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1027{ 957{
1028 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 958 struct gfs2_inode *ip = GFS2_I(inode);
959 struct gfs2_sbd *sdp = GFS2_SB(inode);
960 struct address_space *mapping = inode->i_mapping;
1029 struct buffer_head *dibh; 961 struct buffer_head *dibh;
1030 int journaled = gfs2_is_jdata(ip); 962 int journaled = gfs2_is_jdata(ip);
1031 int error; 963 int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 if (error) 971 if (error)
1040 goto out; 972 goto out;
1041 973
974 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
975
1042 if (gfs2_is_stuffed(ip)) { 976 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_inode); 977 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1044 ip->i_disksize = size;
1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1047 gfs2_dinode_out(ip, dibh->b_data);
1048 if (dsize > dibh->b_size)
1049 dsize = dibh->b_size;
1050 gfs2_buffer_clear_tail(dibh, dsize);
1051 error = 1;
1052 } else { 978 } else {
1053 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 979 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1054 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 980 error = gfs2_block_truncate_page(mapping, newsize);
1055 981 if (error)
1056 if (!error) { 982 goto out_brelse;
1057 ip->i_disksize = size;
1058 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1059 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1060 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1061 gfs2_dinode_out(ip, dibh->b_data);
1062 } 983 }
984 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1063 } 985 }
1064 986
1065 brelse(dibh); 987 i_size_write(inode, newsize);
988 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
989 gfs2_dinode_out(ip, dibh->b_data);
1066 990
991 truncate_pagecache(inode, oldsize, newsize);
992out_brelse:
993 brelse(dibh);
1067out: 994out:
1068 gfs2_trans_end(sdp); 995 gfs2_trans_end(sdp);
1069 return error; 996 return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
1123 if (error) 1050 if (error)
1124 goto out; 1051 goto out;
1125 1052
1126 if (!ip->i_disksize) { 1053 if (!i_size_read(&ip->i_inode)) {
1127 ip->i_height = 0; 1054 ip->i_height = 0;
1128 ip->i_goal = ip->i_no_addr; 1055 ip->i_goal = ip->i_no_addr;
1129 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1056 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
1143 1070
1144/** 1071/**
1145 * do_shrink - make a file smaller 1072 * do_shrink - make a file smaller
1146 * @ip: the inode 1073 * @inode: the inode
1147 * @size: the size to make the file 1074 * @oldsize: the current inode size
1148 * @truncator: function to truncate the last partial block 1075 * @newsize: the size to make the file
1149 * 1076 *
1150 * Called with an exclusive lock on @ip. 1077 * Called with an exclusive lock on @inode. The @newsize must
1078 * be equal to or smaller than the current inode size.
1151 * 1079 *
1152 * Returns: errno 1080 * Returns: errno
1153 */ 1081 */
1154 1082
1155static int do_shrink(struct gfs2_inode *ip, u64 size) 1083static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1156{ 1084{
1085 struct gfs2_inode *ip = GFS2_I(inode);
1157 int error; 1086 int error;
1158 1087
1159 error = trunc_start(ip, size); 1088 error = trunc_start(inode, oldsize, newsize);
1160 if (error < 0) 1089 if (error < 0)
1161 return error; 1090 return error;
1162 if (error > 0) 1091 if (gfs2_is_stuffed(ip))
1163 return 0; 1092 return 0;
1164 1093
1165 error = trunc_dealloc(ip, size); 1094 error = trunc_dealloc(ip, newsize);
1166 if (!error) 1095 if (error == 0)
1167 error = trunc_end(ip); 1096 error = trunc_end(ip);
1168 1097
1169 return error; 1098 return error;
1170} 1099}
1171 1100
1172static int do_touch(struct gfs2_inode *ip, u64 size) 1101void gfs2_trim_blocks(struct inode *inode)
1173{ 1102{
1174 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1103 u64 size = inode->i_size;
1104 int ret;
1105
1106 ret = do_shrink(inode, size, size);
1107 WARN_ON(ret != 0);
1108}
1109
1110/**
1111 * do_grow - Touch and update inode size
1112 * @inode: The inode
1113 * @size: The new size
1114 *
1115 * This function updates the timestamps on the inode and
1116 * may also increase the size of the inode. This function
1117 * must not be called with @size any smaller than the current
1118 * inode size.
1119 *
1120 * Although it is not strictly required to unstuff files here,
1121 * earlier versions of GFS2 have a bug in the stuffed file reading
1122 * code which will result in a buffer overrun if the size is larger
1123 * than the max stuffed file size. In order to prevent this from
1124 * occurring, such files are unstuffed, but in other cases we can
1125 * just update the inode size directly.
1126 *
1127 * Returns: 0 on success, or -ve on error
1128 */
1129
1130static int do_grow(struct inode *inode, u64 size)
1131{
1132 struct gfs2_inode *ip = GFS2_I(inode);
1133 struct gfs2_sbd *sdp = GFS2_SB(inode);
1175 struct buffer_head *dibh; 1134 struct buffer_head *dibh;
1135 struct gfs2_alloc *al = NULL;
1176 int error; 1136 int error;
1177 1137
1178 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1138 if (gfs2_is_stuffed(ip) &&
1139 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1140 al = gfs2_alloc_get(ip);
1141 if (al == NULL)
1142 return -ENOMEM;
1143
1144 error = gfs2_quota_lock_check(ip);
1145 if (error)
1146 goto do_grow_alloc_put;
1147
1148 al->al_requested = 1;
1149 error = gfs2_inplace_reserve(ip);
1150 if (error)
1151 goto do_grow_qunlock;
1152 }
1153
1154 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
1179 if (error) 1155 if (error)
1180 return error; 1156 goto do_grow_release;
1181 1157
1182 down_write(&ip->i_rw_mutex); 1158 if (al) {
1159 error = gfs2_unstuff_dinode(ip, NULL);
1160 if (error)
1161 goto do_end_trans;
1162 }
1183 1163
1184 error = gfs2_meta_inode_buffer(ip, &dibh); 1164 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (error) 1165 if (error)
1186 goto do_touch_out; 1166 goto do_end_trans;
1187 1167
1168 i_size_write(inode, size);
1188 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1169 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1189 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1170 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1190 gfs2_dinode_out(ip, dibh->b_data); 1171 gfs2_dinode_out(ip, dibh->b_data);
1191 brelse(dibh); 1172 brelse(dibh);
1192 1173
1193do_touch_out: 1174do_end_trans:
1194 up_write(&ip->i_rw_mutex);
1195 gfs2_trans_end(sdp); 1175 gfs2_trans_end(sdp);
1176do_grow_release:
1177 if (al) {
1178 gfs2_inplace_release(ip);
1179do_grow_qunlock:
1180 gfs2_quota_unlock(ip);
1181do_grow_alloc_put:
1182 gfs2_alloc_put(ip);
1183 }
1196 return error; 1184 return error;
1197} 1185}
1198 1186
1199/** 1187/**
1200 * gfs2_truncatei - make a file a given size 1188 * gfs2_setattr_size - make a file a given size
1201 * @ip: the inode 1189 * @inode: the inode
1202 * @size: the size to make the file 1190 * @newsize: the size to make the file
1203 * @truncator: function to truncate the last partial block
1204 * 1191 *
1205 * The file size can grow, shrink, or stay the same size. 1192 * The file size can grow, shrink, or stay the same size. This
1193 * is called holding i_mutex and an exclusive glock on the inode
1194 * in question.
1206 * 1195 *
1207 * Returns: errno 1196 * Returns: errno
1208 */ 1197 */
1209 1198
1210int gfs2_truncatei(struct gfs2_inode *ip, u64 size) 1199int gfs2_setattr_size(struct inode *inode, u64 newsize)
1211{ 1200{
1212 int error; 1201 int ret;
1202 u64 oldsize;
1213 1203
1214 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1204 BUG_ON(!S_ISREG(inode->i_mode));
1215 return -EINVAL;
1216 1205
1217 if (size > ip->i_disksize) 1206 ret = inode_newsize_ok(inode, newsize);
1218 error = do_grow(ip, size); 1207 if (ret)
1219 else if (size < ip->i_disksize) 1208 return ret;
1220 error = do_shrink(ip, size);
1221 else
1222 /* update time stamps */
1223 error = do_touch(ip, size);
1224 1209
1225 return error; 1210 oldsize = inode->i_size;
1211 if (newsize >= oldsize)
1212 return do_grow(inode, newsize);
1213
1214 return do_shrink(inode, oldsize, newsize);
1226} 1215}
1227 1216
1228int gfs2_truncatei_resume(struct gfs2_inode *ip) 1217int gfs2_truncatei_resume(struct gfs2_inode *ip)
1229{ 1218{
1230 int error; 1219 int error;
1231 error = trunc_dealloc(ip, ip->i_disksize); 1220 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1232 if (!error) 1221 if (!error)
1233 error = trunc_end(ip); 1222 error = trunc_end(ip);
1234 return error; 1223 return error;
@@ -1244,13 +1233,12 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
1244 * @ip: the file being written to 1233 * @ip: the file being written to
1245 * @offset: the offset to write to 1234 * @offset: the offset to write to
1246 * @len: the number of bytes being written 1235 * @len: the number of bytes being written
1247 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1248 * 1236 *
1249 * Returns: errno 1237 * Returns: 1 if an alloc is required, 0 otherwise
1250 */ 1238 */
1251 1239
1252int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 1240int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1253 unsigned int len, int *alloc_required) 1241 unsigned int len)
1254{ 1242{
1255 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1243 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1256 struct buffer_head bh; 1244 struct buffer_head bh;
@@ -1258,26 +1246,23 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1258 u64 lblock, lblock_stop, size; 1246 u64 lblock, lblock_stop, size;
1259 u64 end_of_file; 1247 u64 end_of_file;
1260 1248
1261 *alloc_required = 0;
1262
1263 if (!len) 1249 if (!len)
1264 return 0; 1250 return 0;
1265 1251
1266 if (gfs2_is_stuffed(ip)) { 1252 if (gfs2_is_stuffed(ip)) {
1267 if (offset + len > 1253 if (offset + len >
1268 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) 1254 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1269 *alloc_required = 1; 1255 return 1;
1270 return 0; 1256 return 0;
1271 } 1257 }
1272 1258
1273 *alloc_required = 1;
1274 shift = sdp->sd_sb.sb_bsize_shift; 1259 shift = sdp->sd_sb.sb_bsize_shift;
1275 BUG_ON(gfs2_is_dir(ip)); 1260 BUG_ON(gfs2_is_dir(ip));
1276 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; 1261 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1277 lblock = offset >> shift; 1262 lblock = offset >> shift;
1278 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1263 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1279 if (lblock_stop > end_of_file) 1264 if (lblock_stop > end_of_file)
1280 return 0; 1265 return 1;
1281 1266
1282 size = (lblock_stop - lblock) << shift; 1267 size = (lblock_stop - lblock) << shift;
1283 do { 1268 do {
@@ -1285,12 +1270,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1285 bh.b_size = size; 1270 bh.b_size = size;
1286 gfs2_block_map(&ip->i_inode, lblock, &bh, 0); 1271 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1287 if (!buffer_mapped(&bh)) 1272 if (!buffer_mapped(&bh))
1288 return 0; 1273 return 1;
1289 size -= bh.b_size; 1274 size -= bh.b_size;
1290 lblock += (bh.b_size >> ip->i_inode.i_blkbits); 1275 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1291 } while(size > 0); 1276 } while(size > 0);
1292 1277
1293 *alloc_required = 0;
1294 return 0; 1278 return 0;
1295} 1279}
1296 1280
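
gfs2_write_alloc_required() now reports its answer in the return value (1 if the write needs an allocation, 0 if the blocks are already mapped) instead of through an output parameter, which is what lets every caller above shed an error check. A caller sketch, with a hypothetical helper name:

static int example_will_allocate(struct gfs2_inode *ip, u64 pos, unsigned len)
{
	int alloc_required = gfs2_write_alloc_required(ip, pos, len);

	if (alloc_required) {
		/* reserve quota and resource-group blocks before
		   starting the transaction */
	}
	return alloc_required;
}
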
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index c983177e05a..42fea03e2bd 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
44 } 44 }
45} 45}
46 46
47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48extern int gfs2_block_map(struct inode *inode, sector_t lblock,
49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49 struct buffer_head *bh, int create);
50 50extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
51int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51 u64 *dblock, unsigned *extlen);
52int gfs2_truncatei_resume(struct gfs2_inode *ip); 52extern int gfs2_setattr_size(struct inode *inode, u64 size);
53int gfs2_file_dealloc(struct gfs2_inode *ip); 53extern void gfs2_trim_blocks(struct inode *inode);
54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
55 unsigned int len, int *alloc_required); 55extern int gfs2_file_dealloc(struct gfs2_inode *ip);
56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
57 unsigned int len);
56 58
57#endif /* __BMAP_DOT_H__ */ 59#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d..6798755b385 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
49 ip = GFS2_I(inode); 49 ip = GFS2_I(inode);
50 } 50 }
51 51
52 if (sdp->sd_args.ar_localcaching) 52 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
53 goto valid; 53 goto valid;
54 54
55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); 55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 6b48d7c268b..5c356d09c32 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) 79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) 80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
81 81
82struct qstr gfs2_qdot __read_mostly;
83struct qstr gfs2_qdotdot __read_mostly;
84
82typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, 85typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
83 u64 leaf_no, void *data); 86 u64 leaf_no, void *data);
84typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, 87typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
127 130
128 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 131 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
129 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 132 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
130 if (ip->i_disksize < offset + size) 133 if (ip->i_inode.i_size < offset + size)
131 ip->i_disksize = offset + size; 134 i_size_write(&ip->i_inode, offset + size);
132 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 135 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
133 gfs2_dinode_out(ip, dibh->b_data); 136 gfs2_dinode_out(ip, dibh->b_data);
134 137
@@ -225,8 +228,8 @@ out:
225 if (error) 228 if (error)
226 return error; 229 return error;
227 230
228 if (ip->i_disksize < offset + copied) 231 if (ip->i_inode.i_size < offset + copied)
229 ip->i_disksize = offset + copied; 232 i_size_write(&ip->i_inode, offset + copied);
230 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 233 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
231 234
232 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 235 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
275 unsigned int o; 278 unsigned int o;
276 int copied = 0; 279 int copied = 0;
277 int error = 0; 280 int error = 0;
281 u64 disksize = i_size_read(&ip->i_inode);
278 282
279 if (offset >= ip->i_disksize) 283 if (offset >= disksize)
280 return 0; 284 return 0;
281 285
282 if (offset + size > ip->i_disksize) 286 if (offset + size > disksize)
283 size = ip->i_disksize - offset; 287 size = disksize - offset;
284 288
285 if (!size) 289 if (!size)
286 return 0; 290 return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
727 unsigned hsize = 1 << ip->i_depth; 731 unsigned hsize = 1 << ip->i_depth;
728 unsigned index; 732 unsigned index;
729 u64 ln; 733 u64 ln;
730 if (hsize * sizeof(u64) != ip->i_disksize) { 734 if (hsize * sizeof(u64) != i_size_read(inode)) {
731 gfs2_consist_inode(ip); 735 gfs2_consist_inode(ip);
732 return ERR_PTR(-EIO); 736 return ERR_PTR(-EIO);
733 } 737 }
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
879 for (x = sdp->sd_hash_ptrs; x--; lp++) 883 for (x = sdp->sd_hash_ptrs; x--; lp++)
880 *lp = cpu_to_be64(bn); 884 *lp = cpu_to_be64(bn);
881 885
882 dip->i_disksize = sdp->sd_sb.sb_bsize / 2; 886 i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
883 gfs2_add_inode_blocks(&dip->i_inode, 1); 887 gfs2_add_inode_blocks(&dip->i_inode, 1);
884 dip->i_diskflags |= GFS2_DIF_EXHASH; 888 dip->i_diskflags |= GFS2_DIF_EXHASH;
885 889
@@ -955,7 +959,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
955 /* Change the pointers. 959 /* Change the pointers.
956 Don't bother distinguishing stuffed from non-stuffed. 960 Don't bother distinguishing stuffed from non-stuffed.
957 This code is complicated enough already. */ 961 This code is complicated enough already. */
958 lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL); 962 lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS);
963 if (!lp) {
964 error = -ENOMEM;
965 goto fail_brelse;
966 }
967
959 /* Change the pointers */ 968 /* Change the pointers */
960 for (x = 0; x < half_len; x++) 969 for (x = 0; x < half_len; x++)
961 lp[x] = cpu_to_be64(bn); 970 lp[x] = cpu_to_be64(bn);
@@ -1052,20 +1061,23 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1052 u64 *buf; 1061 u64 *buf;
1053 u64 *from, *to; 1062 u64 *from, *to;
1054 u64 block; 1063 u64 block;
1064 u64 disksize = i_size_read(&dip->i_inode);
1055 int x; 1065 int x;
1056 int error = 0; 1066 int error = 0;
1057 1067
1058 hsize = 1 << dip->i_depth; 1068 hsize = 1 << dip->i_depth;
1059 if (hsize * sizeof(u64) != dip->i_disksize) { 1069 if (hsize * sizeof(u64) != disksize) {
1060 gfs2_consist_inode(dip); 1070 gfs2_consist_inode(dip);
1061 return -EIO; 1071 return -EIO;
1062 } 1072 }
1063 1073
1064 /* Allocate both the "from" and "to" buffers in one big chunk */ 1074 /* Allocate both the "from" and "to" buffers in one big chunk */
1065 1075
1066 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); 1076 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS);
1077 if (!buf)
1078 return -ENOMEM;
1067 1079
1068 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { 1080 for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
1069 error = gfs2_dir_read_data(dip, (char *)buf, 1081 error = gfs2_dir_read_data(dip, (char *)buf,
1070 block * sdp->sd_hash_bsize, 1082 block * sdp->sd_hash_bsize,
1071 sdp->sd_hash_bsize, 1); 1083 sdp->sd_hash_bsize, 1);
@@ -1363,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1363 unsigned depth = 0; 1375 unsigned depth = 0;
1364 1376
1365 hsize = 1 << dip->i_depth; 1377 hsize = 1 << dip->i_depth;
1366 if (hsize * sizeof(u64) != dip->i_disksize) { 1378 if (hsize * sizeof(u64) != i_size_read(inode)) {
1367 gfs2_consist_inode(dip); 1379 gfs2_consist_inode(dip);
1368 return -EIO; 1380 return -EIO;
1369 } 1381 }
@@ -1777,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1777 int error = 0; 1789 int error = 0;
1778 1790
1779 hsize = 1 << dip->i_depth; 1791 hsize = 1 << dip->i_depth;
1780 if (hsize * sizeof(u64) != dip->i_disksize) { 1792 if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
1781 gfs2_consist_inode(dip); 1793 gfs2_consist_inode(dip);
1782 return -EIO; 1794 return -EIO;
1783 } 1795 }
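The dir.c hunks above replace the private ip->i_disksize copy with the generic inode size, read and written through the VFS accessors. A sketch of the pattern, assuming the caller already serializes size updates (here under the glock, elsewhere typically under i_mutex), since i_size_write() itself is not atomic on 32-bit SMP:

	u64 size = i_size_read(&ip->i_inode);	/* tear-free even on 32-bit */

	if (size < offset + copied)
		i_size_write(&ip->i_inode, offset + copied);	/* writers must be serialized */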
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3b..a98f644bd3d 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); 20extern struct inode *gfs2_dir_search(struct inode *dir,
21int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 21 const struct qstr *filename);
22 const struct gfs2_inode *ip); 22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 23 const struct gfs2_inode *ip);
24 const struct gfs2_inode *ip, unsigned int type); 24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 25 const struct gfs2_inode *ip, unsigned int type);
26int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
27 filldir_t filldir); 27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
28int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 28 filldir_t filldir);
29 const struct gfs2_inode *nip, unsigned int new_type); 29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type);
30 31
31int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 32extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
32 33
33int gfs2_diradd_alloc_required(struct inode *dir, 34extern int gfs2_diradd_alloc_required(struct inode *dir,
34 const struct qstr *filename); 35 const struct qstr *filename);
35int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 36extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
36 struct buffer_head **bhp); 37 struct buffer_head **bhp);
37 38
38static inline u32 gfs2_disk_hash(const char *data, int len) 39static inline u32 gfs2_disk_hash(const char *data, int len)
39{ 40{
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
61 memcpy(dent + 1, name->name, name->len); 62 memcpy(dent + 1, name->name, name->len);
62} 63}
63 64
65extern struct qstr gfs2_qdot;
66extern struct qstr gfs2_qdotdot;
67
64#endif /* __DIR_DOT_H__ */ 68#endif /* __DIR_DOT_H__ */
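The new gfs2_qdot/gfs2_qdotdot exports let every "." and ".." lookup share two static qstrs instead of rebuilding them on the stack each time (the XXX comment removed from export.c below suggested exactly this). They are filled in once at module init, as the main.c hunk further down shows:

	gfs2_str2qstr(&gfs2_qdot, ".");
	gfs2_str2qstr(&gfs2_qdotdot, "..");

	/* later, e.g. when resolving a parent directory: */
	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));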
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8a..06d582732d3 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct qstr dotdot;
130 struct dentry *dentry; 129 struct dentry *dentry;
131 130
132 /* 131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
133 * XXX(hch): it would be a good idea to keep this around as a
134 * static variable.
135 */
136 gfs2_str2qstr(&dotdot, "..");
137
138 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
139 if (!IS_ERR(dentry)) 132 if (!IS_ERR(dentry))
140 dentry->d_op = &gfs2_dops; 133 dentry->d_op = &gfs2_dops;
141 return dentry; 134 return dentry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ed9a94f0ef1..aa996471ec5 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -351,7 +351,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
351 unsigned long last_index; 351 unsigned long last_index;
352 u64 pos = page->index << PAGE_CACHE_SHIFT; 352 u64 pos = page->index << PAGE_CACHE_SHIFT;
353 unsigned int data_blocks, ind_blocks, rblocks; 353 unsigned int data_blocks, ind_blocks, rblocks;
354 int alloc_required = 0;
355 struct gfs2_holder gh; 354 struct gfs2_holder gh;
356 struct gfs2_alloc *al; 355 struct gfs2_alloc *al;
357 int ret; 356 int ret;
@@ -364,8 +363,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
364 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); 363 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
365 set_bit(GIF_SW_PAGED, &ip->i_flags); 364 set_bit(GIF_SW_PAGED, &ip->i_flags);
366 365
367 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); 366 if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE))
368 if (ret || !alloc_required)
369 goto out_unlock; 367 goto out_unlock;
370 ret = -ENOMEM; 368 ret = -ENOMEM;
371 al = gfs2_alloc_get(ip); 369 al = gfs2_alloc_get(ip);
@@ -384,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
384 rblocks = RES_DINODE + ind_blocks; 382 rblocks = RES_DINODE + ind_blocks;
385 if (gfs2_is_jdata(ip)) 383 if (gfs2_is_jdata(ip))
386 rblocks += data_blocks ? data_blocks : 1; 384 rblocks += data_blocks ? data_blocks : 1;
387 if (ind_blocks || data_blocks) 385 if (ind_blocks || data_blocks) {
388 rblocks += RES_STATFS + RES_QUOTA; 386 rblocks += RES_STATFS + RES_QUOTA;
387 rblocks += gfs2_rg_blocks(al);
388 }
389 ret = gfs2_trans_begin(sdp, rblocks, 0); 389 ret = gfs2_trans_begin(sdp, rblocks, 0);
390 if (ret) 390 if (ret)
391 goto out_trans_fail; 391 goto out_trans_fail;
@@ -493,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
493 goto fail; 493 goto fail;
494 494
495 if (!(file->f_flags & O_LARGEFILE) && 495 if (!(file->f_flags & O_LARGEFILE) &&
496 ip->i_disksize > MAX_NON_LFS) { 496 i_size_read(inode) > MAX_NON_LFS) {
497 error = -EOVERFLOW; 497 error = -EOVERFLOW;
498 goto fail_gunlock; 498 goto fail_gunlock;
499 } 499 }
@@ -622,6 +622,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
622 * cluster; until we do, disable leases (by just returning -EINVAL), 622 * cluster; until we do, disable leases (by just returning -EINVAL),
623 * unless the administrator has requested purely local locking. 623 * unless the administrator has requested purely local locking.
624 * 624 *
625 * Locking: called under lock_flocks
626 *
625 * Returns: errno 627 * Returns: errno
626 */ 628 */
627 629
@@ -773,6 +775,7 @@ const struct file_operations gfs2_dir_fops = {
773 .fsync = gfs2_fsync, 775 .fsync = gfs2_fsync,
774 .lock = gfs2_lock, 776 .lock = gfs2_lock,
775 .flock = gfs2_flock, 777 .flock = gfs2_flock,
778 .llseek = default_llseek,
776}; 779};
777 780
778#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ 781#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
@@ -799,5 +802,6 @@ const struct file_operations gfs2_dir_fops_nolock = {
799 .open = gfs2_open, 802 .open = gfs2_open,
800 .release = gfs2_close, 803 .release = gfs2_close,
801 .fsync = gfs2_fsync, 804 .fsync = gfs2_fsync,
805 .llseek = default_llseek,
802}; 806};
803 807
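Besides the reservation changes, both directory file_operations now set .llseek explicitly. Presumably this is part of the tree-wide push at the time to spell out llseek in every fops so that the implicit, BKL-protected default could be removed; default_llseek preserves the old behaviour. A hedged sketch of the idiom:

	static const struct file_operations example_dir_fops = {
		.read	= generic_read_dir,
		.llseek	= default_llseek,	/* explicit, instead of relying on a default */
	};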
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 0898f3ec821..87778857f09 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -328,6 +328,30 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
328} 328}
329 329
330/** 330/**
331 * do_error - Something unexpected has happened during a lock request
332 *
333 */
334
335static inline void do_error(struct gfs2_glock *gl, const int ret)
336{
337 struct gfs2_holder *gh, *tmp;
338
339 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
340 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
341 continue;
342 if (ret & LM_OUT_ERROR)
343 gh->gh_error = -EIO;
344 else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
345 gh->gh_error = GLR_TRYFAILED;
346 else
347 continue;
348 list_del_init(&gh->gh_list);
349 trace_gfs2_glock_queue(gh, 0);
350 gfs2_holder_wake(gh);
351 }
352}
353
354/**
331 * do_promote - promote as many requests as possible on the current queue 355 * do_promote - promote as many requests as possible on the current queue
332 * @gl: The glock 356 * @gl: The glock
333 * 357 *
@@ -375,36 +399,13 @@ restart:
375 } 399 }
376 if (gh->gh_list.prev == &gl->gl_holders) 400 if (gh->gh_list.prev == &gl->gl_holders)
377 return 1; 401 return 1;
402 do_error(gl, 0);
378 break; 403 break;
379 } 404 }
380 return 0; 405 return 0;
381} 406}
382 407
383/** 408/**
384 * do_error - Something unexpected has happened during a lock request
385 *
386 */
387
388static inline void do_error(struct gfs2_glock *gl, const int ret)
389{
390 struct gfs2_holder *gh, *tmp;
391
392 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
393 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
394 continue;
395 if (ret & LM_OUT_ERROR)
396 gh->gh_error = -EIO;
397 else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
398 gh->gh_error = GLR_TRYFAILED;
399 else
400 continue;
401 list_del_init(&gh->gh_list);
402 trace_gfs2_glock_queue(gh, 0);
403 gfs2_holder_wake(gh);
404 }
405}
406
407/**
408 * find_first_waiter - find the first gh that's waiting for the glock 409 * find_first_waiter - find the first gh that's waiting for the glock
409 * @gl: the glock 410 * @gl: the glock
410 */ 411 */
@@ -440,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
440 else 441 else
441 gfs2_glock_put_nolock(gl); 442 gfs2_glock_put_nolock(gl);
442 } 443 }
444 if (held1 && held2 && list_empty(&gl->gl_holders))
445 clear_bit(GLF_QUEUED, &gl->gl_flags);
443 446
444 gl->gl_state = new_state; 447 gl->gl_state = new_state;
445 gl->gl_tchange = jiffies; 448 gl->gl_tchange = jiffies;
@@ -706,18 +709,8 @@ static void glock_work_func(struct work_struct *work)
706{ 709{
707 unsigned long delay = 0; 710 unsigned long delay = 0;
708 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); 711 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
709 struct gfs2_holder *gh;
710 int drop_ref = 0; 712 int drop_ref = 0;
711 713
712 if (unlikely(test_bit(GLF_FROZEN, &gl->gl_flags))) {
713 spin_lock(&gl->gl_spin);
714 gh = find_first_waiter(gl);
715 if (gh && (gh->gh_flags & LM_FLAG_NOEXP) &&
716 test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
717 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
718 spin_unlock(&gl->gl_spin);
719 }
720
721 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { 714 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
722 finish_xmote(gl, gl->gl_reply); 715 finish_xmote(gl, gl->gl_reply);
723 drop_ref = 1; 716 drop_ref = 1;
@@ -1021,6 +1014,7 @@ fail:
1021 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) 1014 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
1022 insert_pt = &gh2->gh_list; 1015 insert_pt = &gh2->gh_list;
1023 } 1016 }
1017 set_bit(GLF_QUEUED, &gl->gl_flags);
1024 if (likely(insert_pt == NULL)) { 1018 if (likely(insert_pt == NULL)) {
1025 list_add_tail(&gh->gh_list, &gl->gl_holders); 1019 list_add_tail(&gh->gh_list, &gl->gl_holders);
1026 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 1020 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1072,6 +1066,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
1072 1066
1073 spin_lock(&gl->gl_spin); 1067 spin_lock(&gl->gl_spin);
1074 add_to_queue(gh); 1068 add_to_queue(gh);
1069 if ((LM_FLAG_NOEXP & gh->gh_flags) &&
1070 test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
1071 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1075 run_queue(gl, 1); 1072 run_queue(gl, 1);
1076 spin_unlock(&gl->gl_spin); 1073 spin_unlock(&gl->gl_spin);
1077 1074
@@ -1316,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1316 1313
1317 gfs2_glock_hold(gl); 1314 gfs2_glock_hold(gl);
1318 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1315 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1319 if (time_before(now, holdtime)) 1316 if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
1320 delay = holdtime - now; 1317 if (time_before(now, holdtime))
1321 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 1318 delay = holdtime - now;
1322 delay = gl->gl_ops->go_min_hold_time; 1319 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1320 delay = gl->gl_ops->go_min_hold_time;
1321 }
1323 1322
1324 spin_lock(&gl->gl_spin); 1323 spin_lock(&gl->gl_spin);
1325 handle_callback(gl, state, delay); 1324 handle_callback(gl, state, delay);
@@ -1329,6 +1328,36 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1329} 1328}
1330 1329
1331/** 1330/**
1331 * gfs2_should_freeze - Figure out if glock should be frozen
1332 * @gl: The glock in question
1333 *
1334 * Glocks are not frozen if (a) the result of the dlm operation is
 1335 * an error, (b) the locking operation was an unlock operation, or
 1336 * (c) there is a "noexp" flagged request anywhere in the queue.
1337 *
1338 * Returns: 1 if freezing should occur, 0 otherwise
1339 */
1340
1341static int gfs2_should_freeze(const struct gfs2_glock *gl)
1342{
1343 const struct gfs2_holder *gh;
1344
1345 if (gl->gl_reply & ~LM_OUT_ST_MASK)
1346 return 0;
1347 if (gl->gl_target == LM_ST_UNLOCKED)
1348 return 0;
1349
1350 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1351 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1352 continue;
1353 if (LM_FLAG_NOEXP & gh->gh_flags)
1354 return 0;
1355 }
1356
1357 return 1;
1358}
1359
1360/**
1332 * gfs2_glock_complete - Callback used by locking 1361 * gfs2_glock_complete - Callback used by locking
1333 * @gl: Pointer to the glock 1362 * @gl: Pointer to the glock
1334 * @ret: The return value from the dlm 1363 * @ret: The return value from the dlm
@@ -1338,18 +1367,17 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1338void gfs2_glock_complete(struct gfs2_glock *gl, int ret) 1367void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1339{ 1368{
1340 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 1369 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1370
1341 gl->gl_reply = ret; 1371 gl->gl_reply = ret;
1372
1342 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1373 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1343 struct gfs2_holder *gh;
1344 spin_lock(&gl->gl_spin); 1374 spin_lock(&gl->gl_spin);
1345 gh = find_first_waiter(gl); 1375 if (gfs2_should_freeze(gl)) {
1346 if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
1347 (gl->gl_target != LM_ST_UNLOCKED)) ||
1348 ((ret & ~LM_OUT_ST_MASK) != 0))
1349 set_bit(GLF_FROZEN, &gl->gl_flags); 1376 set_bit(GLF_FROZEN, &gl->gl_flags);
1350 spin_unlock(&gl->gl_spin); 1377 spin_unlock(&gl->gl_spin);
1351 if (test_bit(GLF_FROZEN, &gl->gl_flags))
1352 return; 1378 return;
1379 }
1380 spin_unlock(&gl->gl_spin);
1353 } 1381 }
1354 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1382 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1355 gfs2_glock_hold(gl); 1383 gfs2_glock_hold(gl);
@@ -1489,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
1489 spin_unlock(&lru_lock); 1517 spin_unlock(&lru_lock);
1490 1518
1491 spin_lock(&gl->gl_spin); 1519 spin_lock(&gl->gl_spin);
1492 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1520 if (gl->gl_state != LM_ST_UNLOCKED)
1493 handle_callback(gl, LM_ST_UNLOCKED, 0); 1521 handle_callback(gl, LM_ST_UNLOCKED, 0);
1494 spin_unlock(&gl->gl_spin); 1522 spin_unlock(&gl->gl_spin);
1495 gfs2_glock_hold(gl); 1523 gfs2_glock_hold(gl);
@@ -1637,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1637 *p++ = 'I'; 1665 *p++ = 'I';
1638 if (test_bit(GLF_FROZEN, gflags)) 1666 if (test_bit(GLF_FROZEN, gflags))
1639 *p++ = 'F'; 1667 *p++ = 'F';
1668 if (test_bit(GLF_QUEUED, gflags))
1669 *p++ = 'q';
1640 *p = 0; 1670 *p = 0;
1641 return buf; 1671 return buf;
1642} 1672}
@@ -1753,10 +1783,12 @@ int __init gfs2_glock_init(void)
1753 } 1783 }
1754#endif 1784#endif
1755 1785
1756 glock_workqueue = create_workqueue("glock_workqueue"); 1786 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
1787 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1757 if (IS_ERR(glock_workqueue)) 1788 if (IS_ERR(glock_workqueue))
1758 return PTR_ERR(glock_workqueue); 1789 return PTR_ERR(glock_workqueue);
1759 gfs2_delete_workqueue = create_workqueue("delete_workqueue"); 1790 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
1791 WQ_FREEZEABLE, 0);
1760 if (IS_ERR(gfs2_delete_workqueue)) { 1792 if (IS_ERR(gfs2_delete_workqueue)) {
1761 destroy_workqueue(glock_workqueue); 1793 destroy_workqueue(glock_workqueue);
1762 return PTR_ERR(gfs2_delete_workqueue); 1794 return PTR_ERR(gfs2_delete_workqueue);
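One detail worth noting in the workqueue conversion above: alloc_workqueue() reports failure as NULL rather than as an ERR_PTR, so a NULL test is the idiomatic check (the main.c hunk below tests gfs_recovery_wq exactly that way). A small sketch under that assumption:

	struct workqueue_struct *wq;

	wq = alloc_workqueue("glock_workqueue",
			     WQ_RESCUER | WQ_HIGHPRI | WQ_FREEZEABLE, 0);
	if (!wq)		/* alloc_workqueue() returns NULL on failure */
		return -ENOMEM;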
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b15..db1c26d6d22 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 216
217/** 217/**
218 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 218 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
219 * @gl: the glock 219 * @gl: the glock
220 * @state: the state we're requesting 220 * @state: the state we're requesting
221 * @flags: the modifier flags 221 * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb69..0d149dcc04e 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
262 const struct gfs2_inode *ip = gl->gl_object; 262 const struct gfs2_inode *ip = gl->gl_object;
263 if (ip == NULL) 263 if (ip == NULL)
264 return 0; 264 return 0;
265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", 265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
266 (unsigned long long)ip->i_no_formal_ino, 266 (unsigned long long)ip->i_no_formal_ino,
267 (unsigned long long)ip->i_no_addr, 267 (unsigned long long)ip->i_no_addr,
268 IF2DT(ip->i_inode.i_mode), ip->i_flags, 268 IF2DT(ip->i_inode.i_mode), ip->i_flags,
269 (unsigned int)ip->i_diskflags, 269 (unsigned int)ip->i_diskflags,
270 (unsigned long long)ip->i_inode.i_size, 270 (unsigned long long)i_size_read(&ip->i_inode));
271 (unsigned long long)ip->i_disksize);
272 return 0; 271 return 0;
273} 272}
274 273
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
453 [LM_TYPE_META] = &gfs2_meta_glops, 452 [LM_TYPE_META] = &gfs2_meta_glops,
454 [LM_TYPE_INODE] = &gfs2_inode_glops, 453 [LM_TYPE_INODE] = &gfs2_inode_glops,
455 [LM_TYPE_RGRP] = &gfs2_rgrp_glops, 454 [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
456 [LM_TYPE_NONDISK] = &gfs2_trans_glops,
457 [LM_TYPE_IOPEN] = &gfs2_iopen_glops, 455 [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
458 [LM_TYPE_FLOCK] = &gfs2_flock_glops, 456 [LM_TYPE_FLOCK] = &gfs2_flock_glops,
459 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, 457 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b5d7363b22d..764fbb49efc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,7 +12,6 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/slow-work.h>
16#include <linux/dlm.h> 15#include <linux/dlm.h>
17#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
18 17
@@ -197,6 +196,7 @@ enum {
197 GLF_REPLY_PENDING = 9, 196 GLF_REPLY_PENDING = 9,
198 GLF_INITIAL = 10, 197 GLF_INITIAL = 10,
199 GLF_FROZEN = 11, 198 GLF_FROZEN = 11,
199 GLF_QUEUED = 12,
200}; 200};
201 201
202struct gfs2_glock { 202struct gfs2_glock {
@@ -268,7 +268,6 @@ struct gfs2_inode {
268 u64 i_no_formal_ino; 268 u64 i_no_formal_ino;
269 u64 i_generation; 269 u64 i_generation;
270 u64 i_eattr; 270 u64 i_eattr;
271 loff_t i_disksize;
272 unsigned long i_flags; /* GIF_... */ 271 unsigned long i_flags; /* GIF_... */
273 struct gfs2_glock *i_gl; /* Move into i_gh? */ 272 struct gfs2_glock *i_gl; /* Move into i_gh? */
274 struct gfs2_holder i_iopen_gh; 273 struct gfs2_holder i_iopen_gh;
@@ -383,7 +382,7 @@ struct gfs2_journal_extent {
383struct gfs2_jdesc { 382struct gfs2_jdesc {
384 struct list_head jd_list; 383 struct list_head jd_list;
385 struct list_head extent_list; 384 struct list_head extent_list;
386 struct slow_work jd_work; 385 struct work_struct jd_work;
387 struct inode *jd_inode; 386 struct inode *jd_inode;
388 unsigned long jd_flags; 387 unsigned long jd_flags;
389#define JDF_RECOVERY 1 388#define JDF_RECOVERY 1
@@ -417,11 +416,8 @@ struct gfs2_args {
417 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 416 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
418 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 417 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
419 unsigned int ar_spectator:1; /* Don't get a journal */ 418 unsigned int ar_spectator:1; /* Don't get a journal */
420 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
421 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ 419 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
422 unsigned int ar_localcaching:1; /* Local caching */
423 unsigned int ar_debug:1; /* Oops on errors */ 420 unsigned int ar_debug:1; /* Oops on errors */
424 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
425 unsigned int ar_posix_acl:1; /* Enable posix acls */ 421 unsigned int ar_posix_acl:1; /* Enable posix acls */
426 unsigned int ar_quota:2; /* off/account/on */ 422 unsigned int ar_quota:2; /* off/account/on */
427 unsigned int ar_suiddir:1; /* suiddir support */ 423 unsigned int ar_suiddir:1; /* suiddir support */
@@ -460,6 +456,7 @@ enum {
460 SDF_NOBARRIERS = 3, 456 SDF_NOBARRIERS = 3,
461 SDF_NORECOVERY = 4, 457 SDF_NORECOVERY = 4,
462 SDF_DEMOTE = 5, 458 SDF_DEMOTE = 5,
459 SDF_NOJOURNALID = 6,
463}; 460};
464 461
465#define GFS2_FSNAME_LEN 256 462#define GFS2_FSNAME_LEN 256
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
497 */ 494 */
498 495
499struct lm_lockstruct { 496struct lm_lockstruct {
500 unsigned int ls_jid; 497 int ls_jid;
501 unsigned int ls_first; 498 unsigned int ls_first;
502 unsigned int ls_first_done; 499 unsigned int ls_first_done;
503 unsigned int ls_nodir; 500 unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
572 struct list_head sd_rindex_mru_list; 569 struct list_head sd_rindex_mru_list;
573 struct gfs2_rgrpd *sd_rindex_forward; 570 struct gfs2_rgrpd *sd_rindex_forward;
574 unsigned int sd_rgrps; 571 unsigned int sd_rgrps;
572 unsigned int sd_max_rg_data;
575 573
576 /* Journal index stuff */ 574 /* Journal index stuff */
577 575
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f03afd9c44b..06370f8bd8c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -84,7 +84,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
84 struct gfs2_skip_data *data = opaque; 84 struct gfs2_skip_data *data = opaque;
85 85
86 if (ip->i_no_addr == data->no_addr) { 86 if (ip->i_no_addr == data->no_addr) {
87 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ 87 if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88 data->skipped = 1; 88 data->skipped = 1;
89 return 0; 89 return 0;
90 } 90 }
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
359 * to do that. 359 * to do that.
360 */ 360 */
361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
362 ip->i_disksize = be64_to_cpu(str->di_size); 362 i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
363 i_size_write(&ip->i_inode, ip->i_disksize);
364 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 363 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
365 atime.tv_sec = be64_to_cpu(str->di_atime); 364 atime.tv_sec = be64_to_cpu(str->di_atime);
366 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 365 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -991,18 +990,29 @@ fail:
991 990
992static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 991static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
993{ 992{
993 struct inode *inode = &ip->i_inode;
994 struct buffer_head *dibh; 994 struct buffer_head *dibh;
995 int error; 995 int error;
996 996
997 error = gfs2_meta_inode_buffer(ip, &dibh); 997 error = gfs2_meta_inode_buffer(ip, &dibh);
998 if (!error) { 998 if (error)
999 error = inode_setattr(&ip->i_inode, attr); 999 return error;
1000 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); 1000
1001 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1001 if ((attr->ia_valid & ATTR_SIZE) &&
1002 gfs2_dinode_out(ip, dibh->b_data); 1002 attr->ia_size != i_size_read(inode)) {
1003 brelse(dibh); 1003 error = vmtruncate(inode, attr->ia_size);
1004 if (error)
1005 return error;
1004 } 1006 }
1005 return error; 1007
1008 setattr_copy(inode, attr);
1009 mark_inode_dirty(inode);
1010
1011 gfs2_assert_warn(GFS2_SB(inode), !error);
1012 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1013 gfs2_dinode_out(ip, dibh->b_data);
1014 brelse(dibh);
1015 return 0;
1006} 1016}
1007 1017
1008/** 1018/**
@@ -1044,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1044 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 1054 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1045 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1055 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1046 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1056 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1047 str->di_size = cpu_to_be64(ip->i_disksize); 1057 str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
1048 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 1058 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1049 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1059 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1050 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1060 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1074,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
1074 (unsigned long long)ip->i_no_formal_ino); 1084 (unsigned long long)ip->i_no_formal_ino);
1075 printk(KERN_INFO " no_addr = %llu\n", 1085 printk(KERN_INFO " no_addr = %llu\n",
1076 (unsigned long long)ip->i_no_addr); 1086 (unsigned long long)ip->i_no_addr);
1077 printk(KERN_INFO " i_disksize = %llu\n", 1087 printk(KERN_INFO " i_size = %llu\n",
1078 (unsigned long long)ip->i_disksize); 1088 (unsigned long long)i_size_read(&ip->i_inode));
1079 printk(KERN_INFO " blocks = %llu\n", 1089 printk(KERN_INFO " blocks = %llu\n",
1080 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 1090 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1081 printk(KERN_INFO " i_goal = %llu\n", 1091 printk(KERN_INFO " i_goal = %llu\n",
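__gfs2_setattr_simple() now open-codes what inode_setattr() used to do, the replacement pattern filesystems adopted when that VFS helper went away: truncate first if the size is changing, then copy the remaining attributes and dirty the inode. The core of the idiom, as a sketch:

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = vmtruncate(inode, attr->ia_size);	/* may fail, e.g. -EFBIG */
		if (error)
			return error;
	}
	setattr_copy(inode, attr);	/* uid/gid/mode/times; cannot fail */
	mark_inode_dirty(inode);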
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21d..6720d7d5fbc 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
19extern int gfs2_internal_read(struct gfs2_inode *ip, 19extern int gfs2_internal_read(struct gfs2_inode *ip,
20 struct file_ra_state *ra_state, 20 struct file_ra_state *ra_state,
21 char *buf, loff_t *pos, unsigned size); 21 char *buf, loff_t *pos, unsigned size);
22extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
23 unsigned int from, unsigned int to);
22extern void gfs2_set_aops(struct inode *inode); 24extern void gfs2_set_aops(struct inode *inode);
23 25
24static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 26static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
80 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); 82 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
81} 83}
82 84
85static inline int gfs2_check_internal_file_size(struct inode *inode,
86 u64 minsize, u64 maxsize)
87{
88 u64 size = i_size_read(inode);
89 if (size < minsize || size > maxsize)
90 goto err;
91 if (size & ((1 << inode->i_blkbits) - 1))
92 goto err;
93 return 0;
94err:
95 gfs2_consist_inode(GFS2_I(inode));
96 return -EIO;
97}
83 98
84extern void gfs2_set_iop(struct inode *inode); 99extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
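The new gfs2_check_internal_file_size() helper gives later hunks a single place to sanity-check system files (journal, quota, rindex) whose size must fall in a sane range and be block-aligned; on violation it marks the inode inconsistent and returns -EIO. A usage sketch with purely illustrative bounds (the real callers pick their own min/max):

	/* hypothetical limits: at least 8 bytes, at most 1GiB */
	error = gfs2_check_internal_file_size(inode, 8, 1 << 30);
	if (error)
		return error;	/* gfs2_consist_inode() has already run */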
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c..1c09425b45f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
42 ret |= LM_OUT_CANCELED; 42 ret |= LM_OUT_CANCELED;
43 goto out; 43 goto out;
44 case -EAGAIN: /* Try lock fails */ 44 case -EAGAIN: /* Try lock fails */
45 case -EDEADLK: /* Deadlock detected */
45 goto out; 46 goto out;
46 case -EINVAL: /* Invalid */ 47 case -ETIMEDOUT: /* Canceled due to timeout */
47 case -ENOMEM: /* Out of memory */
48 ret |= LM_OUT_ERROR; 48 ret |= LM_OUT_ERROR;
49 goto out; 49 goto out;
50 case 0: /* Success */ 50 case 0: /* Success */
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6a857e24f94..eb01f3575e1 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
592 lh->lh_hash = cpu_to_be32(hash); 592 lh->lh_hash = cpu_to_be32(hash);
593 593
594 bh->b_end_io = end_buffer_write_sync; 594 bh->b_end_io = end_buffer_write_sync;
595 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
596 goto skip_barrier;
597 get_bh(bh); 595 get_bh(bh);
598 submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); 596 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
597 submit_bh(WRITE_SYNC | REQ_META, bh);
598 else
599 submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
599 wait_on_buffer(bh); 600 wait_on_buffer(bh);
600 if (buffer_eopnotsupp(bh)) { 601
601 clear_buffer_eopnotsupp(bh);
602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
605 lock_buffer(bh);
606skip_barrier:
607 get_bh(bh);
608 submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
609 wait_on_buffer(bh);
610 }
611 if (!buffer_uptodate(bh)) 602 if (!buffer_uptodate(bh))
612 gfs2_io_error_bh(sdp, bh); 603 gfs2_io_error_bh(sdp, bh);
613 brelse(bh); 604 brelse(bh);
@@ -932,7 +923,7 @@ int gfs2_logd(void *data)
932 923
933 do { 924 do {
934 prepare_to_wait(&sdp->sd_logd_waitq, &wait, 925 prepare_to_wait(&sdp->sd_logd_waitq, &wait,
935 TASK_UNINTERRUPTIBLE); 926 TASK_INTERRUPTIBLE);
936 if (!gfs2_ail_flush_reqd(sdp) && 927 if (!gfs2_ail_flush_reqd(sdp) &&
937 !gfs2_jrnl_flush_reqd(sdp) && 928 !gfs2_jrnl_flush_reqd(sdp) &&
938 !kthread_should_stop()) 929 !kthread_should_stop())
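The log-header write above drops the old BIO_RW_BARRIER two-step (issue a barrier, catch the -EOPNOTSUPP via buffer_eopnotsupp(), retry without) in favour of WRITE_FLUSH_FUA. With the flush/FUA interface the block layer emulates whatever the device cannot do natively, so the fallback path becomes dead weight and the surviving logic reduces to:

	get_bh(bh);
	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
		submit_bh(WRITE_SYNC | REQ_META, bh);	/* user asked for no barriers */
	else
		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
	wait_on_buffer(bh);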
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index fb2a5f93b7c..ebef7ab6e17 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,7 +15,6 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <asm/atomic.h> 17#include <asm/atomic.h>
18#include <linux/slow-work.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
21#include "incore.h" 20#include "incore.h"
@@ -24,6 +23,8 @@
24#include "util.h" 23#include "util.h"
25#include "glock.h" 24#include "glock.h"
26#include "quota.h" 25#include "quota.h"
26#include "recovery.h"
27#include "dir.h"
27 28
28static struct shrinker qd_shrinker = { 29static struct shrinker qd_shrinker = {
29 .shrink = gfs2_shrink_qd_memory, 30 .shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
78{ 79{
79 int error; 80 int error;
80 81
82 gfs2_str2qstr(&gfs2_qdot, ".");
83 gfs2_str2qstr(&gfs2_qdotdot, "..");
84
81 error = gfs2_sys_init(); 85 error = gfs2_sys_init();
82 if (error) 86 if (error)
83 return error; 87 return error;
@@ -138,9 +142,11 @@ static int __init init_gfs2_fs(void)
138 if (error) 142 if (error)
139 goto fail_unregister; 143 goto fail_unregister;
140 144
141 error = slow_work_register_user(THIS_MODULE); 145 error = -ENOMEM;
142 if (error) 146 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
143 goto fail_slow; 147 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
148 if (!gfs_recovery_wq)
149 goto fail_wq;
144 150
145 gfs2_register_debugfs(); 151 gfs2_register_debugfs();
146 152
@@ -148,7 +154,7 @@ static int __init init_gfs2_fs(void)
148 154
149 return 0; 155 return 0;
150 156
151fail_slow: 157fail_wq:
152 unregister_filesystem(&gfs2meta_fs_type); 158 unregister_filesystem(&gfs2meta_fs_type);
153fail_unregister: 159fail_unregister:
154 unregister_filesystem(&gfs2_fs_type); 160 unregister_filesystem(&gfs2_fs_type);
@@ -190,7 +196,7 @@ static void __exit exit_gfs2_fs(void)
190 gfs2_unregister_debugfs(); 196 gfs2_unregister_debugfs();
191 unregister_filesystem(&gfs2_fs_type); 197 unregister_filesystem(&gfs2_fs_type);
192 unregister_filesystem(&gfs2meta_fs_type); 198 unregister_filesystem(&gfs2meta_fs_type);
193 slow_work_unregister_user(THIS_MODULE); 199 destroy_workqueue(gfs_recovery_wq);
194 200
195 kmem_cache_destroy(gfs2_quotad_cachep); 201 kmem_cache_destroy(gfs2_quotad_cachep);
196 kmem_cache_destroy(gfs2_rgrpd_cachep); 202 kmem_cache_destroy(gfs2_rgrpd_cachep);
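Journal recovery moves from the slow-work infrastructure to a dedicated workqueue. Each jdesc carries a work_struct initialized against gfs2_recover_func (see the ops_fstype.c hunk below); recovery is then presumably kicked off with a plain queue_work() on gfs_recovery_wq, sketched here under that assumption:

	INIT_WORK(&jd->jd_work, gfs2_recover_func);
	/* later, when recovery of this journal is requested: */
	queue_work(gfs_recovery_wq, &jd->jd_work);	/* assumed call site, in recovery.c */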
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 18176d0b75d..939739c7b3f 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -36,8 +36,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
36{ 36{
37 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
38 int nr_underway = 0; 38 int nr_underway = 0;
39 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? 39 int write_op = REQ_META |
40 WRITE_SYNC_PLUG : WRITE)); 40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE);
41 41
42 BUG_ON(!PageLocked(page)); 42 BUG_ON(!PageLocked(page));
43 BUG_ON(!page_has_buffers(page)); 43 BUG_ON(!page_has_buffers(page));
@@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
55 * activity, but those code paths have their own higher-level 55 * activity, but those code paths have their own higher-level
56 * throttling. 56 * throttling.
57 */ 57 */
58 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 58 if (wbc->sync_mode != WB_SYNC_NONE) {
59 lock_buffer(bh); 59 lock_buffer(bh);
60 } else if (!trylock_buffer(bh)) { 60 } else if (!trylock_buffer(bh)) {
61 redirty_page_for_writepage(wbc, page); 61 redirty_page_for_writepage(wbc, page);
@@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
225 } 225 }
226 bh->b_end_io = end_buffer_read_sync; 226 bh->b_end_io = end_buffer_read_sync;
227 get_bh(bh); 227 get_bh(bh);
228 submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); 228 submit_bh(READ_SYNC | REQ_META, bh);
229 if (!(flags & DIO_WAIT)) 229 if (!(flags & DIO_WAIT))
230 return 0; 230 return 0;
231 231
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
432 if (buffer_uptodate(first_bh)) 432 if (buffer_uptodate(first_bh))
433 goto out; 433 goto out;
434 if (!buffer_locked(first_bh)) 434 if (!buffer_locked(first_bh))
435 ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); 435 ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
436 436
437 dblock++; 437 dblock++;
438 extlen--; 438 extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3593b3a7290..3eb1393f7b8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/slow-work.h>
21#include <linux/quotaops.h> 20#include <linux/quotaops.h>
22 21
23#include "gfs2.h" 22#include "gfs2.h"
@@ -39,14 +38,6 @@
39#define DO 0 38#define DO 0
40#define UNDO 1 39#define UNDO 1
41 40
42static const u32 gfs2_old_fs_formats[] = {
43 0
44};
45
46static const u32 gfs2_old_multihost_formats[] = {
47 0
48};
49
50/** 41/**
51 * gfs2_tune_init - Fill a gfs2_tune structure with default values 42 * gfs2_tune_init - Fill a gfs2_tune structure with default values
52 * @gt: tune 43 * @gt: tune
@@ -76,7 +67,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
76 67
77 sb->s_fs_info = sdp; 68 sb->s_fs_info = sdp;
78 sdp->sd_vfs = sb; 69 sdp->sd_vfs = sb;
79 70 set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
80 gfs2_tune_init(&sdp->sd_tune); 71 gfs2_tune_init(&sdp->sd_tune);
81 72
82 init_waitqueue_head(&sdp->sd_glock_wait); 73 init_waitqueue_head(&sdp->sd_glock_wait);
@@ -136,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
136 127
137static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) 128static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
138{ 129{
139 unsigned int x;
140
141 if (sb->sb_magic != GFS2_MAGIC || 130 if (sb->sb_magic != GFS2_MAGIC ||
142 sb->sb_type != GFS2_METATYPE_SB) { 131 sb->sb_type != GFS2_METATYPE_SB) {
143 if (!silent) 132 if (!silent)
@@ -151,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
151 sb->sb_multihost_format == GFS2_FORMAT_MULTI) 140 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
152 return 0; 141 return 0;
153 142
154 if (sb->sb_fs_format != GFS2_FORMAT_FS) { 143 fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
155 for (x = 0; gfs2_old_fs_formats[x]; x++)
156 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
157 break;
158
159 if (!gfs2_old_fs_formats[x]) {
160 printk(KERN_WARNING
161 "GFS2: code version (%u, %u) is incompatible "
162 "with ondisk format (%u, %u)\n",
163 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
164 sb->sb_fs_format, sb->sb_multihost_format);
165 printk(KERN_WARNING
166 "GFS2: I don't know how to upgrade this FS\n");
167 return -EINVAL;
168 }
169 }
170
171 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
172 for (x = 0; gfs2_old_multihost_formats[x]; x++)
173 if (gfs2_old_multihost_formats[x] ==
174 sb->sb_multihost_format)
175 break;
176
177 if (!gfs2_old_multihost_formats[x]) {
178 printk(KERN_WARNING
179 "GFS2: code version (%u, %u) is incompatible "
180 "with ondisk format (%u, %u)\n",
181 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
182 sb->sb_fs_format, sb->sb_multihost_format);
183 printk(KERN_WARNING
184 "GFS2: I don't know how to upgrade this FS\n");
185 return -EINVAL;
186 }
187 }
188
189 if (!sdp->sd_args.ar_upgrade) {
190 printk(KERN_WARNING
191 "GFS2: code version (%u, %u) is incompatible "
192 "with ondisk format (%u, %u)\n",
193 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
194 sb->sb_fs_format, sb->sb_multihost_format);
195 printk(KERN_INFO
196 "GFS2: Use the \"upgrade\" mount option to upgrade "
197 "the FS\n");
198 printk(KERN_INFO "GFS2: See the manual for more details\n");
199 return -EINVAL;
200 }
201 144
202 return 0; 145 return -EINVAL;
203} 146}
204 147
205static void end_bio_io_page(struct bio *bio, int error) 148static void end_bio_io_page(struct bio *bio, int error)
@@ -275,7 +218,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
275 218
276 bio->bi_end_io = end_bio_io_page; 219 bio->bi_end_io = end_bio_io_page;
277 bio->bi_private = page; 220 bio->bi_private = page;
278 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); 221 submit_bio(READ_SYNC | REQ_META, bio);
279 wait_on_page_locked(page); 222 wait_on_page_locked(page);
280 bio_put(bio); 223 bio_put(bio);
281 if (!PageUptodate(page)) { 224 if (!PageUptodate(page)) {
@@ -587,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
587 530
588 prev_db = 0; 531 prev_db = 0;
589 532
590 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { 533 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
591 bh.b_state = 0; 534 bh.b_state = 0;
592 bh.b_blocknr = 0; 535 bh.b_blocknr = 0;
593 bh.b_size = 1 << ip->i_inode.i_blkbits; 536 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -673,7 +616,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
673 break; 616 break;
674 617
675 INIT_LIST_HEAD(&jd->extent_list); 618 INIT_LIST_HEAD(&jd->extent_list);
676 slow_work_init(&jd->jd_work, &gfs2_recover_ops); 619 INIT_WORK(&jd->jd_work, gfs2_recover_func);
677 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); 620 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
678 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 621 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
679 if (!jd->jd_inode) 622 if (!jd->jd_inode)
@@ -782,7 +725,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
782 if (sdp->sd_lockstruct.ls_first) { 725 if (sdp->sd_lockstruct.ls_first) {
783 unsigned int x; 726 unsigned int x;
784 for (x = 0; x < sdp->sd_journals; x++) { 727 for (x = 0; x < sdp->sd_journals; x++) {
785 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x)); 728 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x),
729 true);
786 if (error) { 730 if (error) {
787 fs_err(sdp, "error recovering journal %u: %d\n", 731 fs_err(sdp, "error recovering journal %u: %d\n",
788 x, error); 732 x, error);
@@ -792,7 +736,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
792 736
793 gfs2_others_may_mount(sdp); 737 gfs2_others_may_mount(sdp);
794 } else if (!sdp->sd_args.ar_spectator) { 738 } else if (!sdp->sd_args.ar_spectator) {
795 error = gfs2_recover_journal(sdp->sd_jdesc); 739 error = gfs2_recover_journal(sdp->sd_jdesc, true);
796 if (error) { 740 if (error) {
797 fs_err(sdp, "error recovering my journal: %d\n", error); 741 fs_err(sdp, "error recovering my journal: %d\n", error);
798 goto fail_jinode_gh; 742 goto fail_jinode_gh;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1022 if (!strcmp("lock_nolock", proto)) { 966 if (!strcmp("lock_nolock", proto)) {
1023 lm = &nolock_ops; 967 lm = &nolock_ops;
1024 sdp->sd_args.ar_localflocks = 1; 968 sdp->sd_args.ar_localflocks = 1;
1025 sdp->sd_args.ar_localcaching = 1;
1026#ifdef CONFIG_GFS2_FS_LOCKING_DLM 969#ifdef CONFIG_GFS2_FS_LOCKING_DLM
1027 } else if (!strcmp("lock_dlm", proto)) { 970 } else if (!strcmp("lock_dlm", proto)) {
1028 lm = &gfs2_dlm_ops; 971 lm = &gfs2_dlm_ops;
@@ -1050,7 +993,8 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1050 ret = match_int(&tmp[0], &option); 993 ret = match_int(&tmp[0], &option);
1051 if (ret || option < 0) 994 if (ret || option < 0)
1052 goto hostdata_error; 995 goto hostdata_error;
1053 ls->ls_jid = option; 996 if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags))
997 ls->ls_jid = option;
1054 break; 998 break;
1055 case Opt_id: 999 case Opt_id:
1056 /* Obsolete, but left for backward compat purposes */ 1000 /* Obsolete, but left for backward compat purposes */
@@ -1102,6 +1046,22 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
1102 lm->lm_unmount(sdp); 1046 lm->lm_unmount(sdp);
1103} 1047}
1104 1048
1049static int gfs2_journalid_wait(void *word)
1050{
1051 if (signal_pending(current))
1052 return -EINTR;
1053 schedule();
1054 return 0;
1055}
1056
1057static int wait_on_journal(struct gfs2_sbd *sdp)
1058{
1059 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
1060 return 0;
1061
1062 return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE);
1063}
1064
1105void gfs2_online_uevent(struct gfs2_sbd *sdp) 1065void gfs2_online_uevent(struct gfs2_sbd *sdp)
1106{ 1066{
1107 struct super_block *sb = sdp->sd_vfs; 1067 struct super_block *sb = sdp->sd_vfs;
@@ -1194,6 +1154,24 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1194 if (error) 1154 if (error)
1195 goto fail_locking; 1155 goto fail_locking;
1196 1156
1157 error = wait_on_journal(sdp);
1158 if (error)
1159 goto fail_sb;
1160
1161 /*
1162 * If user space has failed to join the cluster or some similar
1163 * failure has occurred, then the journal id will contain a
1164 * negative (error) number. This will then be returned to the
1165 * caller (of the mount syscall). We do this even for spectator
1166 * mounts (which just write a jid of 0 to indicate "ok" even though
 1167 * the jid is unused in the spectator case).
1168 */
1169 if (sdp->sd_lockstruct.ls_jid < 0) {
1170 error = sdp->sd_lockstruct.ls_jid;
1171 sdp->sd_lockstruct.ls_jid = 0;
1172 goto fail_sb;
1173 }
1174
1197 error = init_inodes(sdp, DO); 1175 error = init_inodes(sdp, DO);
1198 if (error) 1176 if (error)
1199 goto fail_sb; 1177 goto fail_sb;
@@ -1241,7 +1219,6 @@ fail_sb:
1241fail_locking: 1219fail_locking:
1242 init_locking(sdp, &mount_gh, UNDO); 1220 init_locking(sdp, &mount_gh, UNDO);
1243fail_lm: 1221fail_lm:
1244 invalidate_inodes(sb);
1245 gfs2_gl_hash_clear(sdp); 1222 gfs2_gl_hash_clear(sdp);
1246 gfs2_lm_unmount(sdp); 1223 gfs2_lm_unmount(sdp);
1247fail_sys: 1224fail_sys:
@@ -1273,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
1273} 1250}
1274 1251
1275/** 1252/**
1276 * gfs2_get_sb - Get the GFS2 superblock 1253 * gfs2_mount - Get the GFS2 superblock
1277 * @fs_type: The GFS2 filesystem type 1254 * @fs_type: The GFS2 filesystem type
1278 * @flags: Mount flags 1255 * @flags: Mount flags
1279 * @dev_name: The name of the device 1256 * @dev_name: The name of the device
1280 * @data: The mount arguments 1257 * @data: The mount arguments
1281 * @mnt: The vfsmnt for this mount
1282 * 1258 *
1283 * Q. Why not use get_sb_bdev() ? 1259 * Q. Why not use get_sb_bdev() ?
1284 * A. We need to select one of two root directories to mount, independent 1260 * A. We need to select one of two root directories to mount, independent
@@ -1287,8 +1263,8 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
1287 * Returns: 0 or -ve on error 1263 * Returns: 0 or -ve on error
1288 */ 1264 */
1289 1265
1290static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1266static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1291 const char *dev_name, void *data, struct vfsmount *mnt) 1267 const char *dev_name, void *data)
1292{ 1268{
1293 struct block_device *bdev; 1269 struct block_device *bdev;
1294 struct super_block *s; 1270 struct super_block *s;
@@ -1302,7 +1278,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1302 1278
1303 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1279 bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1304 if (IS_ERR(bdev)) 1280 if (IS_ERR(bdev))
1305 return PTR_ERR(bdev); 1281 return ERR_CAST(bdev);
1306 1282
1307 /* 1283 /*
1308 * once the super is inserted into the list by sget, s_umount 1284 * once the super is inserted into the list by sget, s_umount
@@ -1321,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1321 if (IS_ERR(s)) 1297 if (IS_ERR(s))
1322 goto error_bdev; 1298 goto error_bdev;
1323 1299
1300 if (s->s_root)
1301 close_bdev_exclusive(bdev, mode);
1302
1324 memset(&args, 0, sizeof(args)); 1303 memset(&args, 0, sizeof(args));
1325 args.ar_quota = GFS2_QUOTA_DEFAULT; 1304 args.ar_quota = GFS2_QUOTA_DEFAULT;
1326 args.ar_data = GFS2_DATA_DEFAULT; 1305 args.ar_data = GFS2_DATA_DEFAULT;
@@ -1332,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1332 error = gfs2_mount_args(&args, data); 1311 error = gfs2_mount_args(&args, data);
1333 if (error) { 1312 if (error) {
1334 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1313 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1335 if (s->s_root) 1314 goto error_super;
1336 goto error_super;
1337 deactivate_locked_super(s);
1338 return error;
1339 } 1315 }
1340 1316
1341 if (s->s_root) { 1317 if (s->s_root) {
1342 error = -EBUSY; 1318 error = -EBUSY;
1343 if ((flags ^ s->s_flags) & MS_RDONLY) 1319 if ((flags ^ s->s_flags) & MS_RDONLY)
1344 goto error_super; 1320 goto error_super;
1345 close_bdev_exclusive(bdev, mode);
1346 } else { 1321 } else {
1347 char b[BDEVNAME_SIZE]; 1322 char b[BDEVNAME_SIZE];
1348 1323
@@ -1351,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1351 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 1326 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
1352 sb_set_blocksize(s, block_size(bdev)); 1327 sb_set_blocksize(s, block_size(bdev));
1353 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); 1328 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
1354 if (error) { 1329 if (error)
1355 deactivate_locked_super(s); 1330 goto error_super;
1356 return error;
1357 }
1358 s->s_flags |= MS_ACTIVE; 1331 s->s_flags |= MS_ACTIVE;
1359 bdev->bd_super = s; 1332 bdev->bd_super = s;
1360 } 1333 }
1361 1334
1362 sdp = s->s_fs_info; 1335 sdp = s->s_fs_info;
1363 mnt->mnt_sb = s;
1364 if (args.ar_meta) 1336 if (args.ar_meta)
1365 mnt->mnt_root = dget(sdp->sd_master_dir); 1337 return dget(sdp->sd_master_dir);
1366 else 1338 else
1367 mnt->mnt_root = dget(sdp->sd_root_dir); 1339 return dget(sdp->sd_root_dir);
1368 return 0;
1369 1340
1370error_super: 1341error_super:
1371 deactivate_locked_super(s); 1342 deactivate_locked_super(s);
1343 return ERR_PTR(error);
1372error_bdev: 1344error_bdev:
1373 close_bdev_exclusive(bdev, mode); 1345 close_bdev_exclusive(bdev, mode);
1374 return error; 1346 return ERR_PTR(error);
1375} 1347}
1376 1348
1377static int set_meta_super(struct super_block *s, void *ptr) 1349static int set_meta_super(struct super_block *s, void *ptr)
@@ -1379,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr)
1379 return -EINVAL; 1351 return -EINVAL;
1380} 1352}
1381 1353
1382static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, 1354static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
1383 const char *dev_name, void *data, struct vfsmount *mnt) 1355 int flags, const char *dev_name, void *data)
1384{ 1356{
1385 struct super_block *s; 1357 struct super_block *s;
1386 struct gfs2_sbd *sdp; 1358 struct gfs2_sbd *sdp;
@@ -1391,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1391 if (error) { 1363 if (error) {
1392 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1364 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
1393 dev_name, error); 1365 dev_name, error);
1394 return error; 1366 return ERR_PTR(error);
1395 } 1367 }
1396 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, 1368 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
1397 path.dentry->d_inode->i_sb->s_bdev); 1369 path.dentry->d_inode->i_sb->s_bdev);
1398 path_put(&path); 1370 path_put(&path);
1399 if (IS_ERR(s)) { 1371 if (IS_ERR(s)) {
1400 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1372 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
1401 return PTR_ERR(s); 1373 return ERR_CAST(s);
1402 } 1374 }
1403 if ((flags ^ s->s_flags) & MS_RDONLY) { 1375 if ((flags ^ s->s_flags) & MS_RDONLY) {
1404 deactivate_locked_super(s); 1376 deactivate_locked_super(s);
1405 return -EBUSY; 1377 return ERR_PTR(-EBUSY);
1406 } 1378 }
1407 sdp = s->s_fs_info; 1379 sdp = s->s_fs_info;
1408 mnt->mnt_sb = s; 1380 return dget(sdp->sd_master_dir);
1409 mnt->mnt_root = dget(sdp->sd_master_dir);
1410 return 0;
1411} 1381}
1412 1382
1413static void gfs2_kill_sb(struct super_block *sb) 1383static void gfs2_kill_sb(struct super_block *sb)
@@ -1433,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb)
1433struct file_system_type gfs2_fs_type = { 1403struct file_system_type gfs2_fs_type = {
1434 .name = "gfs2", 1404 .name = "gfs2",
1435 .fs_flags = FS_REQUIRES_DEV, 1405 .fs_flags = FS_REQUIRES_DEV,
1436 .get_sb = gfs2_get_sb, 1406 .mount = gfs2_mount,
1437 .kill_sb = gfs2_kill_sb, 1407 .kill_sb = gfs2_kill_sb,
1438 .owner = THIS_MODULE, 1408 .owner = THIS_MODULE,
1439}; 1409};
@@ -1441,7 +1411,7 @@ struct file_system_type gfs2_fs_type = {
1441struct file_system_type gfs2meta_fs_type = { 1411struct file_system_type gfs2meta_fs_type = {
1442 .name = "gfs2meta", 1412 .name = "gfs2meta",
1443 .fs_flags = FS_REQUIRES_DEV, 1413 .fs_flags = FS_REQUIRES_DEV,
1444 .get_sb = gfs2_get_sb_meta, 1414 .mount = gfs2_mount_meta,
1445 .owner = THIS_MODULE, 1415 .owner = THIS_MODULE,
1446}; 1416};
1447 1417
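[Editor's note] The hunks above are part of the tree-wide conversion from file_system_type::get_sb(), which filled in a caller-supplied vfsmount, to ::mount(), which returns the root dentry directly and signals failure through ERR_PTR() — hence the ERR_CAST()/dget() returns replacing the old mnt->mnt_sb/mnt->mnt_root assignments. For a filesystem without GFS2's meta-mount special case, the converted entry point typically reduces to the generic helper; a sketch, where myfs_fill_super is a hypothetical fill_super callback:

	static struct dentry *myfs_mount(struct file_system_type *fs_type,
					 int flags, const char *dev_name,
					 void *data)
	{
		/* mount_bdev() opens the device, finds or creates the
		 * superblock, and returns dget(sb->s_root) or ERR_PTR(). */
		return mount_bdev(fs_type, flags, dev_name, data,
				  myfs_fill_super);
	}

GFS2 open-codes the equivalent of mount_bdev() instead, because gfs2_mount_meta() must locate an existing superblock via sget() rather than create one.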
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 98cdd05f331..12cbea7502c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/crc32.h> 19#include <linux/crc32.h>
20#include <linux/fiemap.h> 20#include <linux/fiemap.h>
21#include <linux/swap.h>
22#include <linux/falloc.h>
21#include <asm/uaccess.h> 23#include <asm/uaccess.h>
22 24
23#include "gfs2.h" 25#include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
217 goto out_gunlock_q; 219 goto out_gunlock_q;
218 220
219 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 221 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
220 al->al_rgd->rd_length + 222 gfs2_rg_blocks(al) +
221 2 * RES_DINODE + RES_STATFS + 223 2 * RES_DINODE + RES_STATFS +
222 RES_QUOTA, 0); 224 RES_QUOTA, 0);
223 if (error) 225 if (error)
@@ -253,7 +255,7 @@ out_parent:
253 gfs2_holder_uninit(ghs); 255 gfs2_holder_uninit(ghs);
254 gfs2_holder_uninit(ghs + 1); 256 gfs2_holder_uninit(ghs + 1);
255 if (!error) { 257 if (!error) {
256 atomic_inc(&inode->i_count); 258 ihold(inode);
257 d_instantiate(dentry, inode); 259 d_instantiate(dentry, inode);
258 mark_inode_dirty(inode); 260 mark_inode_dirty(inode);
259 } 261 }
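[Editor's note] ihold() replaces the open-coded atomic_inc(&inode->i_count): the VFS helper documents the intent (take an extra reference on an inode the caller already holds) and can sanity-check it. Its shape in this kernel generation is roughly:

	void ihold(struct inode *inode)
	{
		/* the increment must never resurrect a zero refcount */
		WARN_ON(atomic_inc_return(&inode->i_count) < 2);
	}

The reference taken here is the one consumed by d_instantiate() for the new hard link.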
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
406 408
407 ip = ghs[1].gh_gl->gl_object; 409 ip = ghs[1].gh_gl->gl_object;
408 410
409 ip->i_disksize = size;
410 i_size_write(inode, size); 411 i_size_write(inode, size);
411 412
412 error = gfs2_meta_inode_buffer(ip, &dibh); 413 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
461 ip = ghs[1].gh_gl->gl_object; 462 ip = ghs[1].gh_gl->gl_object;
462 463
463 ip->i_inode.i_nlink = 2; 464 ip->i_inode.i_nlink = 2;
464 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 465 i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
465 ip->i_diskflags |= GFS2_DIF_JDATA; 466 ip->i_diskflags |= GFS2_DIF_JDATA;
466 ip->i_entries = 2; 467 ip->i_entries = 2;
467 468
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
470 if (!gfs2_assert_withdraw(sdp, !error)) { 471 if (!gfs2_assert_withdraw(sdp, !error)) {
471 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; 472 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
472 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); 473 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
473 struct qstr str;
474 474
475 gfs2_str2qstr(&str, ".");
476 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 475 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
477 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); 476 gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
478 dent->de_inum = di->di_num; /* already GFS2 endian */ 477 dent->de_inum = di->di_num; /* already GFS2 endian */
479 dent->de_type = cpu_to_be16(DT_DIR); 478 dent->de_type = cpu_to_be16(DT_DIR);
480 di->di_entries = cpu_to_be32(1); 479 di->di_entries = cpu_to_be32(1);
481 480
482 gfs2_str2qstr(&str, "..");
483 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); 481 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
484 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); 482 gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
485 483
486 gfs2_inum_out(dip, dent); 484 gfs2_inum_out(dip, dent);
487 dent->de_type = cpu_to_be16(DT_DIR); 485 dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
522static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 520static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
523 struct gfs2_inode *ip) 521 struct gfs2_inode *ip)
524{ 522{
525 struct qstr dotname;
526 int error; 523 int error;
527 524
528 if (ip->i_entries != 2) { 525 if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
539 if (error) 536 if (error)
540 return error; 537 return error;
541 538
542 gfs2_str2qstr(&dotname, "."); 539 error = gfs2_dir_del(ip, &gfs2_qdot);
543 error = gfs2_dir_del(ip, &dotname);
544 if (error) 540 if (error)
545 return error; 541 return error;
546 542
547 gfs2_str2qstr(&dotname, ".."); 543 error = gfs2_dir_del(ip, &gfs2_qdotdot);
548 error = gfs2_dir_del(ip, &dotname);
549 if (error) 544 if (error)
550 return error; 545 return error;
551 546
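[Editor's note] The repeated gfs2_str2qstr() calls for "." and ".." disappear in favour of two shared qstr constants, gfs2_qdot and gfs2_qdotdot, which this series defines once rather than rebuilding (and rehashing) on every directory operation. Their definitions are not in this hunk; a plausible shape, offered as an assumption:

	/* assumed: defined once, e.g. during module init */
	struct qstr gfs2_qdot __read_mostly;
	struct qstr gfs2_qdotdot __read_mostly;

	static void gfs2_qstr_init(void)
	{
		gfs2_str2qstr(&gfs2_qdot, ".");
		gfs2_str2qstr(&gfs2_qdotdot, "..");
	}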
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
694 struct inode *dir = &to->i_inode; 689 struct inode *dir = &to->i_inode;
695 struct super_block *sb = dir->i_sb; 690 struct super_block *sb = dir->i_sb;
696 struct inode *tmp; 691 struct inode *tmp;
697 struct qstr dotdot;
698 int error = 0; 692 int error = 0;
699 693
700 gfs2_str2qstr(&dotdot, "..");
701
702 igrab(dir); 694 igrab(dir);
703 695
704 for (;;) { 696 for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
711 break; 703 break;
712 } 704 }
713 705
714 tmp = gfs2_lookupi(dir, &dotdot, 1); 706 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
715 if (IS_ERR(tmp)) { 707 if (IS_ERR(tmp)) {
716 error = PTR_ERR(tmp); 708 error = PTR_ERR(tmp);
717 break; 709 break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
744 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 736 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
745 struct gfs2_inode *nip = NULL; 737 struct gfs2_inode *nip = NULL;
746 struct gfs2_sbd *sdp = GFS2_SB(odir); 738 struct gfs2_sbd *sdp = GFS2_SB(odir);
747 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; 739 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
748 struct gfs2_rgrpd *nrgd; 740 struct gfs2_rgrpd *nrgd;
749 unsigned int num_gh; 741 unsigned int num_gh;
750 int dir_rename = 0; 742 int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 return 0; 750 return 0;
759 } 751 }
760 752
753 error = gfs2_rindex_hold(sdp, &ri_gh);
754 if (error)
755 return error;
761 756
762 if (odip != ndip) { 757 if (odip != ndip) {
763 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 758 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
887 882
888 al->al_requested = sdp->sd_max_dirres; 883 al->al_requested = sdp->sd_max_dirres;
889 884
890 error = gfs2_inplace_reserve(ndip); 885 error = gfs2_inplace_reserve_ri(ndip);
891 if (error) 886 if (error)
892 goto out_gunlock_q; 887 goto out_gunlock_q;
893 888
894 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 889 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
895 al->al_rgd->rd_length + 890 gfs2_rg_blocks(al) +
896 4 * RES_DINODE + 4 * RES_LEAF + 891 4 * RES_DINODE + 4 * RES_LEAF +
897 RES_STATFS + RES_QUOTA + 4, 0); 892 RES_STATFS + RES_QUOTA + 4, 0);
898 if (error) 893 if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
920 } 915 }
921 916
922 if (dir_rename) { 917 if (dir_rename) {
923 struct qstr name;
924 gfs2_str2qstr(&name, "..");
925
926 error = gfs2_change_nlink(ndip, +1); 918 error = gfs2_change_nlink(ndip, +1);
927 if (error) 919 if (error)
928 goto out_end_trans; 920 goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
930 if (error) 922 if (error)
931 goto out_end_trans; 923 goto out_end_trans;
932 924
933 error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); 925 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
934 if (error) 926 if (error)
935 goto out_end_trans; 927 goto out_end_trans;
936 } else { 928 } else {
@@ -972,6 +964,7 @@ out_gunlock_r:
972 if (r_gh.gh_gl) 964 if (r_gh.gh_gl)
973 gfs2_glock_dq_uninit(&r_gh); 965 gfs2_glock_dq_uninit(&r_gh);
974out: 966out:
967 gfs2_glock_dq_uninit(&ri_gh);
975 return error; 968 return error;
976} 969}
977 970
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 983 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
991 struct gfs2_holder i_gh; 984 struct gfs2_holder i_gh;
992 struct buffer_head *dibh; 985 struct buffer_head *dibh;
993 unsigned int x; 986 unsigned int x, size;
994 char *buf; 987 char *buf;
995 int error; 988 int error;
996 989
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1002 return NULL; 995 return NULL;
1003 } 996 }
1004 997
1005 if (!ip->i_disksize) { 998 size = (unsigned int)i_size_read(&ip->i_inode);
999 if (size == 0) {
1006 gfs2_consist_inode(ip); 1000 gfs2_consist_inode(ip);
1007 buf = ERR_PTR(-EIO); 1001 buf = ERR_PTR(-EIO);
1008 goto out; 1002 goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1014 goto out; 1008 goto out;
1015 } 1009 }
1016 1010
1017 x = ip->i_disksize + 1; 1011 x = size + 1;
1018 buf = kmalloc(x, GFP_NOFS); 1012 buf = kmalloc(x, GFP_NOFS);
1019 if (!buf) 1013 if (!buf)
1020 buf = ERR_PTR(-ENOMEM); 1014 buf = ERR_PTR(-ENOMEM);
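[Editor's note] Throughout this patch the GFS2-private i_disksize field gives way to the VFS i_size, read via i_size_read(). The accessor matters on 32-bit SMP and preempt kernels, where a raw load of the 64-bit size could tear; it loops on a seqcount until it sees a consistent value, while i_size_write() requires the caller to provide its own exclusion (here, the inode glock). A minimal sketch of the pair:

	/* i_size_write() callers must serialise; readers need nothing. */
	static void example_extend(struct inode *inode, loff_t new_size)
	{
		if (new_size > i_size_read(inode))
			i_size_write(inode, new_size);
	}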
@@ -1071,32 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1065 return error;
1072} 1066}
1073 1067
1074/*
1075 * XXX: should be changed to have proper ordering by opencoding simple_setsize
1076 */
1077static int setattr_size(struct inode *inode, struct iattr *attr)
1078{
1079 struct gfs2_inode *ip = GFS2_I(inode);
1080 struct gfs2_sbd *sdp = GFS2_SB(inode);
1081 int error;
1082
1083 if (attr->ia_size != ip->i_disksize) {
1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1085 if (error)
1086 return error;
1087 error = simple_setsize(inode, attr->ia_size);
1088 gfs2_trans_end(sdp);
1089 if (error)
1090 return error;
1091 }
1092
1093 error = gfs2_truncatei(ip, attr->ia_size);
1094 if (error && (inode->i_size != ip->i_disksize))
1095 i_size_write(inode, ip->i_disksize);
1096
1097 return error;
1098}
1099
1100static int setattr_chown(struct inode *inode, struct iattr *attr) 1068static int setattr_chown(struct inode *inode, struct iattr *attr)
1101{ 1069{
1102 struct gfs2_inode *ip = GFS2_I(inode); 1070 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1136,8 +1104,16 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1136 if (error) 1104 if (error)
1137 goto out_end_trans; 1105 goto out_end_trans;
1138 1106
1139 error = inode_setattr(inode, attr); 1107 if ((attr->ia_valid & ATTR_SIZE) &&
1140 gfs2_assert_warn(sdp, !error); 1108 attr->ia_size != i_size_read(inode)) {
1109 int error;
1110
1111 error = vmtruncate(inode, attr->ia_size);
1112 gfs2_assert_warn(sdp, !error);
1113 }
1114
1115 setattr_copy(inode, attr);
1116 mark_inode_dirty(inode);
1141 1117
1142 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1118 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1143 gfs2_dinode_out(ip, dibh->b_data); 1119 gfs2_dinode_out(ip, dibh->b_data);
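[Editor's note] inode_setattr() was removed from the VFS in this cycle, so setattr_chown() now open-codes its two halves: vmtruncate() for a size change, then setattr_copy() plus mark_inode_dirty() for the remaining attributes. (Note that the hunk above only warns if vmtruncate() fails — the inner int error shadows the outer one — mirroring the old gfs2_assert_warn on inode_setattr.) The generic replacement pattern, for a caller that does want to propagate the error:

	static int example_setattr_tail(struct inode *inode, struct iattr *attr)
	{
		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size != i_size_read(inode)) {
			int error = vmtruncate(inode, attr->ia_size);
			if (error)
				return error;
		}
		setattr_copy(inode, attr);	/* copy uid/gid/mode/times */
		mark_inode_dirty(inode);
		return 0;
	}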
@@ -1189,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1189 goto out; 1165 goto out;
1190 1166
1191 if (attr->ia_valid & ATTR_SIZE) 1167 if (attr->ia_valid & ATTR_SIZE)
1192 error = setattr_size(inode, attr); 1168 error = gfs2_setattr_size(inode, attr->ia_size);
1193 else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) 1169 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1194 error = setattr_chown(inode, attr); 1170 error = setattr_chown(inode, attr);
1195 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) 1171 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1295,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1295 return ret; 1271 return ret;
1296} 1272}
1297 1273
1274static void empty_write_end(struct page *page, unsigned from,
1275 unsigned to)
1276{
1277 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
1278
1279 page_zero_new_buffers(page, from, to);
1280 flush_dcache_page(page);
1281 mark_page_accessed(page);
1282
1283 if (!gfs2_is_writeback(ip))
1284 gfs2_page_add_databufs(ip, page, from, to);
1285
1286 block_commit_write(page, from, to);
1287}
1288
1289
1290static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1291{
1292 unsigned start, end, next;
1293 struct buffer_head *bh, *head;
1294 int error;
1295
1296 if (!page_has_buffers(page)) {
1297 error = __block_write_begin(page, from, to - from, gfs2_block_map);
1298 if (unlikely(error))
1299 return error;
1300
1301 empty_write_end(page, from, to);
1302 return 0;
1303 }
1304
1305 bh = head = page_buffers(page);
1306 next = end = 0;
1307 while (next < from) {
1308 next += bh->b_size;
1309 bh = bh->b_this_page;
1310 }
1311 start = next;
1312 do {
1313 next += bh->b_size;
1314 if (buffer_mapped(bh)) {
1315 if (end) {
1316 error = __block_write_begin(page, start, end - start,
1317 gfs2_block_map);
1318 if (unlikely(error))
1319 return error;
1320 empty_write_end(page, start, end);
1321 end = 0;
1322 }
1323 start = next;
1324 }
1325 else
1326 end = next;
1327 bh = bh->b_this_page;
1328 } while (next < to);
1329
1330 if (end) {
1331 error = __block_write_begin(page, start, end - start, gfs2_block_map);
1332 if (unlikely(error))
1333 return error;
1334 empty_write_end(page, start, end);
1335 }
1336
1337 return 0;
1338}
1339
1340static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
1341 int mode)
1342{
1343 struct gfs2_inode *ip = GFS2_I(inode);
1344 struct buffer_head *dibh;
1345 int error;
1346 u64 start = offset >> PAGE_CACHE_SHIFT;
1347 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
1348 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
1349 pgoff_t curr;
1350 struct page *page;
1351 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
1352 unsigned int from, to;
1353
1354 if (!end_offset)
1355 end_offset = PAGE_CACHE_SIZE;
1356
1357 error = gfs2_meta_inode_buffer(ip, &dibh);
1358 if (unlikely(error))
1359 goto out;
1360
1361 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1362
1363 if (gfs2_is_stuffed(ip)) {
1364 error = gfs2_unstuff_dinode(ip, NULL);
1365 if (unlikely(error))
1366 goto out;
1367 }
1368
1369 curr = start;
1370 offset = start << PAGE_CACHE_SHIFT;
1371 from = start_offset;
1372 to = PAGE_CACHE_SIZE;
1373 while (curr <= end) {
1374 page = grab_cache_page_write_begin(inode->i_mapping, curr,
1375 AOP_FLAG_NOFS);
1376 if (unlikely(!page)) {
1377 error = -ENOMEM;
1378 goto out;
1379 }
1380
1381 if (curr == end)
1382 to = end_offset;
1383 error = write_empty_blocks(page, from, to);
1384 if (!error && offset + to > inode->i_size &&
1385 !(mode & FALLOC_FL_KEEP_SIZE)) {
1386 i_size_write(inode, offset + to);
1387 }
1388 unlock_page(page);
1389 page_cache_release(page);
1390 if (error)
1391 goto out;
1392 curr++;
1393 offset += PAGE_CACHE_SIZE;
1394 from = 0;
1395 }
1396
1397 gfs2_dinode_out(ip, dibh->b_data);
1398 mark_inode_dirty(inode);
1399
1400 brelse(dibh);
1401
1402out:
1403 return error;
1404}
1405
1406static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
1407 unsigned int *data_blocks, unsigned int *ind_blocks)
1408{
1409 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1410 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
1411 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
1412
1413 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
1414 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1415 max_data -= tmp;
1416 }
1417 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
1418 so it might end up with fewer data blocks */
1419 if (max_data <= *data_blocks)
1420 return;
1421 *data_blocks = max_data;
1422 *ind_blocks = max_blocks - max_data;
1423 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
1424 if (*len > max) {
1425 *len = max;
1426 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
1427 }
1428}
1429
1430static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1431 loff_t len)
1432{
1433 struct gfs2_sbd *sdp = GFS2_SB(inode);
1434 struct gfs2_inode *ip = GFS2_I(inode);
1435 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1436 loff_t bytes, max_bytes;
1437 struct gfs2_alloc *al;
1438 int error;
1439 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
1440 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
1441
1442 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
1443 sdp->sd_sb.sb_bsize_shift;
1444
1445 len = next - offset;
1446 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
1447 if (!bytes)
1448 bytes = UINT_MAX;
1449
1450 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
1451 error = gfs2_glock_nq(&ip->i_gh);
1452 if (unlikely(error))
1453 goto out_uninit;
1454
1455 if (!gfs2_write_alloc_required(ip, offset, len))
1456 goto out_unlock;
1457
1458 while (len > 0) {
1459 if (len < bytes)
1460 bytes = len;
1461 al = gfs2_alloc_get(ip);
1462 if (!al) {
1463 error = -ENOMEM;
1464 goto out_unlock;
1465 }
1466
1467 error = gfs2_quota_lock_check(ip);
1468 if (error)
1469 goto out_alloc_put;
1470
1471retry:
1472 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
1473
1474 al->al_requested = data_blocks + ind_blocks;
1475 error = gfs2_inplace_reserve(ip);
1476 if (error) {
1477 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
1478 bytes >>= 1;
1479 goto retry;
1480 }
1481 goto out_qunlock;
1482 }
1483 max_bytes = bytes;
1484 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
1485 al->al_requested = data_blocks + ind_blocks;
1486
1487 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
1488 RES_RG_HDR + gfs2_rg_blocks(al);
1489 if (gfs2_is_jdata(ip))
1490 rblocks += data_blocks ? data_blocks : 1;
1491
1492 error = gfs2_trans_begin(sdp, rblocks,
1493 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
1494 if (error)
1495 goto out_trans_fail;
1496
1497 error = fallocate_chunk(inode, offset, max_bytes, mode);
1498 gfs2_trans_end(sdp);
1499
1500 if (error)
1501 goto out_trans_fail;
1502
1503 len -= max_bytes;
1504 offset += max_bytes;
1505 gfs2_inplace_release(ip);
1506 gfs2_quota_unlock(ip);
1507 gfs2_alloc_put(ip);
1508 }
1509 goto out_unlock;
1510
1511out_trans_fail:
1512 gfs2_inplace_release(ip);
1513out_qunlock:
1514 gfs2_quota_unlock(ip);
1515out_alloc_put:
1516 gfs2_alloc_put(ip);
1517out_unlock:
1518 gfs2_glock_dq(&ip->i_gh);
1519out_uninit:
1520 gfs2_holder_uninit(&ip->i_gh);
1521 return error;
1522}
1523
1524
1298static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1525static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1299 u64 start, u64 len) 1526 u64 start, u64 len)
1300{ 1527{
@@ -1345,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
1345 .getxattr = gfs2_getxattr, 1572 .getxattr = gfs2_getxattr,
1346 .listxattr = gfs2_listxattr, 1573 .listxattr = gfs2_listxattr,
1347 .removexattr = gfs2_removexattr, 1574 .removexattr = gfs2_removexattr,
1575 .fallocate = gfs2_fallocate,
1348 .fiemap = gfs2_fiemap, 1576 .fiemap = gfs2_fiemap,
1349}; 1577};
1350 1578
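[Editor's note] With .fallocate wired into gfs2_file_iops, preallocation reaches GFS2 through the ordinary fallocate(2) system call. A minimal userspace exercise (the mount-point path is illustrative):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <linux/falloc.h>	/* FALLOC_FL_KEEP_SIZE */

	int main(void)
	{
		int fd = open("/mnt/gfs2/prealloc.dat", O_CREAT | O_WRONLY, 0644);

		/* reserve 1 MiB without changing the file size */
		if (fd < 0 || fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0) {
			perror("fallocate");
			return 1;
		}
		close(fd);
		return 0;
	}

With FALLOC_FL_KEEP_SIZE the blocks are reserved but i_size is untouched, matching the mode check in fallocate_chunk() above.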
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8f02d3db8f4..58a9b9998b4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
735 goto out; 735 goto out;
736 736
737 size = loc + sizeof(struct gfs2_quota); 737 size = loc + sizeof(struct gfs2_quota);
738 if (size > inode->i_size) { 738 if (size > inode->i_size)
739 ip->i_disksize = size;
740 i_size_write(inode, size); 739 i_size_write(inode, size);
741 }
742 inode->i_mtime = inode->i_atime = CURRENT_TIME; 740 inode->i_mtime = inode->i_atime = CURRENT_TIME;
743 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 741 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
744 gfs2_dinode_out(ip, dibh->b_data); 742 gfs2_dinode_out(ip, dibh->b_data);
@@ -787,15 +785,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
787 goto out; 785 goto out;
788 786
789 for (x = 0; x < num_qd; x++) { 787 for (x = 0; x < num_qd; x++) {
790 int alloc_required;
791
792 offset = qd2offset(qda[x]); 788 offset = qd2offset(qda[x]);
793 error = gfs2_write_alloc_required(ip, offset, 789 if (gfs2_write_alloc_required(ip, offset,
794 sizeof(struct gfs2_quota), 790 sizeof(struct gfs2_quota)))
795 &alloc_required);
796 if (error)
797 goto out_gunlock;
798 if (alloc_required)
799 nalloc++; 791 nalloc++;
800 } 792 }
801 793
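[Editor's note] gfs2_write_alloc_required() loses its errno-plus-out-parameter calling convention here and at the other call sites in this patch; it can no longer fail, so it returns the boolean directly. As implied by the call sites:

	/* was: int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
	 *                                    unsigned int len, int *alloc_required);
	 */
	int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
				      unsigned int len);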
@@ -823,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
823 goto out_alloc; 815 goto out_alloc;
824 816
825 if (nalloc) 817 if (nalloc)
826 blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; 818 blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
827 819
828 error = gfs2_trans_begin(sdp, blocks, 0); 820 error = gfs2_trans_begin(sdp, blocks, 0);
829 if (error) 821 if (error)
@@ -1196,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1196int gfs2_quota_init(struct gfs2_sbd *sdp) 1188int gfs2_quota_init(struct gfs2_sbd *sdp)
1197{ 1189{
1198 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1190 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1199 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; 1191 u64 size = i_size_read(sdp->sd_qc_inode);
1192 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1200 unsigned int x, slot = 0; 1193 unsigned int x, slot = 0;
1201 unsigned int found = 0; 1194 unsigned int found = 0;
1202 u64 dblock; 1195 u64 dblock;
1203 u32 extlen = 0; 1196 u32 extlen = 0;
1204 int error; 1197 int error;
1205 1198
1206 if (!ip->i_disksize || ip->i_disksize > (64 << 20) || 1199 if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
1207 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1208 gfs2_consist_inode(ip);
1209 return -EIO; 1200 return -EIO;
1210 } 1201
1211 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1202 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1212 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1203 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1213 1204
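[Editor's note] The open-coded quota-file sanity check collapses into gfs2_check_internal_file_size(), which this hunk only calls; gfs2_jdesc_check() in super.c below uses it the same way for journals. Reconstructed from the call sites, the helper plausibly looks like this (an assumption, not the verbatim definition):

	static inline int gfs2_check_internal_file_size(struct inode *inode,
							u64 minsize, u64 maxsize)
	{
		u64 size = i_size_read(inode);

		/* reject internal files that are too small, too large,
		 * or not block-aligned */
		if (size < minsize || size > maxsize ||
		    (size & ((1 << inode->i_blkbits) - 1))) {
			gfs2_consist_inode(GFS2_I(inode));
			return -EIO;
		}
		return 0;
	}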
@@ -1455,10 +1446,10 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1455 1446
1456 switch (sdp->sd_args.ar_quota) { 1447 switch (sdp->sd_args.ar_quota) {
1457 case GFS2_QUOTA_ON: 1448 case GFS2_QUOTA_ON:
1458 fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); 1449 fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
1459 /*FALLTHRU*/ 1450 /*FALLTHRU*/
1460 case GFS2_QUOTA_ACCOUNT: 1451 case GFS2_QUOTA_ACCOUNT:
1461 fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); 1452 fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
1462 break; 1453 break;
1463 case GFS2_QUOTA_OFF: 1454 case GFS2_QUOTA_OFF:
1464 break; 1455 break;
@@ -1504,7 +1495,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1504 1495
1505 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 1496 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
1506 fdq->d_version = FS_DQUOT_VERSION; 1497 fdq->d_version = FS_DQUOT_VERSION;
1507 fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA; 1498 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1508 fdq->d_id = id; 1499 fdq->d_id = id;
1509 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); 1500 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
1510 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); 1501 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
@@ -1539,12 +1530,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1539 switch(type) { 1530 switch(type) {
1540 case USRQUOTA: 1531 case USRQUOTA:
1541 type = QUOTA_USER; 1532 type = QUOTA_USER;
1542 if (fdq->d_flags != XFS_USER_QUOTA) 1533 if (fdq->d_flags != FS_USER_QUOTA)
1543 return -EINVAL; 1534 return -EINVAL;
1544 break; 1535 break;
1545 case GRPQUOTA: 1536 case GRPQUOTA:
1546 type = QUOTA_GROUP; 1537 type = QUOTA_GROUP;
1547 if (fdq->d_flags != XFS_GROUP_QUOTA) 1538 if (fdq->d_flags != FS_GROUP_QUOTA)
1548 return -EINVAL; 1539 return -EINVAL;
1549 break; 1540 break;
1550 default: 1541 default:
@@ -1584,10 +1575,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1584 goto out_i; 1575 goto out_i;
1585 1576
1586 offset = qd2offset(qd); 1577 offset = qd2offset(qd);
1587 error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota), 1578 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
1588 &alloc_required);
1589 if (error)
1590 goto out_i;
1591 if (alloc_required) { 1579 if (alloc_required) {
1592 al = gfs2_alloc_get(ip); 1580 al = gfs2_alloc_get(ip);
1593 if (al == NULL) 1581 if (al == NULL)
@@ -1598,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1598 error = gfs2_inplace_reserve(ip); 1586 error = gfs2_inplace_reserve(ip);
1599 if (error) 1587 if (error)
1600 goto out_alloc; 1588 goto out_alloc;
1589 blocks += gfs2_rg_blocks(al);
1601 } 1590 }
1602 1591
1603 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1592 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 4b9bece3d43..f2a02edcac8 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,7 +14,6 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h> 16#include <linux/crc32.h>
17#include <linux/slow-work.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
20#include "incore.h" 19#include "incore.h"
@@ -28,6 +27,8 @@
28#include "util.h" 27#include "util.h"
29#include "dir.h" 28#include "dir.h"
30 29
30struct workqueue_struct *gfs_recovery_wq;
31
31int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 32int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
32 struct buffer_head **bh) 33 struct buffer_head **bh)
33{ 34{
@@ -443,23 +444,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
443 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 444 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
444} 445}
445 446
446static int gfs2_recover_get_ref(struct slow_work *work) 447void gfs2_recover_func(struct work_struct *work)
447{
448 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
449 if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
450 return -EBUSY;
451 return 0;
452}
453
454static void gfs2_recover_put_ref(struct slow_work *work)
455{
456 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
457 clear_bit(JDF_RECOVERY, &jd->jd_flags);
458 smp_mb__after_clear_bit();
459 wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
460}
461
462static void gfs2_recover_work(struct slow_work *work)
463{ 448{
464 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); 449 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
465 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 450 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -470,11 +455,13 @@ static void gfs2_recover_work(struct slow_work *work)
470 int ro = 0; 455 int ro = 0;
471 unsigned int pass; 456 unsigned int pass;
472 int error; 457 int error;
458 int jlocked = 0;
473 459
474 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 460 if (sdp->sd_args.ar_spectator ||
461 (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
475 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", 462 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
476 jd->jd_jid); 463 jd->jd_jid);
477 464 jlocked = 1;
478 /* Acquire the journal lock so we can do recovery */ 465 /* Acquire the journal lock so we can do recovery */
479 466
480 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, 467 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -569,53 +556,55 @@ static void gfs2_recover_work(struct slow_work *work)
569 jd->jd_jid, t); 556 jd->jd_jid, t);
570 } 557 }
571 558
572 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
573 gfs2_glock_dq_uninit(&ji_gh);
574
575 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); 559 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
576 560
577 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 561 if (jlocked) {
562 gfs2_glock_dq_uninit(&ji_gh);
578 gfs2_glock_dq_uninit(&j_gh); 563 gfs2_glock_dq_uninit(&j_gh);
564 }
579 565
580 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 566 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
581 return; 567 goto done;
582 568
583fail_gunlock_tr: 569fail_gunlock_tr:
584 gfs2_glock_dq_uninit(&t_gh); 570 gfs2_glock_dq_uninit(&t_gh);
585fail_gunlock_ji: 571fail_gunlock_ji:
586 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 572 if (jlocked) {
587 gfs2_glock_dq_uninit(&ji_gh); 573 gfs2_glock_dq_uninit(&ji_gh);
588fail_gunlock_j: 574fail_gunlock_j:
589 gfs2_glock_dq_uninit(&j_gh); 575 gfs2_glock_dq_uninit(&j_gh);
590 } 576 }
591 577
592 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); 578 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
593
594fail: 579fail:
595 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); 580 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
581done:
582 clear_bit(JDF_RECOVERY, &jd->jd_flags);
583 smp_mb__after_clear_bit();
584 wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
596} 585}
597 586
598struct slow_work_ops gfs2_recover_ops = {
599 .owner = THIS_MODULE,
600 .get_ref = gfs2_recover_get_ref,
601 .put_ref = gfs2_recover_put_ref,
602 .execute = gfs2_recover_work,
603};
604
605
606static int gfs2_recovery_wait(void *word) 587static int gfs2_recovery_wait(void *word)
607{ 588{
608 schedule(); 589 schedule();
609 return 0; 590 return 0;
610} 591}
611 592
612int gfs2_recover_journal(struct gfs2_jdesc *jd) 593int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
613{ 594{
614 int rv; 595 int rv;
615 rv = slow_work_enqueue(&jd->jd_work); 596
616 if (rv) 597 if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
617 return rv; 598 return -EBUSY;
618 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); 599
600 /* we have JDF_RECOVERY, queue should always succeed */
601 rv = queue_work(gfs_recovery_wq, &jd->jd_work);
602 BUG_ON(!rv);
603
604 if (wait)
605 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
606 TASK_UNINTERRUPTIBLE);
607
619 return 0; 608 return 0;
620} 609}
621 610
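[Editor's note] Recovery moves from the slow-work infrastructure (being removed from the kernel) to a plain workqueue: the JDF_RECOVERY bit now provides the run-once guarantee that gfs2_recover_get_ref() used to, and the wake-up that lived in put_ref sits at the tail of gfs2_recover_func(). The setup side is not in this hunk; beyond the gfs_recovery_wq and gfs2_recover_func names declared above, the following is assumed:

	/* module init: create the queue that recover_store() and
	 * gfs2_recover_journal() enqueue on */
	static int __init gfs2_recovery_wq_init(void)
	{
		gfs_recovery_wq = create_workqueue("gfs_recovery");
		return gfs_recovery_wq ? 0 : -ENOMEM;
	}

	/* journal-descriptor setup: bind the work item to the handler */
	static void gfs2_jdesc_work_init(struct gfs2_jdesc *jd)
	{
		INIT_WORK(&jd->jd_work, gfs2_recover_func);
	}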
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 1616ac22569..2226136c764 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -12,6 +12,8 @@
12 12
13#include "incore.h" 13#include "incore.h"
14 14
15extern struct workqueue_struct *gfs_recovery_wq;
16
15static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) 17static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
16{ 18{
17 if (++*blk == sdp->sd_jdesc->jd_blocks) 19 if (++*blk == sdp->sd_jdesc->jd_blocks)
@@ -27,8 +29,8 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
27 29
28extern int gfs2_find_jhead(struct gfs2_jdesc *jd, 30extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header_host *head); 31 struct gfs2_log_header_host *head);
30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); 32extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
31extern struct slow_work_ops gfs2_recover_ops; 33extern void gfs2_recover_func(struct work_struct *work);
32 34
33#endif /* __RECOVERY_DOT_H__ */ 35#endif /* __RECOVERY_DOT_H__ */
34 36
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e4..bef3ab6cf5c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) 503 if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
590 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
591 u64 rgrp_count = ip->i_disksize; 591 u64 rgrp_count = i_size_read(inode);
592 struct gfs2_rgrpd *rgd;
593 unsigned int max_data = 0;
592 int error; 594 int error;
593 595
594 do_div(rgrp_count, sizeof(struct gfs2_rindex)); 596 do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
603 } 605 }
604 } 606 }
605 607
608 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
609 if (rgd->rd_data > max_data)
610 max_data = rgd->rd_data;
611 sdp->sd_max_rg_data = max_data;
606 sdp->sd_rindex_uptodate = 1; 612 sdp->sd_rindex_uptodate = 1;
607 return 0; 613 return 0;
608} 614}
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
622 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 628 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
623 struct inode *inode = &ip->i_inode; 629 struct inode *inode = &ip->i_inode;
624 struct file_ra_state ra_state; 630 struct file_ra_state ra_state;
631 struct gfs2_rgrpd *rgd;
632 unsigned int max_data = 0;
625 int error; 633 int error;
626 634
627 file_ra_state_init(&ra_state, inode->i_mapping); 635 file_ra_state_init(&ra_state, inode->i_mapping);
628 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { 636 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
629 /* Ignore partials */ 637 /* Ignore partials */
630 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > 638 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
631 ip->i_disksize) 639 i_size_read(inode))
632 break; 640 break;
633 error = read_rindex_entry(ip, &ra_state); 641 error = read_rindex_entry(ip, &ra_state);
634 if (error) { 642 if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
636 return error; 644 return error;
637 } 645 }
638 } 646 }
647 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
648 if (rgd->rd_data > max_data)
649 max_data = rgd->rd_data;
650 sdp->sd_max_rg_data = max_data;
639 651
640 sdp->sd_rindex_uptodate = 1; 652 sdp->sd_rindex_uptodate = 1;
641 return 0; 653 return 0;
@@ -854,8 +866,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 866 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 867 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 868 nr_sects, GFP_NOFS,
857 BLKDEV_IFL_WAIT | 869 0);
858 BLKDEV_IFL_BARRIER);
859 if (rv) 870 if (rv)
860 goto fail; 871 goto fail;
861 nr_sects = 0; 872 nr_sects = 0;
@@ -869,8 +880,7 @@ start_new_extent:
869 } 880 }
870 } 881 }
871 if (nr_sects) { 882 if (nr_sects) {
872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 883 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
874 if (rv) 884 if (rv)
875 goto fail; 885 goto fail;
876 } 886 }
@@ -1188,7 +1198,8 @@ out:
1188 * Returns: errno 1198 * Returns: errno
1189 */ 1199 */
1190 1200
1191int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) 1201int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1202 char *file, unsigned int line)
1192{ 1203{
1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1204 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1194 struct gfs2_alloc *al = ip->i_alloc; 1205 struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1210,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1199 return -EINVAL; 1210 return -EINVAL;
1200 1211
1201try_again: 1212try_again:
1202 /* We need to hold the rindex unless the inode we're using is 1213 if (hold_rindex) {
1203 the rindex itself, in which case it's already held. */ 1214 /* We need to hold the rindex unless the inode we're using is
1204 if (ip != GFS2_I(sdp->sd_rindex)) 1215 the rindex itself, in which case it's already held. */
1205 error = gfs2_rindex_hold(sdp, &al->al_ri_gh); 1216 if (ip != GFS2_I(sdp->sd_rindex))
1206 else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ 1217 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1207 error = gfs2_ri_update_special(ip); 1218 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1219 in, so: */
1220 error = gfs2_ri_update_special(ip);
1221 }
1208 1222
1209 if (error) 1223 if (error)
1210 return error; 1224 return error;
@@ -1215,7 +1229,7 @@ try_again:
1215 try to free it, and try the allocation again. */ 1229 try to free it, and try the allocation again. */
1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked); 1230 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1217 if (error) { 1231 if (error) {
1218 if (ip != GFS2_I(sdp->sd_rindex)) 1232 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1219 gfs2_glock_dq_uninit(&al->al_ri_gh); 1233 gfs2_glock_dq_uninit(&al->al_ri_gh);
1220 if (error != -EAGAIN) 1234 if (error != -EAGAIN)
1221 return error; 1235 return error;
@@ -1257,7 +1271,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1257 al->al_rgd = NULL; 1271 al->al_rgd = NULL;
1258 if (al->al_rgd_gh.gh_gl) 1272 if (al->al_rgd_gh.gh_gl)
1259 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1273 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1260 if (ip != GFS2_I(sdp->sd_rindex)) 1274 if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
1261 gfs2_glock_dq_uninit(&al->al_ri_gh); 1275 gfs2_glock_dq_uninit(&al->al_ri_gh);
1262} 1276}
1263 1277
@@ -1496,11 +1510,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1496 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1510 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1497 struct buffer_head *dibh; 1511 struct buffer_head *dibh;
1498 struct gfs2_alloc *al = ip->i_alloc; 1512 struct gfs2_alloc *al = ip->i_alloc;
1499 struct gfs2_rgrpd *rgd = al->al_rgd; 1513 struct gfs2_rgrpd *rgd;
1500 u32 goal, blk; 1514 u32 goal, blk;
1501 u64 block; 1515 u64 block;
1502 int error; 1516 int error;
1503 1517
1518 /* Only happens if there is a bug in gfs2; return something distinctive
1519 * to ensure that it is noticed.
1520 */
1521 if (al == NULL)
1522 return -ECANCELED;
1523
1524 rgd = al->al_rgd;
1525
1504 if (rgrp_contains_block(rgd, ip->i_goal)) 1526 if (rgrp_contains_block(rgd, ip->i_goal))
1505 goal = ip->i_goal - rgd->rd_data0; 1527 goal = ip->i_goal - rgd->rd_data0;
1506 else 1528 else
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d8955..0e35c0466f9 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
39 ip->i_alloc = NULL; 39 ip->i_alloc = NULL;
40} 40}
41 41
42extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, 42extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
43 unsigned int line); 43 char *file, unsigned int line);
44#define gfs2_inplace_reserve(ip) \ 44#define gfs2_inplace_reserve(ip) \
45gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) 45 gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
46#define gfs2_inplace_reserve_ri(ip) \
47 gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
46 48
47extern void gfs2_inplace_release(struct gfs2_inode *ip); 49extern void gfs2_inplace_release(struct gfs2_inode *ip);
48 50
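[Editor's note] The new hold_rindex argument exists for callers that must take the rindex glock before their other locks. gfs2_rename() above is the motivating case: it calls gfs2_rindex_hold() first to fix the lock ordering, then reserves with the _ri variant so gfs2_inplace_reserve_i() does not try to take the rindex a second time. Abbreviated to its essentials:

	static int example_reserve(struct gfs2_sbd *sdp, struct gfs2_inode *ip,
				   struct gfs2_holder *ri_gh)
	{
		int error = gfs2_rindex_hold(sdp, ri_gh);	/* rindex first */
		if (error)
			return error;
		/* other glocks are queued here, as in gfs2_rename() */
		return gfs2_inplace_reserve_ri(ip);	/* skips the rindex hold */
	}

The matching gfs2_inplace_release() change (checking al_ri_gh.gh_gl before dequeuing) keeps release safe for both variants.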
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 4d1aad38f1b..2b2c4997430 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
85 {Opt_locktable, "locktable=%s"}, 85 {Opt_locktable, "locktable=%s"},
86 {Opt_hostdata, "hostdata=%s"}, 86 {Opt_hostdata, "hostdata=%s"},
87 {Opt_spectator, "spectator"}, 87 {Opt_spectator, "spectator"},
88 {Opt_spectator, "norecovery"},
88 {Opt_ignore_local_fs, "ignore_local_fs"}, 89 {Opt_ignore_local_fs, "ignore_local_fs"},
89 {Opt_localflocks, "localflocks"}, 90 {Opt_localflocks, "localflocks"},
90 {Opt_localcaching, "localcaching"}, 91 {Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
159 args->ar_spectator = 1; 160 args->ar_spectator = 1;
160 break; 161 break;
161 case Opt_ignore_local_fs: 162 case Opt_ignore_local_fs:
162 args->ar_ignore_local_fs = 1; 163 /* Retained for backwards compat only */
163 break; 164 break;
164 case Opt_localflocks: 165 case Opt_localflocks:
165 args->ar_localflocks = 1; 166 args->ar_localflocks = 1;
166 break; 167 break;
167 case Opt_localcaching: 168 case Opt_localcaching:
168 args->ar_localcaching = 1; 169 /* Retained for backwards compat only */
169 break; 170 break;
170 case Opt_debug: 171 case Opt_debug:
171 if (args->ar_errors == GFS2_ERRORS_PANIC) { 172 if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
179 args->ar_debug = 0; 180 args->ar_debug = 0;
180 break; 181 break;
181 case Opt_upgrade: 182 case Opt_upgrade:
182 args->ar_upgrade = 1; 183 /* Retained for backwards compat only */
183 break; 184 break;
184 case Opt_acl: 185 case Opt_acl:
185 args->ar_posix_acl = 1; 186 args->ar_posix_acl = 1;
@@ -342,23 +343,19 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
342{ 343{
343 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 344 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
344 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 345 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
345 int ar; 346 u64 size = i_size_read(jd->jd_inode);
346 int error;
347 347
348 if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || 348 if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
349 (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
350 gfs2_consist_inode(ip);
351 return -EIO; 349 return -EIO;
352 }
353 jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
354 350
355 error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar); 351 jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
356 if (!error && ar) { 352
353 if (gfs2_write_alloc_required(ip, 0, size)) {
357 gfs2_consist_inode(ip); 354 gfs2_consist_inode(ip);
358 error = -EIO; 355 return -EIO;
359 } 356 }
360 357
361 return error; 358 return 0;
362} 359}
363 360
364/** 361/**
@@ -860,7 +857,6 @@ restart:
860 gfs2_clear_rgrpd(sdp); 857 gfs2_clear_rgrpd(sdp);
861 gfs2_jindex_free(sdp); 858 gfs2_jindex_free(sdp);
862 /* Take apart glock structures and buffer lists */ 859 /* Take apart glock structures and buffer lists */
863 invalidate_inodes(sdp->sd_vfs);
864 gfs2_gl_hash_clear(sdp); 860 gfs2_gl_hash_clear(sdp);
865 /* Unmount the locking protocol */ 861 /* Unmount the locking protocol */
866 gfs2_lm_unmount(sdp); 862 gfs2_lm_unmount(sdp);
@@ -1132,9 +1128,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1132 1128
1133 /* Some flags must not be changed */ 1129 /* Some flags must not be changed */
1134 if (args_neq(&args, &sdp->sd_args, spectator) || 1130 if (args_neq(&args, &sdp->sd_args, spectator) ||
1135 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
1136 args_neq(&args, &sdp->sd_args, localflocks) || 1131 args_neq(&args, &sdp->sd_args, localflocks) ||
1137 args_neq(&args, &sdp->sd_args, localcaching) ||
1138 args_neq(&args, &sdp->sd_args, meta)) 1132 args_neq(&args, &sdp->sd_args, meta))
1139 return -EINVAL; 1133 return -EINVAL;
1140 1134
@@ -1191,7 +1185,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1191 * node for later deallocation. 1185 * node for later deallocation.
1192 */ 1186 */
1193 1187
1194static void gfs2_drop_inode(struct inode *inode) 1188static int gfs2_drop_inode(struct inode *inode)
1195{ 1189{
1196 struct gfs2_inode *ip = GFS2_I(inode); 1190 struct gfs2_inode *ip = GFS2_I(inode);
1197 1191
@@ -1200,26 +1194,7 @@ static void gfs2_drop_inode(struct inode *inode)
1200 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1194 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1201 clear_nlink(inode); 1195 clear_nlink(inode);
1202 } 1196 }
1203 generic_drop_inode(inode); 1197 return generic_drop_inode(inode);
1204}
1205
1206/**
1207 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
1208 * @inode: The VFS inode
1209 *
1210 */
1211
1212static void gfs2_clear_inode(struct inode *inode)
1213{
1214 struct gfs2_inode *ip = GFS2_I(inode);
1215
1216 ip->i_gl->gl_object = NULL;
1217 gfs2_glock_put(ip->i_gl);
1218 ip->i_gl = NULL;
1219 if (ip->i_iopen_gh.gh_gl) {
1220 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1221 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1222 }
1223} 1198}
1224 1199
1225static int is_ancestor(const struct dentry *d1, const struct dentry *d2) 1200static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
@@ -1256,16 +1231,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1256 seq_printf(s, ",hostdata=%s", args->ar_hostdata); 1231 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1257 if (args->ar_spectator) 1232 if (args->ar_spectator)
1258 seq_printf(s, ",spectator"); 1233 seq_printf(s, ",spectator");
1259 if (args->ar_ignore_local_fs)
1260 seq_printf(s, ",ignore_local_fs");
1261 if (args->ar_localflocks) 1234 if (args->ar_localflocks)
1262 seq_printf(s, ",localflocks"); 1235 seq_printf(s, ",localflocks");
1263 if (args->ar_localcaching)
1264 seq_printf(s, ",localcaching");
1265 if (args->ar_debug) 1236 if (args->ar_debug)
1266 seq_printf(s, ",debug"); 1237 seq_printf(s, ",debug");
1267 if (args->ar_upgrade)
1268 seq_printf(s, ",upgrade");
1269 if (args->ar_posix_acl) 1238 if (args->ar_posix_acl)
1270 seq_printf(s, ",acl"); 1239 seq_printf(s, ",acl");
1271 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 1240 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -1347,13 +1316,16 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1347 * is safe, just less efficient. 1316 * is safe, just less efficient.
1348 */ 1317 */
1349 1318
1350static void gfs2_delete_inode(struct inode *inode) 1319static void gfs2_evict_inode(struct inode *inode)
1351{ 1320{
1352 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; 1321 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
1353 struct gfs2_inode *ip = GFS2_I(inode); 1322 struct gfs2_inode *ip = GFS2_I(inode);
1354 struct gfs2_holder gh; 1323 struct gfs2_holder gh;
1355 int error; 1324 int error;
1356 1325
1326 if (inode->i_nlink)
1327 goto out;
1328
1357 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 1329 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1358 if (unlikely(error)) { 1330 if (unlikely(error)) {
1359 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 1331 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
@@ -1407,10 +1379,18 @@ out_unlock:
1407 gfs2_holder_uninit(&ip->i_iopen_gh); 1379 gfs2_holder_uninit(&ip->i_iopen_gh);
1408 gfs2_glock_dq_uninit(&gh); 1380 gfs2_glock_dq_uninit(&gh);
1409 if (error && error != GLR_TRYFAILED && error != -EROFS) 1381 if (error && error != GLR_TRYFAILED && error != -EROFS)
1410 fs_warn(sdp, "gfs2_delete_inode: %d\n", error); 1382 fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
1411out: 1383out:
1412 truncate_inode_pages(&inode->i_data, 0); 1384 truncate_inode_pages(&inode->i_data, 0);
1413 clear_inode(inode); 1385 end_writeback(inode);
1386
1387 ip->i_gl->gl_object = NULL;
1388 gfs2_glock_put(ip->i_gl);
1389 ip->i_gl = NULL;
1390 if (ip->i_iopen_gh.gh_gl) {
1391 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1392 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1393 }
1414} 1394}
1415 1395
1416static struct inode *gfs2_alloc_inode(struct super_block *sb) 1396static struct inode *gfs2_alloc_inode(struct super_block *sb)
@@ -1434,14 +1414,13 @@ const struct super_operations gfs2_super_ops = {
1434 .alloc_inode = gfs2_alloc_inode, 1414 .alloc_inode = gfs2_alloc_inode,
1435 .destroy_inode = gfs2_destroy_inode, 1415 .destroy_inode = gfs2_destroy_inode,
1436 .write_inode = gfs2_write_inode, 1416 .write_inode = gfs2_write_inode,
1437 .delete_inode = gfs2_delete_inode, 1417 .evict_inode = gfs2_evict_inode,
1438 .put_super = gfs2_put_super, 1418 .put_super = gfs2_put_super,
1439 .sync_fs = gfs2_sync_fs, 1419 .sync_fs = gfs2_sync_fs,
1440 .freeze_fs = gfs2_freeze, 1420 .freeze_fs = gfs2_freeze,
1441 .unfreeze_fs = gfs2_unfreeze, 1421 .unfreeze_fs = gfs2_unfreeze,
1442 .statfs = gfs2_statfs, 1422 .statfs = gfs2_statfs,
1443 .remount_fs = gfs2_remount_fs, 1423 .remount_fs = gfs2_remount_fs,
1444 .clear_inode = gfs2_clear_inode,
1445 .drop_inode = gfs2_drop_inode, 1424 .drop_inode = gfs2_drop_inode,
1446 .show_options = gfs2_show_options, 1425 .show_options = gfs2_show_options,
1447}; 1426};
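[Editor's note] gfs2_delete_inode() and gfs2_clear_inode() fold into a single gfs2_evict_inode(), matching the VFS switch from the delete_inode/clear_inode pair to one ->evict_inode() called for every inode being torn down; the i_nlink check at the top routes still-linked inodes straight to the page-cache truncation and glock release that clear_inode used to perform. The generic contract, sketched:

	static void example_evict_inode(struct inode *inode)
	{
		if (!inode->i_nlink) {
			/* on-disk deallocation of the unlinked inode */
		}
		truncate_inode_pages(&inode->i_data, 0);
		end_writeback(inode);		/* replaces clear_inode() */
		/* drop filesystem-private references last */
	}

The related drop_inode change (void to int) lets gfs2_drop_inode() return generic_drop_inode()'s verdict instead of duplicating its logic.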
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 37f5393e68e..748ccb557c1 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -25,6 +25,7 @@
25#include "quota.h" 25#include "quota.h"
26#include "util.h" 26#include "util.h"
27#include "glops.h" 27#include "glops.h"
28#include "recovery.h"
28 29
29struct gfs2_attr { 30struct gfs2_attr {
30 struct attribute attr; 31 struct attribute attr;
@@ -229,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
229 230
230 if (gltype > LM_TYPE_JOURNAL) 231 if (gltype > LM_TYPE_JOURNAL)
231 return -EINVAL; 232 return -EINVAL;
232 glops = gfs2_glops_list[gltype]; 233 if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
234 glops = &gfs2_trans_glops;
235 else
236 glops = gfs2_glops_list[gltype];
233 if (glops == NULL) 237 if (glops == NULL)
234 return -EINVAL; 238 return -EINVAL;
235 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) 239 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -325,6 +329,30 @@ static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
325 return sprintf(buf, "%d\n", ls->ls_first); 329 return sprintf(buf, "%d\n", ls->ls_first);
326} 330}
327 331
332static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
333{
334 unsigned first;
335 int rv;
336
337 rv = sscanf(buf, "%u", &first);
338 if (rv != 1 || first > 1)
339 return -EINVAL;
340 spin_lock(&sdp->sd_jindex_spin);
341 rv = -EBUSY;
342 if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
343 goto out;
344 rv = -EINVAL;
345 if (sdp->sd_args.ar_spectator)
346 goto out;
347 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
348 goto out;
349 sdp->sd_lockstruct.ls_first = first;
350 rv = 0;
351out:
352 spin_unlock(&sdp->sd_jindex_spin);
353 return rv ? rv : len;
354}
355
328static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) 356static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
329{ 357{
330 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 358 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -352,7 +380,7 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
352 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { 380 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
353 if (jd->jd_jid != jid) 381 if (jd->jd_jid != jid)
354 continue; 382 continue;
355 rv = slow_work_enqueue(&jd->jd_work); 383 rv = gfs2_recover_journal(jd, false);
356 break; 384 break;
357 } 385 }
358out: 386out:
@@ -374,7 +402,35 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
374 402
375static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) 403static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
376{ 404{
377 return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); 405 return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
406}
407
408static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
409{
410 int jid;
411 int rv;
412
413 rv = sscanf(buf, "%d", &jid);
414 if (rv != 1)
415 return -EINVAL;
416
417 spin_lock(&sdp->sd_jindex_spin);
418 rv = -EINVAL;
419 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
420 goto out;
421 rv = -EBUSY;
422 if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
423 goto out;
424 rv = 0;
425 if (sdp->sd_args.ar_spectator && jid > 0)
426 rv = jid = -EINVAL;
427 sdp->sd_lockstruct.ls_jid = jid;
428 clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
429 smp_mb__after_clear_bit();
430 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
431out:
432 spin_unlock(&sdp->sd_jindex_spin);
433 return rv ? rv : len;
378} 434}
379 435
380#define GDLM_ATTR(_name,_mode,_show,_store) \ 436#define GDLM_ATTR(_name,_mode,_show,_store) \
@@ -383,8 +439,8 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
383GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); 439GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
384GDLM_ATTR(block, 0644, block_show, block_store); 440GDLM_ATTR(block, 0644, block_show, block_store);
385GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 441GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
386GDLM_ATTR(jid, 0444, jid_show, NULL); 442GDLM_ATTR(jid, 0644, jid_show, jid_store);
387GDLM_ATTR(first, 0444, lkfirst_show, NULL); 443GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store);
388GDLM_ATTR(first_done, 0444, first_done_show, NULL); 444GDLM_ATTR(first_done, 0444, first_done_show, NULL);
389GDLM_ATTR(recover, 0600, NULL, recover_store); 445GDLM_ATTR(recover, 0600, NULL, recover_store);
390GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); 446GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
@@ -564,8 +620,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
564 620
565 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 621 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
566 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 622 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
567 if (!sdp->sd_args.ar_spectator) 623 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
568 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); 624 add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
569 if (gfs2_uuid_valid(uuid)) 625 if (gfs2_uuid_valid(uuid))
570 add_uevent_var(env, "UUID=%pUB", uuid); 626 add_uevent_var(env, "UUID=%pUB", uuid);
571 return 0; 627 return 0;
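[Editor's note] Making jid and first writable shifts journal-ID assignment from mount arguments to sysfs: the mount uevent (see gfs2_uevent above, which now keys on SDF_NOJOURNALID rather than spectator mode) tells the cluster control daemon that no journal ID is set, and the daemon writes one back, e.g. echo 0 > /sys/fs/gfs2/<locktable>/lock_module/jid, which clears SDF_NOJOURNALID and wakes the waiting mount. Both stores refuse the update once a journal ID exists (-EBUSY) or when the store makes no sense for the mount, such as a nonzero jid on a spectator.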
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c1417..cedb0bb96d9 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ 39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
40 {(1UL << GLF_REPLY_PENDING), "r" }, \ 40 {(1UL << GLF_REPLY_PENDING), "r" }, \
41 {(1UL << GLF_INITIAL), "I" }, \ 41 {(1UL << GLF_INITIAL), "I" }, \
42 {(1UL << GLF_FROZEN), "F" }) 42 {(1UL << GLF_FROZEN), "F" }, \
43 {(1UL << GLF_QUEUED), "q" })
43 44
44#ifndef NUMPTY 45#ifndef NUMPTY
45#define NUMPTY 46#define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908..fb56b783e02 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
20#define RES_JDATA 1 20#define RES_JDATA 1
21#define RES_DATA 1 21#define RES_DATA 1
22#define RES_LEAF 1 22#define RES_LEAF 1
23#define RES_RG_HDR 1
23#define RES_RG_BIT 2 24#define RES_RG_BIT 2
24#define RES_EATTR 1 25#define RES_EATTR 1
25#define RES_STATFS 1 26#define RES_STATFS 1
26#define RES_QUOTA 2 27#define RES_QUOTA 2
27 28
29/* reserve either the number of blocks to be allocated plus the rg header
30 * block, or all of the blocks in the rg, whichever is smaller */
31static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
32{
33 return (al->al_requested < al->al_rgd->rd_length)?
34 al->al_requested + 1 : al->al_rgd->rd_length;
35}
36
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 37int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes); 38 unsigned int revokes);
30 39
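[Editor's note] As a worked example of gfs2_rg_blocks(): a transaction that requested 8 blocks from a resource group 100 blocks long reserves 9 (the allocation plus the rg header block), while a request of 200 from the same group is capped at the full 100. This is why the gfs2_trans_begin() call sites earlier in the patch could replace the pessimistic al->al_rgd->rd_length with this helper.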
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 82f93da00d1..30b58f07c8a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
                 goto out_gunlock_q;
 
         error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-                                 blks + al->al_rgd->rd_length +
+                                 blks + gfs2_rg_blocks(al) +
                                  RES_DINODE + RES_STATFS + RES_QUOTA, 0);
         if (error)
                 goto out_ipres;
@@ -1296,6 +1296,7 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
+        struct inode *inode = &ip->i_inode;
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct gfs2_ea_location el;
         struct buffer_head *dibh;
@@ -1321,14 +1322,25 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
                 return error;
 
         error = gfs2_meta_inode_buffer(ip, &dibh);
-        if (!error) {
-                error = inode_setattr(&ip->i_inode, attr);
-                gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-                gfs2_dinode_out(ip, dibh->b_data);
-                brelse(dibh);
+        if (error)
+                goto out_trans_end;
+
+        if ((attr->ia_valid & ATTR_SIZE) &&
+            attr->ia_size != i_size_read(inode)) {
+                int error;
+
+                error = vmtruncate(inode, attr->ia_size);
+                gfs2_assert_warn(GFS2_SB(inode), !error);
         }
 
+        setattr_copy(inode, attr);
+        mark_inode_dirty(inode);
+
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(ip, dibh->b_data);
+        brelse(dibh);
+
+out_trans_end:
         gfs2_trans_end(sdp);
         return error;
 }
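The gfs2_xattr_acl_chmod() change above follows the kernel-wide removal of the one-shot inode_setattr(): callers now truncate explicitly with vmtruncate(), copy the remaining attributes with setattr_copy(), and mark the inode dirty themselves. A userspace model of that ordering, with simplified stand-in types (the _stub helpers are illustrative, not kernel APIs):

/* Userspace model of the new setattr sequence: truncate first, then copy
 * attributes, then mark the inode dirty. */
#include <stdio.h>

#define ATTR_SIZE 0x1
#define ATTR_MODE 0x2

struct mini_inode { long long i_size; unsigned int i_mode; int dirty; };
struct mini_iattr {
        unsigned int ia_valid;
        long long ia_size;
        unsigned int ia_mode;
};

static int vmtruncate_stub(struct mini_inode *inode, long long size)
{
        inode->i_size = size;   /* the real code also drops pages/blocks */
        return 0;
}

static void setattr_copy_stub(struct mini_inode *inode,
                              const struct mini_iattr *attr)
{
        if (attr->ia_valid & ATTR_MODE)
                inode->i_mode = attr->ia_mode;
        /* uid/gid/timestamps omitted in this sketch */
}

static int do_setattr(struct mini_inode *inode, struct mini_iattr *attr)
{
        if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
                int error = vmtruncate_stub(inode, attr->ia_size);
                if (error)
                        return error;
        }
        setattr_copy_stub(inode, attr);
        inode->dirty = 1;       /* mark_inode_dirty() */
        return 0;
}

int main(void)
{
        struct mini_inode inode = { .i_size = 4096, .i_mode = 0644 };
        struct mini_iattr attr = { .ia_valid = ATTR_SIZE | ATTR_MODE,
                                   .ia_size = 0, .ia_mode = 0600 };

        do_setattr(&inode, &attr);
        printf("size=%lld mode=%o dirty=%d\n",
               inode.i_size, inode.i_mode, inode.dirty);
        return 0;
}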
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d..571abe97b42 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
         fd->search_key = ptr;
         fd->key = ptr + tree->max_key_len + 2;
         dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-        down(&tree->tree_lock);
+        mutex_lock(&tree->tree_lock);
         return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
         hfs_bnode_put(fd->bnode);
         kfree(fd->search_key);
         dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-        up(&fd->tree->tree_lock);
+        mutex_unlock(&fd->tree->tree_lock);
         fd->tree = NULL;
 }
 
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7..3ebc437736f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
         if (!tree)
                 return NULL;
 
-        init_MUTEX(&tree->tree_lock);
+        mutex_init(&tree->tree_lock);
         spin_lock_init(&tree->hash_lock);
         /* Set the correct compare function */
         tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21..2a1d712f85d 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
         unsigned int depth;
 
         //unsigned int map1_size, map_size;
-        struct semaphore tree_lock;
+        struct mutex tree_lock;
 
         unsigned int pages_per_bnode;
         spinlock_t hash_lock;
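Both HFS b-tree diffs above replace a counting semaphore that was only ever used for mutual exclusion (init_MUTEX()/down()/up()) with a struct mutex (mutex_init()/mutex_lock()/mutex_unlock()). A userspace analogue using pthreads, purely to illustrate the one-owner locking discipline the conversion assumes:

/* Userspace analogue of the semaphore-to-mutex conversion: a lock that is
 * only ever taken and released by a single owner at a time maps naturally
 * onto a mutex.  pthreads stand in for the kernel locking API here. */
#include <pthread.h>
#include <stdio.h>

struct btree_stub {
        pthread_mutex_t tree_lock;      /* was: counting semaphore */
        int cnid;
};

static void find_init(struct btree_stub *tree)
{
        pthread_mutex_lock(&tree->tree_lock);    /* was: down() */
}

static void find_exit(struct btree_stub *tree)
{
        pthread_mutex_unlock(&tree->tree_lock);  /* was: up() */
}

int main(void)
{
        struct btree_stub tree = { .cnid = 4 };

        pthread_mutex_init(&tree.tree_lock, NULL);  /* was: init_MUTEX() */
        find_init(&tree);
        printf("tree %d locked for search\n", tree.cnid);
        find_exit(&tree);
        pthread_mutex_destroy(&tree.tree_lock);
        return 0;
}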
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index fe35e3b626c..c8cffb81e84 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
         u16 blockoffset;
 
         int fs_div;
-
-        struct hlist_head rsrc_inodes;
 };
 
 #define HFS_FLG_BITMAP_DIRTY   0
@@ -193,7 +191,7 @@ extern int hfs_inode_setattr(struct dentry *, struct iattr *);
 extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
                 __be32 log_size, __be32 phys_size, u32 clump_size);
 extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *);
-extern void hfs_clear_inode(struct inode *);
+extern void hfs_evict_inode(struct inode *);
 extern void hfs_delete_inode(struct inode *);
 
 /* attr.c */
@@ -254,17 +252,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
         sb->s_dirt = 1;
 }
 
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
-        while (buffer_locked(bh)) {
-                wait_on_buffer(bh);
-        }
-        if (buffer_dirty(bh)) {
-                ll_rw_block(WRITE, 1, &bh);
-                wait_on_buffer(bh);
-        }
-}
-
 #define sb_bread512(sb, sec, data) ({ \
         struct buffer_head *__bh; \
         sector_t __block; \
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 14f5cb1b9fd..dffb4e99664 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -39,10 +39,19 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
                         loff_t pos, unsigned len, unsigned flags,
                         struct page **pagep, void **fsdata)
 {
+        int ret;
+
         *pagep = NULL;
-        return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+        ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                 hfs_get_block,
                                 &HFS_I(mapping->host)->phys_size);
+        if (unlikely(ret)) {
+                loff_t isize = mapping->host->i_size;
+                if (pos + len > isize)
+                        vmtruncate(mapping->host, isize);
+        }
+
+        return ret;
 }
 
 static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
@@ -112,9 +121,24 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
 {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+        ssize_t ret;
 
-        return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+        ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
                                   offset, nr_segs, hfs_get_block, NULL);
+
+        /*
+         * In case of error extending write may have instantiated a few
+         * blocks outside i_size. Trim these off again.
+         */
+        if (unlikely((rw & WRITE) && ret < 0)) {
+                loff_t isize = i_size_read(inode);
+                loff_t end = offset + iov_length(iov, nr_segs);
+
+                if (end > isize)
+                        vmtruncate(inode, isize);
+        }
+
+        return ret;
 }
 
 static int hfs_writepages(struct address_space *mapping,
@@ -500,15 +524,17 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
         HFS_I(inode)->rsrc_inode = dir;
         HFS_I(dir)->rsrc_inode = inode;
         igrab(dir);
-        hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+        hlist_add_fake(&inode->i_hash);
         mark_inode_dirty(inode);
 out:
         d_add(dentry, inode);
         return NULL;
 }
 
-void hfs_clear_inode(struct inode *inode)
+void hfs_evict_inode(struct inode *inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
+        end_writeback(inode);
         if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
                 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
                 iput(HFS_I(inode)->rsrc_inode);
@@ -588,13 +614,43 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
                 attr->ia_mode = inode->i_mode & ~S_IWUGO;
                 attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask;
         }
-        error = inode_setattr(inode, attr);
-        if (error)
-                return error;
 
+        if ((attr->ia_valid & ATTR_SIZE) &&
+            attr->ia_size != i_size_read(inode)) {
+                error = vmtruncate(inode, attr->ia_size);
+                if (error)
+                        return error;
+        }
+
+        setattr_copy(inode, attr);
+        mark_inode_dirty(inode);
         return 0;
 }
 
+static int hfs_file_fsync(struct file *filp, int datasync)
+{
+        struct inode *inode = filp->f_mapping->host;
+        struct super_block * sb;
+        int ret, err;
+
+        /* sync the inode to buffers */
+        ret = write_inode_now(inode, 0);
+
+        /* sync the superblock to buffers */
+        sb = inode->i_sb;
+        if (sb->s_dirt) {
+                lock_super(sb);
+                sb->s_dirt = 0;
+                if (!(sb->s_flags & MS_RDONLY))
+                        hfs_mdb_commit(sb);
+                unlock_super(sb);
+        }
+        /* .. finally sync the buffers to disk */
+        err = sync_blockdev(sb->s_bdev);
+        if (!ret)
+                ret = err;
+        return ret;
+}
 
 static const struct file_operations hfs_file_operations = {
         .llseek         = generic_file_llseek,
@@ -604,7 +660,7 @@ static const struct file_operations hfs_file_operations = {
         .aio_write      = generic_file_aio_write,
         .mmap           = generic_file_mmap,
         .splice_read    = generic_file_splice_read,
-        .fsync          = file_fsync,
+        .fsync          = hfs_file_fsync,
         .open           = hfs_file_open,
         .release        = hfs_file_release,
 };
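hfs_file_fsync() replaces the generic file_fsync() helper and makes the flush order explicit: the inode first, then the dirty superblock (MDB), then the block device, with the first failure winning. A loose userspace analogue of that inner-before-outer ordering, using fsync() on a file and then on its directory; the mapping to write_inode_now()/hfs_mdb_commit()/sync_blockdev() is only by analogy:

/* Userspace analogue of the hfs_file_fsync() ordering: flush the object
 * itself first, then its metadata container.  The first error wins, as
 * in the diff above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int ret = 0, err;
        int fd = open("data.tmp", O_CREAT | O_WRONLY, 0644);
        int dirfd = open(".", O_RDONLY);

        if (fd < 0 || dirfd < 0)
                return 1;
        if (write(fd, "x", 1) != 1)
                ret = -1;
        if (fsync(fd) < 0 && ret == 0)  /* sync the file ("inode") */
                ret = -1;
        err = fsync(dirfd);             /* then sync the container */
        if (err < 0 && ret == 0)
                ret = err;
        printf("synced, status %d\n", ret);
        close(fd);
        close(dirfd);
        return ret ? 1 : 0;
}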
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac99..1563d5ce576 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
                 mdb->drLsMod = hfs_mtime();
 
                 mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
-                hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+                sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
         }
 
         return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
                 HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
                 HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
                 mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
-                hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+                sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
         }
 
         if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 0a81eb7111f..4824c27cebb 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -20,7 +20,6 @@
 #include <linux/parser.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
 #include "hfs_fs.h"
@@ -79,15 +78,11 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
  */
 static void hfs_put_super(struct super_block *sb)
 {
-        lock_kernel();
-
         if (sb->s_dirt)
                 hfs_write_super(sb);
         hfs_mdb_close(sb);
         /* release the MDB's resources */
         hfs_mdb_put(sb);
-
-        unlock_kernel();
 }
 
 /*
@@ -181,7 +176,7 @@ static const struct super_operations hfs_super_operations = {
         .alloc_inode    = hfs_alloc_inode,
         .destroy_inode  = hfs_destroy_inode,
         .write_inode    = hfs_write_inode,
-        .clear_inode    = hfs_clear_inode,
+        .evict_inode    = hfs_evict_inode,
         .put_super      = hfs_put_super,
         .write_super    = hfs_write_super,
         .sync_fs        = hfs_sync_fs,
@@ -385,8 +380,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
         sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
         if (!sbi)
                 return -ENOMEM;
+
         sb->s_fs_info = sbi;
-        INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 
         res = -EINVAL;
         if (!parse_options((char *)data, sbi)) {
@@ -446,17 +441,16 @@ bail:
         return res;
 }
 
-static int hfs_get_sb(struct file_system_type *fs_type,
-                      int flags, const char *dev_name, void *data,
-                      struct vfsmount *mnt)
+static struct dentry *hfs_mount(struct file_system_type *fs_type,
+                      int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
 }
 
 static struct file_system_type hfs_fs_type = {
         .owner          = THIS_MODULE,
         .name           = "hfs",
-        .get_sb         = hfs_get_sb,
+        .mount          = hfs_mount,
         .kill_sb        = kill_block_super,
         .fs_flags       = FS_REQUIRES_DEV,
 };
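hfs follows the VFS conversion from .get_sb (an int-returning hook that filled in a struct vfsmount) to .mount (returning a struct dentry *). For reference, a sketch of the 2.6.37-era prototypes involved, paraphrased from include/linux/fs.h; this is a declaration excerpt under stated assumptions, not a standalone program, and members not shown are elided:

/* Declaration sketch only; paraphrased from include/linux/fs.h of this
 * era, not compilable on its own. */
struct dentry *mount_bdev(struct file_system_type *fs_type, int flags,
        const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int));

struct file_system_type {
        const char *name;
        int fs_flags;
        struct dentry *(*mount)(struct file_system_type *, int,
                const char *, void *);          /* replaces get_sb() */
        void (*kill_sb)(struct super_block *);
        /* ... */
};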
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be..d182438c7ae 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
         fd->search_key = ptr;
         fd->key = ptr + tree->max_key_len + 2;
         dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-        down(&tree->tree_lock);
+        mutex_lock(&tree->tree_lock);
         return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
         hfs_bnode_put(fd->bnode);
         kfree(fd->search_key);
         dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-        up(&fd->tree->tree_lock);
+        mutex_unlock(&fd->tree->tree_lock);
         fd->tree = NULL;
 }
 
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
                 rec = (e + b) / 2;
                 len = hfs_brec_lenoff(bnode, rec, &off);
                 keylen = hfs_brec_keylen(bnode, rec);
+                if (keylen == 0) {
+                        res = -EINVAL;
+                        goto fail;
+                }
                 hfs_bnode_read(bnode, fd->key, off, keylen);
                 cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
                 if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
         if (rec != e && e >= 0) {
                 len = hfs_brec_lenoff(bnode, e, &off);
                 keylen = hfs_brec_keylen(bnode, e);
+                if (keylen == 0) {
+                        res = -EINVAL;
+                        goto fail;
+                }
                 hfs_bnode_read(bnode, fd->key, off, keylen);
         }
 done:
@@ -75,6 +83,7 @@ done:
         fd->keylength = keylen;
         fd->entryoffset = off + keylen;
         fd->entrylength = len - keylen;
+fail:
         return res;
 }
 
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
 
                 len = hfs_brec_lenoff(bnode, fd->record, &off);
                 keylen = hfs_brec_keylen(bnode, fd->record);
+                if (keylen == 0) {
+                        res = -EINVAL;
+                        goto out;
+                }
                 fd->keyoffset = off;
                 fd->keylength = keylen;
                 fd->entryoffset = off + keylen;
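All three hfsplus lookup paths above now treat a zero hfs_brec_keylen() result as on-disk corruption and fail with -EINVAL rather than reading a zero-length key. A userspace model of the bounds-checked lookup (the node layout and types are stubs, not the real b-tree format):

/* Userspace model of the keylen validation added to the b-tree search
 * paths: a zero key length aborts the lookup with -EINVAL instead of
 * being handed to the node read. */
#include <stdio.h>
#include <string.h>

#define EINVAL 22

struct node_stub { unsigned short keylens[4]; const char *keys[4]; };

static unsigned short brec_keylen(const struct node_stub *n, int rec)
{
        return n->keylens[rec];         /* 0 means a corrupted record */
}

static int brec_find(const struct node_stub *n, int rec,
                     char *out, size_t outsz)
{
        unsigned short keylen = brec_keylen(n, rec);

        if (keylen == 0)
                return -EINVAL;         /* corruption: bail out */
        if (keylen >= outsz)
                return -EINVAL;
        memcpy(out, n->keys[rec], keylen);
        out[keylen] = '\0';
        return 0;
}

int main(void)
{
        struct node_stub n = { { 3, 0 }, { "abc", "" } };
        char buf[16];

        printf("rec 0 -> %d (%s)\n", brec_find(&n, 0, buf, sizeof(buf)), buf);
        printf("rec 1 -> %d (corrupt record)\n",
               brec_find(&n, 1, buf, sizeof(buf)));
        return 0;
}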
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03..ad57f5991eb 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
 
 int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
 {
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct page *page;
         struct address_space *mapping;
         __be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
                 return size;
 
         dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-        mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-        mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+        mutex_lock(&sbi->alloc_mutex);
+        mapping = sbi->alloc_file->i_mapping;
         page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
         if (IS_ERR(page)) {
                 start = size;
@@ -150,16 +151,17 @@ done:
         set_page_dirty(page);
         kunmap(page);
         *max = offset + (curr - pptr) * 32 + i - start;
-        HFSPLUS_SB(sb).free_blocks -= *max;
+        sbi->free_blocks -= *max;
         sb->s_dirt = 1;
         dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
 out:
-        mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+        mutex_unlock(&sbi->alloc_mutex);
         return start;
 }
 
 int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 {
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct page *page;
         struct address_space *mapping;
         __be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 
         dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
         /* are all of the bits in range? */
-        if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+        if ((offset + count) > sbi->total_blocks)
                 return -2;
 
-        mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-        mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+        mutex_lock(&sbi->alloc_mutex);
+        mapping = sbi->alloc_file->i_mapping;
         pnr = offset / PAGE_CACHE_BITS;
         page = read_mapping_page(mapping, pnr, NULL);
         pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
 out:
         set_page_dirty(page);
         kunmap(page);
-        HFSPLUS_SB(sb).free_blocks += len;
+        sbi->free_blocks += len;
         sb->s_dirt = 1;
-        mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+        mutex_unlock(&sbi->alloc_mutex);
 
         return 0;
 }
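The bitmap allocator stops borrowing the allocation file's i_mutex and takes a dedicated alloc_mutex in hfsplus_sb_info instead, with the repeated HFSPLUS_SB(sb) lookups hoisted into a local sbi pointer. A userspace sketch of the dedicated-lock pattern (the counts and types are invented for illustration):

/* Userspace sketch of the dedicated-lock pattern: instead of borrowing a
 * lock that belongs to another object (the allocation file's i_mutex),
 * the superblock info carries its own allocator lock. */
#include <pthread.h>
#include <stdio.h>

struct sbi_stub {
        pthread_mutex_t alloc_mutex;    /* dedicated allocator lock */
        unsigned int free_blocks;
};

static unsigned int block_allocate(struct sbi_stub *sbi, unsigned int want)
{
        unsigned int got;

        pthread_mutex_lock(&sbi->alloc_mutex);
        got = want < sbi->free_blocks ? want : sbi->free_blocks;
        sbi->free_blocks -= got;
        pthread_mutex_unlock(&sbi->alloc_mutex);
        return got;
}

int main(void)
{
        struct sbi_stub sbi = { .free_blocks = 128 };

        pthread_mutex_init(&sbi.alloc_mutex, NULL);
        printf("allocated %u, %u left\n",
               block_allocate(&sbi, 32), sbi.free_blocks);
        pthread_mutex_destroy(&sbi.alloc_mutex);
        return 0;
}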
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a40..2f39d05443e 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
                 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
                 if (!recoff)
                         return 0;
-                if (node->tree->attributes & HFS_TREE_BIGKEYS)
-                        retval = hfs_bnode_read_u16(node, recoff) + 2;
-                else
-                        retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+
+                retval = hfs_bnode_read_u16(node, recoff) + 2;
+                if (retval > node->tree->max_key_len + 2) {
+                        printk(KERN_ERR "hfs: keylen %d too large\n",
+                                retval);
+                        retval = 0;
+                }
         }
         return retval;
 }
@@ -216,7 +219,7 @@ skip:
 static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 {
         struct hfs_btree *tree;
-        struct hfs_bnode *node, *new_node;
+        struct hfs_bnode *node, *new_node, *next_node;
         struct hfs_bnode_desc node_desc;
         int num_recs, new_rec_off, new_off, old_rec_off;
         int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
         new_node->type = node->type;
         new_node->height = node->height;
 
+        if (node->next)
+                next_node = hfs_bnode_find(tree, node->next);
+        else
+                next_node = NULL;
+
+        if (IS_ERR(next_node)) {
+                hfs_bnode_put(node);
+                hfs_bnode_put(new_node);
+                return next_node;
+        }
+
         size = tree->node_size / 2 - node->num_recs * 2 - 14;
         old_rec_off = tree->node_size - 4;
         num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
                         /* panic? */
                         hfs_bnode_put(node);
                         hfs_bnode_put(new_node);
+                        if (next_node)
+                                hfs_bnode_put(next_node);
                         return ERR_PTR(-ENOSPC);
                 }
 
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
         hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
 
         /* update next bnode header */
-        if (new_node->next) {
-                struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+        if (next_node) {
                 next_node->prev = new_node->this;
                 hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
                 node_desc.prev = cpu_to_be32(next_node->prev);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e29..22e4d4e3299 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
         if (!tree)
                 return NULL;
 
-        init_MUTEX(&tree->tree_lock);
+        mutex_init(&tree->tree_lock);
         spin_lock_init(&tree->hash_lock);
         tree->sb = sb;
         tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
                 goto free_tree;
         tree->inode = inode;
 
+        if (!HFSPLUS_I(tree->inode)->first_blocks) {
+                printk(KERN_ERR
+                       "hfs: invalid btree extent records (0 size).\n");
+                goto free_inode;
+        }
+
         mapping = tree->inode->i_mapping;
         page = read_mapping_page(mapping, 0, NULL);
         if (IS_ERR(page))
-                goto free_tree;
+                goto free_inode;
 
         /* Load the header */
         head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
         tree->max_key_len = be16_to_cpu(head->max_key_len);
         tree->depth = be16_to_cpu(head->depth);
 
-        /* Set the correct compare function */
-        if (id == HFSPLUS_EXT_CNID) {
+        /* Verify the tree and set the correct compare function */
+        switch (id) {
+        case HFSPLUS_EXT_CNID:
+                if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
+                        printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+                                tree->max_key_len);
+                        goto fail_page;
+                }
+                if (tree->attributes & HFS_TREE_VARIDXKEYS) {
+                        printk(KERN_ERR "hfs: invalid extent btree flag\n");
+                        goto fail_page;
+                }
+
                 tree->keycmp = hfsplus_ext_cmp_key;
-        } else if (id == HFSPLUS_CAT_CNID) {
-                if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
+                break;
+        case HFSPLUS_CAT_CNID:
+                if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
+                        printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+                                tree->max_key_len);
+                        goto fail_page;
+                }
+                if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+                        printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+                        goto fail_page;
+                }
+
+                if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
                     (head->key_type == HFSPLUS_KEY_BINARY))
                         tree->keycmp = hfsplus_cat_bin_cmp_key;
                 else {
                         tree->keycmp = hfsplus_cat_case_cmp_key;
-                        HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
+                        set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
                 }
-        } else {
+                break;
+        default:
                 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
                 goto fail_page;
         }
 
+        if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
+                printk(KERN_ERR "hfs: invalid btree flag\n");
+                goto fail_page;
+        }
+
         size = tree->node_size;
         if (!is_power_of_2(size))
                 goto fail_page;
         if (!tree->node_count)
                 goto fail_page;
+
         tree->node_size_shift = ffs(size) - 1;
 
         tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
         return tree;
 
  fail_page:
-        tree->inode->i_mapping->a_ops = &hfsplus_aops;
         page_cache_release(page);
- free_tree:
+ free_inode:
+        tree->inode->i_mapping->a_ops = &hfsplus_aops;
         iput(tree->inode);
+ free_tree:
         kfree(tree);
         return NULL;
 }
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 
         while (!tree->free_nodes) {
                 struct inode *inode = tree->inode;
+                struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
                 u32 count;
                 int res;
 
                 res = hfsplus_file_extend(inode);
                 if (res)
                         return ERR_PTR(res);
-                HFSPLUS_I(inode).phys_size = inode->i_size =
-                        (loff_t)HFSPLUS_I(inode).alloc_blocks <<
-                        HFSPLUS_SB(tree->sb).alloc_blksz_shift;
-                HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks <<
-                        HFSPLUS_SB(tree->sb).fs_shift;
+                hip->phys_size = inode->i_size =
+                        (loff_t)hip->alloc_blocks <<
+                                HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
+                hip->fs_blocks =
+                        hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
                 inode_set_bytes(inode, inode->i_size);
                 count = inode->i_size >> tree->node_size_shift;
                 tree->free_nodes = count - tree->node_count;
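hfs_btree_open() now validates the on-disk header before trusting it: a per-tree max_key_len, the VARIDXKEYS attribute (forbidden for the extent tree, required for the catalog tree), and the BIGKEYS attribute for every tree. A userspace model of that validation switch; the constants are illustrative stand-ins for the HFSPLUS_* values, not the real on-disk numbers:

/* Userspace model of the header validation in hfs_btree_open(): each
 * tree type has a known fixed max_key_len and required attribute bits,
 * and any mismatch rejects the tree as corrupt. */
#include <stdio.h>

#define EXT_CNID        3
#define CAT_CNID        4
#define TREE_BIGKEYS    0x2
#define TREE_VARIDXKEYS 0x4
#define EXT_MAX_KEYLEN  10      /* illustrative */
#define CAT_MAX_KEYLEN  516     /* illustrative */

struct header_stub {
        int cnid;
        unsigned int max_key_len;
        unsigned int attributes;
};

static int validate(const struct header_stub *h)
{
        switch (h->cnid) {
        case EXT_CNID:
                if (h->max_key_len != EXT_MAX_KEYLEN)
                        return -1;
                if (h->attributes & TREE_VARIDXKEYS)    /* fixed keys only */
                        return -1;
                break;
        case CAT_CNID:
                if (h->max_key_len != CAT_MAX_KEYLEN)
                        return -1;
                if (!(h->attributes & TREE_VARIDXKEYS)) /* variable keys */
                        return -1;
                break;
        default:
                return -1;
        }
        return (h->attributes & TREE_BIGKEYS) ? 0 : -1;
}

int main(void)
{
        struct header_stub good = { CAT_CNID, CAT_MAX_KEYLEN,
                                    TREE_BIGKEYS | TREE_VARIDXKEYS };
        struct header_stub bad  = { CAT_CNID, 99, TREE_BIGKEYS };

        printf("good: %d, bad: %d\n", validate(&good), validate(&bad));
        return 0;
}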
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf..8af45fc5b05 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
         key->key_len = cpu_to_be16(6 + ustrlen);
 }
 
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 {
         if (inode->i_flags & S_IMMUTABLE)
                 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
                 perms->rootflags |= HFSPLUS_FLG_APPEND;
         else
                 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-        HFSPLUS_I(inode).rootflags = perms->rootflags;
-        HFSPLUS_I(inode).userflags = perms->userflags;
+
+        perms->userflags = HFSPLUS_I(inode)->userflags;
         perms->mode = cpu_to_be16(inode->i_mode);
         perms->owner = cpu_to_be32(inode->i_uid);
         perms->group = cpu_to_be32(inode->i_gid);
+
+        if (S_ISREG(inode->i_mode))
+                perms->dev = cpu_to_be32(inode->i_nlink);
+        else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+                perms->dev = cpu_to_be32(inode->i_rdev);
+        else
+                perms->dev = 0;
 }
 
 static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
 {
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+
         if (S_ISDIR(inode->i_mode)) {
                 struct hfsplus_cat_folder *folder;
 
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                 memset(folder, 0, sizeof(*folder));
                 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
                 folder->id = cpu_to_be32(inode->i_ino);
-                HFSPLUS_I(inode).create_date =
+                HFSPLUS_I(inode)->create_date =
                         folder->create_date =
                         folder->content_mod_date =
                         folder->attribute_mod_date =
                         folder->access_date = hfsp_now2mt();
-                hfsplus_set_perms(inode, &folder->permissions);
-                if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir)
+                hfsplus_cat_set_perms(inode, &folder->permissions);
+                if (inode == sbi->hidden_dir)
                         /* invisible and namelocked */
                         folder->user_info.frFlags = cpu_to_be16(0x5000);
                 return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                 file->type = cpu_to_be16(HFSPLUS_FILE);
                 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
                 file->id = cpu_to_be32(cnid);
-                HFSPLUS_I(inode).create_date =
+                HFSPLUS_I(inode)->create_date =
                         file->create_date =
                         file->content_mod_date =
                         file->attribute_mod_date =
                         file->access_date = hfsp_now2mt();
                 if (cnid == inode->i_ino) {
-                        hfsplus_set_perms(inode, &file->permissions);
+                        hfsplus_cat_set_perms(inode, &file->permissions);
                         if (S_ISLNK(inode->i_mode)) {
                                 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
                                 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
                         } else {
-                                file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
-                                file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+                                file->user_info.fdType = cpu_to_be32(sbi->type);
+                                file->user_info.fdCreator = cpu_to_be32(sbi->creator);
                         }
                         if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
                                 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                         file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
                         file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
                         file->user_info.fdFlags = cpu_to_be16(0x100);
-                        file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date;
-                        file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+                        file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+                        file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
                 }
                 return sizeof(*file);
         }
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
 
 int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
 {
+        struct super_block *sb = dir->i_sb;
         struct hfs_find_data fd;
-        struct super_block *sb;
         hfsplus_cat_entry entry;
         int entry_size;
         int err;
 
         dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
-        sb = dir->i_sb;
-        hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
         hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
         entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
 
 int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 {
-        struct super_block *sb;
+        struct super_block *sb = dir->i_sb;
         struct hfs_find_data fd;
         struct hfsplus_fork_raw fork;
         struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
         u16 type;
 
         dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
-        sb = dir->i_sb;
-        hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
         if (!str) {
                 int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
                 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
         }
 
-        list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) {
+        list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
                 struct hfsplus_readdir_data *rd =
                         list_entry(pos, struct hfsplus_readdir_data, list);
                 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
                        struct inode *src_dir, struct qstr *src_name,
                        struct inode *dst_dir, struct qstr *dst_name)
 {
-        struct super_block *sb;
+        struct super_block *sb = src_dir->i_sb;
         struct hfs_find_data src_fd, dst_fd;
         hfsplus_cat_entry entry;
         int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
 
         dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
                 dst_dir->i_ino, dst_name->name);
-        sb = src_dir->i_sb;
-        hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
+        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
         dst_fd = src_fd;
 
         /* find the old dir entry and read the data */
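The recurring rewrites of `HFSPLUS_SB(sb).field` to `HFSPLUS_SB(sb)->field` (and `HFSPLUS_I(inode).` to `HFSPLUS_I(inode)->`) throughout this series come from converting those accessors to return pointers to the fs-private structures. A userspace sketch of the container_of-style accessor such pointer helpers are typically built on (all stub types here are hypothetical):

/* Userspace sketch of the pointer-returning accessor pattern: the
 * fs-private inode info is reached through container_of() on the
 * embedded VFS inode, instead of being addressed as a struct value. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode_stub { long long i_size; };

struct hfsplus_inode_info_stub {
        unsigned int linkid;
        struct vfs_inode_stub vfs_inode;   /* VFS inode embedded inside */
};

static struct hfsplus_inode_info_stub *
HFSPLUS_I_stub(struct vfs_inode_stub *inode)
{
        return container_of(inode, struct hfsplus_inode_info_stub, vfs_inode);
}

int main(void)
{
        struct hfsplus_inode_info_stub info = { .linkid = 7 };
        struct vfs_inode_stub *inode = &info.vfs_inode;

        printf("linkid via accessor: %u\n", HFSPLUS_I_stub(inode)->linkid);
        return 0;
}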
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca8..9d59c0571f5 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
 
         dentry->d_op = &hfsplus_dentry_operations;
         dentry->d_fsdata = NULL;
-        hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
         hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
 again:
         err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
                 cnid = be32_to_cpu(entry.file.id);
                 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
                     entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-                    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date ||
-                     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
-                    HFSPLUS_SB(sb).hidden_dir) {
+                    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+                     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
+                    HFSPLUS_SB(sb)->hidden_dir) {
                         struct qstr str;
                         char name[32];
 
@@ -86,7 +86,8 @@ again:
                         linkid = be32_to_cpu(entry.file.permissions.dev);
                         str.len = sprintf(name, "iNode%d", linkid);
                         str.name = name;
-                        hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
+                        hfsplus_cat_build_key(sb, fd.search_key,
+                                HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
                         goto again;
                 }
         } else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
         if (IS_ERR(inode))
                 return ERR_CAST(inode);
         if (S_ISREG(inode->i_mode))
-                HFSPLUS_I(inode).dev = linkid;
+                HFSPLUS_I(inode)->linkid = linkid;
 out:
         d_add(dentry, inode);
         return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
         if (filp->f_pos >= inode->i_size)
                 return 0;
 
-        hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
         hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
         err = hfs_brec_find(&fd);
         if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                         err = -EIO;
                         goto out;
                 }
-                if (HFSPLUS_SB(sb).hidden_dir &&
-                    HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id))
+                if (HFSPLUS_SB(sb)->hidden_dir &&
+                    HFSPLUS_SB(sb)->hidden_dir->i_ino ==
+                                be32_to_cpu(entry.folder.id))
                         goto next;
                 if (filldir(dirent, strbuf, len, filp->f_pos,
                             be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                 }
                 filp->private_data = rd;
                 rd->file = filp;
-                list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list);
+                list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
         }
         memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
 out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
 {
         struct hfsplus_readdir_data *rd = file->private_data;
         if (rd) {
+                mutex_lock(&inode->i_mutex);
                 list_del(&rd->list);
+                mutex_unlock(&inode->i_mutex);
                 kfree(rd);
         }
         return 0;
 }
 
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
-                          struct nameidata *nd)
-{
-        struct inode *inode;
-        int res;
-
-        inode = hfsplus_new_inode(dir->i_sb, mode);
-        if (!inode)
-                return -ENOSPC;
-
-        res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-        if (res) {
-                inode->i_nlink = 0;
-                hfsplus_delete_inode(inode);
-                iput(inode);
-                return res;
-        }
-        hfsplus_instantiate(dentry, inode, inode->i_ino);
-        mark_inode_dirty(inode);
-        return 0;
-}
-
 static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
                         struct dentry *dst_dentry)
 {
-        struct super_block *sb = dst_dir->i_sb;
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
         struct inode *inode = src_dentry->d_inode;
         struct inode *src_dir = src_dentry->d_parent->d_inode;
         struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 
         if (HFSPLUS_IS_RSRC(inode))
                 return -EPERM;
+        if (!S_ISREG(inode->i_mode))
+                return -EPERM;
 
+        mutex_lock(&sbi->vh_mutex);
         if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
                 for (;;) {
                         get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
                         str.len = sprintf(name, "iNode%d", id);
                         res = hfsplus_rename_cat(inode->i_ino,
                                                  src_dir, &src_dentry->d_name,
-                                                 HFSPLUS_SB(sb).hidden_dir, &str);
+                                                 sbi->hidden_dir, &str);
                         if (!res)
                                 break;
                         if (res != -EEXIST)
-                                return res;
+                                goto out;
                 }
-                HFSPLUS_I(inode).dev = id;
-                cnid = HFSPLUS_SB(sb).next_cnid++;
+                HFSPLUS_I(inode)->linkid = id;
+                cnid = sbi->next_cnid++;
                 src_dentry->d_fsdata = (void *)(unsigned long)cnid;
                 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
                 if (res)
                         /* panic? */
-                        return res;
-                HFSPLUS_SB(sb).file_count++;
+                        goto out;
+                sbi->file_count++;
         }
-        cnid = HFSPLUS_SB(sb).next_cnid++;
+        cnid = sbi->next_cnid++;
         res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
         if (res)
-                return res;
+                goto out;
 
         inc_nlink(inode);
         hfsplus_instantiate(dst_dentry, inode, cnid);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
         inode->i_ctime = CURRENT_TIME_SEC;
         mark_inode_dirty(inode);
-        HFSPLUS_SB(sb).file_count++;
-        sb->s_dirt = 1;
-
-        return 0;
+        sbi->file_count++;
+        dst_dir->i_sb->s_dirt = 1;
+out:
+        mutex_unlock(&sbi->vh_mutex);
+        return res;
 }
 
 static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 {
-        struct super_block *sb = dir->i_sb;
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
         struct inode *inode = dentry->d_inode;
         struct qstr str;
         char name[32];
@@ -322,21 +308,24 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
         if (HFSPLUS_IS_RSRC(inode))
                 return -EPERM;
 
+        mutex_lock(&sbi->vh_mutex);
         cnid = (u32)(unsigned long)dentry->d_fsdata;
         if (inode->i_ino == cnid &&
-            atomic_read(&HFSPLUS_I(inode).opencnt)) {
+            atomic_read(&HFSPLUS_I(inode)->opencnt)) {
                 str.name = name;
                 str.len = sprintf(name, "temp%lu", inode->i_ino);
                 res = hfsplus_rename_cat(inode->i_ino,
                                          dir, &dentry->d_name,
-                                         HFSPLUS_SB(sb).hidden_dir, &str);
-                if (!res)
+                                         sbi->hidden_dir, &str);
+                if (!res) {
                         inode->i_flags |= S_DEAD;
-                return res;
+                        drop_nlink(inode);
+                }
+                goto out;
         }
         res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
         if (res)
-                return res;
+                goto out;
 
         if (inode->i_nlink > 0)
                 drop_nlink(inode);
@@ -344,10 +333,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
                 clear_nlink(inode);
         if (!inode->i_nlink) {
                 if (inode->i_ino != cnid) {
-                        HFSPLUS_SB(sb).file_count--;
-                        if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+                        sbi->file_count--;
+                        if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
                                 res = hfsplus_delete_cat(inode->i_ino,
-                                                         HFSPLUS_SB(sb).hidden_dir,
+                                                         sbi->hidden_dir,
                                                          NULL);
                                 if (!res)
                                         hfsplus_delete_inode(inode);
@@ -356,107 +345,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
                 } else
                         hfsplus_delete_inode(inode);
         } else
-                HFSPLUS_SB(sb).file_count--;
+                sbi->file_count--;
         inode->i_ctime = CURRENT_TIME_SEC;
         mark_inode_dirty(inode);
-
+out:
+        mutex_unlock(&sbi->vh_mutex);
         return res;
 }
 
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-        struct inode *inode;
-        int res;
-
-        inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
-        if (!inode)
-                return -ENOSPC;
-
-        res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-        if (res) {
-                inode->i_nlink = 0;
-                hfsplus_delete_inode(inode);
-                iput(inode);
-                return res;
-        }
-        hfsplus_instantiate(dentry, inode, inode->i_ino);
-        mark_inode_dirty(inode);
-        return 0;
-}
-
 static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 {
-        struct inode *inode;
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+        struct inode *inode = dentry->d_inode;
         int res;
 
-        inode = dentry->d_inode;
         if (inode->i_size != 2)
                 return -ENOTEMPTY;
+
+        mutex_lock(&sbi->vh_mutex);
         res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
         if (res)
-                return res;
+                goto out;
         clear_nlink(inode);
         inode->i_ctime = CURRENT_TIME_SEC;
         hfsplus_delete_inode(inode);
         mark_inode_dirty(inode);
-        return 0;
+out:
        mutex_unlock(&sbi->vh_mutex);
        return res;
 }
 
 static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
                            const char *symname)
 {
-        struct super_block *sb;
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
         struct inode *inode;
-        int res;
+        int res = -ENOSPC;
 
-        sb = dir->i_sb;
-        inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO);
+        mutex_lock(&sbi->vh_mutex);
+        inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
         if (!inode)
-                return -ENOSPC;
+                goto out;
 
         res = page_symlink(inode, symname, strlen(symname) + 1);
-        if (res) {
-                inode->i_nlink = 0;
-                hfsplus_delete_inode(inode);
-                iput(inode);
-                return res;
-        }
+        if (res)
+                goto out_err;
 
-        mark_inode_dirty(inode);
         res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+        if (res)
+                goto out_err;
 
-        if (!res) {
-                hfsplus_instantiate(dentry, inode, inode->i_ino);
-                mark_inode_dirty(inode);
-        }
+        hfsplus_instantiate(dentry, inode, inode->i_ino);
+        mark_inode_dirty(inode);
+        goto out;
 
+out_err:
+        inode->i_nlink = 0;
+        hfsplus_delete_inode(inode);
+        iput(inode);
+out:
+        mutex_unlock(&sbi->vh_mutex);
         return res;
 }
 
 static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
                          int mode, dev_t rdev)
 {
-        struct super_block *sb;
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
         struct inode *inode;
-        int res;
+        int res = -ENOSPC;
 
-        sb = dir->i_sb;
-        inode = hfsplus_new_inode(sb, mode);
+        mutex_lock(&sbi->vh_mutex);
+        inode = hfsplus_new_inode(dir->i_sb, mode);
         if (!inode)
-                return -ENOSPC;
+                goto out;
+
+        if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+                init_special_inode(inode, mode, rdev);
 
         res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
         if (res) {
                 inode->i_nlink = 0;
                 hfsplus_delete_inode(inode);
                 iput(inode);
-                return res;
+                goto out;
         }
-        init_special_inode(inode, mode, rdev);
+
         hfsplus_instantiate(dentry, inode, inode->i_ino);
         mark_inode_dirty(inode);
+out:
+        mutex_unlock(&sbi->vh_mutex);
+        return res;
+}
 
-        return 0;
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
+                          struct nameidata *nd)
+{
+        return hfsplus_mknod(dir, dentry, mode, 0);
+}
+
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+        return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
 }
 
 static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +456,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 
         /* Unlink destination if it already exists */
         if (new_dentry->d_inode) {
-                res = hfsplus_unlink(new_dir, new_dentry);
+                if (S_ISDIR(new_dentry->d_inode->i_mode))
+                        res = hfsplus_rmdir(new_dir, new_dentry);
+                else
+                        res = hfsplus_unlink(new_dir, new_dentry);
                 if (res)
                         return res;
         }
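At the end of the dir.c diff, hfsplus_create() and hfsplus_mkdir() collapse into one-line wrappers around hfsplus_mknod(), which now owns the vh_mutex locking and the error unwinding for every object type. A userspace sketch of that wrapper pattern (the lock and the mode bits are stand-ins; in the kernel diff the mode passed to create already carries S_IFREG from the VFS):

/* Userspace sketch of the consolidation: create and mkdir become thin
 * wrappers around one mknod-style helper that owns the locking. */
#include <pthread.h>
#include <stdio.h>

#define S_IFREG_STUB 0100000
#define S_IFDIR_STUB 0040000

static pthread_mutex_t vh_mutex = PTHREAD_MUTEX_INITIALIZER;

static int do_mknod(const char *name, int mode, int rdev)
{
        int res = 0;

        pthread_mutex_lock(&vh_mutex);  /* one lock site for all callers */
        printf("mknod %s mode=%o rdev=%d\n", name, mode, rdev);
        pthread_mutex_unlock(&vh_mutex);
        return res;
}

static int do_create(const char *name, int mode)
{
        return do_mknod(name, mode | S_IFREG_STUB, 0);
}

static int do_mkdir(const char *name, int mode)
{
        return do_mknod(name, mode | S_IFDIR_STUB, 0);
}

int main(void)
{
        do_create("file", 0644);
        do_mkdir("dir", 0755);
        return 0;
}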
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cd..0c9cb1820a5 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
 
 static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
 {
+        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         int res;
 
-        hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start,
-                              HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+        WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+        hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
+                              HFSPLUS_IS_RSRC(inode) ?
+                                HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+
         res = hfs_brec_find(fd);
-        if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) {
+        if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
                 if (res != -ENOENT)
                         return;
-                hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec));
-                HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+                hfs_brec_insert(fd, hip->cached_extents,
+                                sizeof(hfsplus_extent_rec));
+                hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
         } else {
                 if (res)
                         return;
-                hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength);
-                HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+                hfs_bnode_write(fd->bnode, hip->cached_extents,
+                                fd->entryoffset, fd->entrylength);
+                hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
         }
 }
 
-void hfsplus_ext_write_extent(struct inode *inode)
+static void hfsplus_ext_write_extent_locked(struct inode *inode)
 {
-        if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) {
+        if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
                 struct hfs_find_data fd;
 
-                hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+                hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
                 __hfsplus_ext_write_extent(inode, &fd);
                 hfs_find_exit(&fd);
         }
 }
 
+void hfsplus_ext_write_extent(struct inode *inode)
+{
+        mutex_lock(&HFSPLUS_I(inode)->extents_lock);
+        hfsplus_ext_write_extent_locked(inode);
+        mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+}
+
 static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
                                             struct hfsplus_extent *extent,
                                             u32 cnid, u32 block, u8 type)
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 
 static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
 {
+        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         int res;
 
-        if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY)
+        WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+        if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
                 __hfsplus_ext_write_extent(inode, fd);
 
-        res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino,
-                                        block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+        res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
+                                        block, HFSPLUS_IS_RSRC(inode) ?
+                                                HFSPLUS_TYPE_RSRC :
+                                                HFSPLUS_TYPE_DATA);
         if (!res) {
-                HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block);
-                HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents);
+                hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
+                hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
         } else {
-                HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-                HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+                hip->cached_start = hip->cached_blocks = 0;
+                hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
         }
         return res;
 }
 
 static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
157{ 176{
177 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
158 struct hfs_find_data fd; 178 struct hfs_find_data fd;
159 int res; 179 int res;
160 180
161 if (block >= HFSPLUS_I(inode).cached_start && 181 if (block >= hip->cached_start &&
162 block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks) 182 block < hip->cached_start + hip->cached_blocks)
163 return 0; 183 return 0;
164 184
165 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 185 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
166 res = __hfsplus_ext_cache_extent(&fd, inode, block); 186 res = __hfsplus_ext_cache_extent(&fd, inode, block);
167 hfs_find_exit(&fd); 187 hfs_find_exit(&fd);
168 return res; 188 return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
172int hfsplus_get_block(struct inode *inode, sector_t iblock, 192int hfsplus_get_block(struct inode *inode, sector_t iblock,
173 struct buffer_head *bh_result, int create) 193 struct buffer_head *bh_result, int create)
174{ 194{
175 struct super_block *sb; 195 struct super_block *sb = inode->i_sb;
196 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
197 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
176 int res = -EIO; 198 int res = -EIO;
177 u32 ablock, dblock, mask; 199 u32 ablock, dblock, mask;
178 int shift; 200 int shift;
179 201
180 sb = inode->i_sb;
181
182 /* Convert inode block to disk allocation block */ 202 /* Convert inode block to disk allocation block */
183 shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; 203 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
184 ablock = iblock >> HFSPLUS_SB(sb).fs_shift; 204 ablock = iblock >> sbi->fs_shift;
185 205
186 if (iblock >= HFSPLUS_I(inode).fs_blocks) { 206 if (iblock >= hip->fs_blocks) {
187 if (iblock > HFSPLUS_I(inode).fs_blocks || !create) 207 if (iblock > hip->fs_blocks || !create)
188 return -EIO; 208 return -EIO;
189 if (ablock >= HFSPLUS_I(inode).alloc_blocks) { 209 if (ablock >= hip->alloc_blocks) {
190 res = hfsplus_file_extend(inode); 210 res = hfsplus_file_extend(inode);
191 if (res) 211 if (res)
192 return res; 212 return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
194 } else 214 } else
195 create = 0; 215 create = 0;
196 216
197 if (ablock < HFSPLUS_I(inode).first_blocks) { 217 if (ablock < hip->first_blocks) {
198 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); 218 dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
199 goto done; 219 goto done;
200 } 220 }
201 221
202 if (inode->i_ino == HFSPLUS_EXT_CNID) 222 if (inode->i_ino == HFSPLUS_EXT_CNID)
203 return -EIO; 223 return -EIO;
204 224
205 mutex_lock(&HFSPLUS_I(inode).extents_lock); 225 mutex_lock(&hip->extents_lock);
206 res = hfsplus_ext_read_extent(inode, ablock); 226 res = hfsplus_ext_read_extent(inode, ablock);
207 if (!res) { 227 if (!res) {
208 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - 228 dblock = hfsplus_ext_find_block(hip->cached_extents,
209 HFSPLUS_I(inode).cached_start); 229 ablock - hip->cached_start);
210 } else { 230 } else {
211 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 231 mutex_unlock(&hip->extents_lock);
212 return -EIO; 232 return -EIO;
213 } 233 }
214 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 234 mutex_unlock(&hip->extents_lock);
215 235
216done: 236done:
217 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); 237 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
218 mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; 238 mask = (1 << sbi->fs_shift) - 1;
219 map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); 239 map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
220 if (create) { 240 if (create) {
221 set_buffer_new(bh_result); 241 set_buffer_new(bh_result);
222 HFSPLUS_I(inode).phys_size += sb->s_blocksize; 242 hip->phys_size += sb->s_blocksize;
223 HFSPLUS_I(inode).fs_blocks++; 243 hip->fs_blocks++;
224 inode_add_bytes(inode, sb->s_blocksize); 244 inode_add_bytes(inode, sb->s_blocksize);
225 mark_inode_dirty(inode); 245 mark_inode_dirty(inode);
226 } 246 }
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
327 if (total_blocks == blocks) 347 if (total_blocks == blocks)
328 return 0; 348 return 0;
329 349
330 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 350 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
331 do { 351 do {
332 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, 352 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
333 total_blocks, type); 353 total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
348int hfsplus_file_extend(struct inode *inode) 368int hfsplus_file_extend(struct inode *inode)
349{ 369{
350 struct super_block *sb = inode->i_sb; 370 struct super_block *sb = inode->i_sb;
371 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
372 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
351 u32 start, len, goal; 373 u32 start, len, goal;
352 int res; 374 int res;
353 375
354 if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { 376 if (sbi->alloc_file->i_size * 8 <
377 sbi->total_blocks - sbi->free_blocks + 8) {
355 // extend alloc file 378 // extend alloc file
356 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, 379 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
357 HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); 380 sbi->alloc_file->i_size * 8,
381 sbi->total_blocks, sbi->free_blocks);
358 return -ENOSPC; 382 return -ENOSPC;
359 } 383 }
360 384
361 mutex_lock(&HFSPLUS_I(inode).extents_lock); 385 mutex_lock(&hip->extents_lock);
362 if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) 386 if (hip->alloc_blocks == hip->first_blocks)
363 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); 387 goal = hfsplus_ext_lastblock(hip->first_extents);
364 else { 388 else {
365 res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); 389 res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
366 if (res) 390 if (res)
367 goto out; 391 goto out;
368 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); 392 goal = hfsplus_ext_lastblock(hip->cached_extents);
369 } 393 }
370 394
371 len = HFSPLUS_I(inode).clump_blocks; 395 len = hip->clump_blocks;
372 start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); 396 start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
373 if (start >= HFSPLUS_SB(sb).total_blocks) { 397 if (start >= sbi->total_blocks) {
374 start = hfsplus_block_allocate(sb, goal, 0, &len); 398 start = hfsplus_block_allocate(sb, goal, 0, &len);
375 if (start >= goal) { 399 if (start >= goal) {
376 res = -ENOSPC; 400 res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
379 } 403 }
380 404
381 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); 405 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
382 if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { 406
383 if (!HFSPLUS_I(inode).first_blocks) { 407 if (hip->alloc_blocks <= hip->first_blocks) {
408 if (!hip->first_blocks) {
384 dprint(DBG_EXTENT, "first extents\n"); 409 dprint(DBG_EXTENT, "first extents\n");
385 /* no extents yet */ 410 /* no extents yet */
386 HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); 411 hip->first_extents[0].start_block = cpu_to_be32(start);
387 HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); 412 hip->first_extents[0].block_count = cpu_to_be32(len);
388 res = 0; 413 res = 0;
389 } else { 414 } else {
390 /* try to append to extents in inode */ 415 /* try to append to extents in inode */
391 res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, 416 res = hfsplus_add_extent(hip->first_extents,
392 HFSPLUS_I(inode).alloc_blocks, 417 hip->alloc_blocks,
393 start, len); 418 start, len);
394 if (res == -ENOSPC) 419 if (res == -ENOSPC)
395 goto insert_extent; 420 goto insert_extent;
396 } 421 }
397 if (!res) { 422 if (!res) {
398 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 423 hfsplus_dump_extent(hip->first_extents);
399 HFSPLUS_I(inode).first_blocks += len; 424 hip->first_blocks += len;
400 } 425 }
401 } else { 426 } else {
402 res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, 427 res = hfsplus_add_extent(hip->cached_extents,
403 HFSPLUS_I(inode).alloc_blocks - 428 hip->alloc_blocks - hip->cached_start,
404 HFSPLUS_I(inode).cached_start,
405 start, len); 429 start, len);
406 if (!res) { 430 if (!res) {
407 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 431 hfsplus_dump_extent(hip->cached_extents);
408 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 432 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
409 HFSPLUS_I(inode).cached_blocks += len; 433 hip->cached_blocks += len;
410 } else if (res == -ENOSPC) 434 } else if (res == -ENOSPC)
411 goto insert_extent; 435 goto insert_extent;
412 } 436 }
413out: 437out:
414 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 438 mutex_unlock(&hip->extents_lock);
415 if (!res) { 439 if (!res) {
416 HFSPLUS_I(inode).alloc_blocks += len; 440 hip->alloc_blocks += len;
417 mark_inode_dirty(inode); 441 mark_inode_dirty(inode);
418 } 442 }
419 return res; 443 return res;
420 444
421insert_extent: 445insert_extent:
422 dprint(DBG_EXTENT, "insert new extent\n"); 446 dprint(DBG_EXTENT, "insert new extent\n");
423 hfsplus_ext_write_extent(inode); 447 hfsplus_ext_write_extent_locked(inode);
424 448
425 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 449 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
426 HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); 450 hip->cached_extents[0].start_block = cpu_to_be32(start);
427 HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); 451 hip->cached_extents[0].block_count = cpu_to_be32(len);
428 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 452 hfsplus_dump_extent(hip->cached_extents);
429 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; 453 hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
430 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; 454 hip->cached_start = hip->alloc_blocks;
431 HFSPLUS_I(inode).cached_blocks = len; 455 hip->cached_blocks = len;
432 456
433 res = 0; 457 res = 0;
434 goto out; 458 goto out;
@@ -437,13 +461,15 @@ insert_extent:
437void hfsplus_file_truncate(struct inode *inode) 461void hfsplus_file_truncate(struct inode *inode)
438{ 462{
439 struct super_block *sb = inode->i_sb; 463 struct super_block *sb = inode->i_sb;
464 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
440 struct hfs_find_data fd; 465 struct hfs_find_data fd;
441 u32 alloc_cnt, blk_cnt, start; 466 u32 alloc_cnt, blk_cnt, start;
442 int res; 467 int res;
443 468
444 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, 469 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
445 (long long)HFSPLUS_I(inode).phys_size, inode->i_size); 470 inode->i_ino, (long long)hip->phys_size, inode->i_size);
446 if (inode->i_size > HFSPLUS_I(inode).phys_size) { 471
472 if (inode->i_size > hip->phys_size) {
447 struct address_space *mapping = inode->i_mapping; 473 struct address_space *mapping = inode->i_mapping;
448 struct page *page; 474 struct page *page;
449 void *fsdata; 475 void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
460 return; 486 return;
461 mark_inode_dirty(inode); 487 mark_inode_dirty(inode);
462 return; 488 return;
463 } else if (inode->i_size == HFSPLUS_I(inode).phys_size) 489 } else if (inode->i_size == hip->phys_size)
464 return; 490 return;
465 491
466 blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; 492 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
467 alloc_cnt = HFSPLUS_I(inode).alloc_blocks; 493 HFSPLUS_SB(sb)->alloc_blksz_shift;
494 alloc_cnt = hip->alloc_blocks;
468 if (blk_cnt == alloc_cnt) 495 if (blk_cnt == alloc_cnt)
469 goto out; 496 goto out;
470 497
471 mutex_lock(&HFSPLUS_I(inode).extents_lock); 498 mutex_lock(&hip->extents_lock);
472 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 499 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
473 while (1) { 500 while (1) {
474 if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { 501 if (alloc_cnt == hip->first_blocks) {
475 hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, 502 hfsplus_free_extents(sb, hip->first_extents,
476 alloc_cnt, alloc_cnt - blk_cnt); 503 alloc_cnt, alloc_cnt - blk_cnt);
477 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 504 hfsplus_dump_extent(hip->first_extents);
478 HFSPLUS_I(inode).first_blocks = blk_cnt; 505 hip->first_blocks = blk_cnt;
479 break; 506 break;
480 } 507 }
481 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); 508 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
482 if (res) 509 if (res)
483 break; 510 break;
484 start = HFSPLUS_I(inode).cached_start; 511 start = hip->cached_start;
485 hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, 512 hfsplus_free_extents(sb, hip->cached_extents,
486 alloc_cnt - start, alloc_cnt - blk_cnt); 513 alloc_cnt - start, alloc_cnt - blk_cnt);
487 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 514 hfsplus_dump_extent(hip->cached_extents);
488 if (blk_cnt > start) { 515 if (blk_cnt > start) {
489 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 516 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
490 break; 517 break;
491 } 518 }
492 alloc_cnt = start; 519 alloc_cnt = start;
493 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 520 hip->cached_start = hip->cached_blocks = 0;
494 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 521 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
495 hfs_brec_remove(&fd); 522 hfs_brec_remove(&fd);
496 } 523 }
497 hfs_find_exit(&fd); 524 hfs_find_exit(&fd);
498 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 525 mutex_unlock(&hip->extents_lock);
499 526
500 HFSPLUS_I(inode).alloc_blocks = blk_cnt; 527 hip->alloc_blocks = blk_cnt;
501out: 528out:
502 HFSPLUS_I(inode).phys_size = inode->i_size; 529 hip->phys_size = inode->i_size;
503 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 530 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
504 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 531 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
505 mark_inode_dirty(inode); 532 mark_inode_dirty(inode);
506} 533}
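Besides caching HFSPLUS_I(inode) in a local hip pointer (the accessor returns a pointer after the hfsplus_fs.h change below), the extents.c rework makes the extents_lock discipline explicit: hfsplus_ext_write_extent() is the locking entry point, while the new hfsplus_ext_write_extent_locked() serves callers such as hfsplus_file_extend() that already hold the mutex; kernel mutexes are not recursive, so taking the lock again would deadlock. The internal helpers document the precondition with WARN_ON(!mutex_is_locked(...)). A pthread sketch of the wrapper/_locked split, names illustrative:

#include <pthread.h>

static pthread_mutex_t extents_lock = PTHREAD_MUTEX_INITIALIZER;
static int extent_dirty;

/* Precondition: extents_lock held by the caller. */
static void write_extent_locked(void)
{
	if (extent_dirty)
		extent_dirty = 0;	/* flush the cached extent record */
}

/* Public entry point: takes and releases the lock itself. */
static void write_extent(void)
{
	pthread_mutex_lock(&extents_lock);
	write_extent_locked();
	pthread_mutex_unlock(&extents_lock);
}

/* A caller already holding the lock must use the _locked variant. */
static void file_extend(void)
{
	pthread_mutex_lock(&extents_lock);
	/* ... in-place append failed, start a fresh extent record ... */
	write_extent_locked();
	pthread_mutex_unlock(&extents_lock);
}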
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 6505c30ad96..cb3653efb57 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
62 unsigned int depth; 62 unsigned int depth;
63 63
64 //unsigned int map1_size, map_size; 64 //unsigned int map1_size, map_size;
65 struct semaphore tree_lock; 65 struct mutex tree_lock;
66 66
67 unsigned int pages_per_bnode; 67 unsigned int pages_per_bnode;
68 spinlock_t hash_lock; 68 spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
121 u32 sect_count; 121 u32 sect_count;
122 int fs_shift; 122 int fs_shift;
123 123
124 /* Stuff in host order from Vol Header */ 124 /* immutable data from the volume header */
125 u32 alloc_blksz; 125 u32 alloc_blksz;
126 int alloc_blksz_shift; 126 int alloc_blksz_shift;
127 u32 total_blocks; 127 u32 total_blocks;
128 u32 data_clump_blocks, rsrc_clump_blocks;
129
130 /* mutable data from the volume header, protected by alloc_mutex */
128 u32 free_blocks; 131 u32 free_blocks;
129 u32 next_alloc; 132 struct mutex alloc_mutex;
133
134 /* mutable data from the volume header, protected by vh_mutex */
130 u32 next_cnid; 135 u32 next_cnid;
131 u32 file_count; 136 u32 file_count;
132 u32 folder_count; 137 u32 folder_count;
133 u32 data_clump_blocks, rsrc_clump_blocks; 138 struct mutex vh_mutex;
134 139
135 /* Config options */ 140 /* Config options */
136 u32 creator; 141 u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
143 int part, session; 148 int part, session;
144 149
145 unsigned long flags; 150 unsigned long flags;
146
147 struct hlist_head rsrc_inodes;
148}; 151};
149 152
150#define HFSPLUS_SB_WRITEBACKUP 0x0001 153#define HFSPLUS_SB_WRITEBACKUP 0
151#define HFSPLUS_SB_NODECOMPOSE 0x0002 154#define HFSPLUS_SB_NODECOMPOSE 1
152#define HFSPLUS_SB_FORCE 0x0004 155#define HFSPLUS_SB_FORCE 2
153#define HFSPLUS_SB_HFSX 0x0008 156#define HFSPLUS_SB_HFSX 3
154#define HFSPLUS_SB_CASEFOLD 0x0010 157#define HFSPLUS_SB_CASEFOLD 4
155 158
156 159
157struct hfsplus_inode_info { 160struct hfsplus_inode_info {
158 struct mutex extents_lock;
159 u32 clump_blocks, alloc_blocks;
160 sector_t fs_blocks;
161 /* Allocation extents from catalog record or volume header */
162 hfsplus_extent_rec first_extents;
163 u32 first_blocks;
164 hfsplus_extent_rec cached_extents;
165 u32 cached_start, cached_blocks;
166 atomic_t opencnt; 161 atomic_t opencnt;
167 162
168 struct inode *rsrc_inode; 163 /*
164 * Extent allocation information, protected by extents_lock.
165 */
166 u32 first_blocks;
167 u32 clump_blocks;
168 u32 alloc_blocks;
169 u32 cached_start;
170 u32 cached_blocks;
171 hfsplus_extent_rec first_extents;
172 hfsplus_extent_rec cached_extents;
169 unsigned long flags; 173 unsigned long flags;
174 struct mutex extents_lock;
170 175
176 /*
177 * Immutable data.
178 */
179 struct inode *rsrc_inode;
171 __be32 create_date; 180 __be32 create_date;
172 /* Device number in hfsplus_permissions in catalog */
173 u32 dev;
174 /* BSD system and user file flags */
175 u8 rootflags;
176 u8 userflags;
177 181
182 /*
183 * Protected by sbi->vh_mutex.
184 */
185 u32 linkid;
186
187 /*
188 * Protected by i_mutex.
189 */
190 sector_t fs_blocks;
191 u8 userflags; /* BSD user file flags */
178 struct list_head open_dir_list; 192 struct list_head open_dir_list;
179 loff_t phys_size; 193 loff_t phys_size;
194
180 struct inode vfs_inode; 195 struct inode vfs_inode;
181}; 196};
182 197
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
184#define HFSPLUS_FLG_EXT_DIRTY 0x0002 199#define HFSPLUS_FLG_EXT_DIRTY 0x0002
185#define HFSPLUS_FLG_EXT_NEW 0x0004 200#define HFSPLUS_FLG_EXT_NEW 0x0004
186 201
187#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) 202#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
188#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) 203#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
189 204
190struct hfs_find_data { 205struct hfs_find_data {
191 /* filled by caller */ 206 /* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
311int hfsplus_delete_cat(u32, struct inode *, struct qstr *); 326int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
312int hfsplus_rename_cat(u32, struct inode *, struct qstr *, 327int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
313 struct inode *, struct qstr *); 328 struct inode *, struct qstr *);
329void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
314 330
315/* dir.c */ 331/* dir.c */
316extern const struct inode_operations hfsplus_dir_inode_operations; 332extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -351,6 +367,7 @@ int hfsplus_show_options(struct seq_file *, struct vfsmount *);
351 367
352/* super.c */ 368/* super.c */
353struct inode *hfsplus_iget(struct super_block *, unsigned long); 369struct inode *hfsplus_iget(struct super_block *, unsigned long);
370int hfsplus_sync_fs(struct super_block *sb, int wait);
354 371
355/* tables.c */ 372/* tables.c */
356extern u16 hfsplus_case_fold_table[]; 373extern u16 hfsplus_case_fold_table[];
@@ -371,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
371int hfs_part_find(struct super_block *, sector_t *, sector_t *); 388int hfs_part_find(struct super_block *, sector_t *, sector_t *);
372 389
373/* access macros */ 390/* access macros */
374/*
375static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) 391static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
376{ 392{
377 return sb->s_fs_info; 393 return sb->s_fs_info;
378} 394}
395
379static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) 396static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
380{ 397{
381 return list_entry(inode, struct hfsplus_inode_info, vfs_inode); 398 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
382} 399}
383*/
384#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info)
385#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
386
387#if 1
388#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); })
389#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; })
390#else
391#define hfsplus_kmap(p) kmap(p)
392#define hfsplus_kunmap(p) kunmap(p)
393#endif
394 400
395#define sb_bread512(sb, sec, data) ({ \ 401#define sb_bread512(sb, sec, data) ({ \
396 struct buffer_head *__bh; \ 402 struct buffer_head *__bh; \
@@ -418,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
418#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) 424#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
419#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) 425#define hfsp_now2mt() __hfsp_ut2mt(get_seconds())
420 426
421#define kdev_t_to_nr(x) (x)
422
423#endif 427#endif
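Two conventions change in hfsplus_fs.h. First, the HFSPLUS_SB()/HFSPLUS_I() accessors go from value-returning macros back to inline functions that return pointers, which is why every HFSPLUS_SB(sb).field elsewhere in the diff becomes HFSPLUS_SB(sb)->field. Second, the superblock flags turn from mask values (0x0001, 0x0002, ...) into bit numbers (0, 1, ...), the form the kernel's atomic set_bit()/clear_bit()/test_bit() helpers expect; options.c below is converted accordingly. A plain-C illustration of the mask-versus-bit-number distinction (non-atomic stand-ins, not the kernel bitops):

#include <assert.h>

#define SB_NODECOMPOSE_MASK	0x0002	/* old style: a mask value   */
#define SB_NODECOMPOSE_BIT	1	/* new style: a bit *number* */

static void set_bit_nr(int nr, unsigned long *addr)
{
	*addr |= 1UL << nr;	/* the kernel's set_bit() is atomic; this is not */
}

static int test_bit_nr(int nr, const unsigned long *addr)
{
	return (*addr >> nr) & 1;
}

int main(void)
{
	unsigned long flags = 0;

	set_bit_nr(SB_NODECOMPOSE_BIT, &flags);
	assert(flags == SB_NODECOMPOSE_MASK);	/* bit 1 == mask 0x0002 */
	assert(test_bit_nr(SB_NODECOMPOSE_BIT, &flags));
	return 0;
}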
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61..6892899fd6f 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
200 struct hfsplus_unistr name; 200 struct hfsplus_unistr name;
201} __packed; 201} __packed;
202 202
203#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key))
203 204
204/* Structs from hfs.h */ 205/* Structs from hfs.h */
205struct hfsp_point { 206struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
323 __be32 start_block; 324 __be32 start_block;
324} __packed; 325} __packed;
325 326
326#define HFSPLUS_EXT_KEYLEN 12 327#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key)
327 328
328/* HFS+ generic BTree key */ 329/* HFS+ generic BTree key */
329typedef union { 330typedef union {
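The new HFSPLUS_CAT_KEYLEN and the reworked HFSPLUS_EXT_KEYLEN both lean on sizeof, which is safe only because these key structs are declared __packed: the in-memory layout then matches the on-disk key byte for byte, so sizeof yields the key length. A reduced, hypothetical key struct showing the same idea with a compile-time check:

#include <stdint.h>

/* Hypothetical packed on-disk key: 2 + 1 + 1 + 4 + 4 = 12 bytes. */
struct ext_key {
	uint16_t key_len;
	uint8_t  fork_type;
	uint8_t  pad;
	uint32_t cnid;
	uint32_t start_block;
} __attribute__((packed));

_Static_assert(sizeof(struct ext_key) == 12, "on-disk key layout");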
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 9bbb82924a2..8afd7e84f98 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -31,10 +31,19 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
31 loff_t pos, unsigned len, unsigned flags, 31 loff_t pos, unsigned len, unsigned flags,
32 struct page **pagep, void **fsdata) 32 struct page **pagep, void **fsdata)
33{ 33{
34 int ret;
35
34 *pagep = NULL; 36 *pagep = NULL;
35 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
36 hfsplus_get_block, 38 hfsplus_get_block,
37 &HFSPLUS_I(mapping->host).phys_size); 39 &HFSPLUS_I(mapping->host)->phys_size);
40 if (unlikely(ret)) {
41 loff_t isize = mapping->host->i_size;
42 if (pos + len > isize)
43 vmtruncate(mapping->host, isize);
44 }
45
46 return ret;
38} 47}
39 48
40static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) 49static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
@@ -53,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
53 62
54 switch (inode->i_ino) { 63 switch (inode->i_ino) {
55 case HFSPLUS_EXT_CNID: 64 case HFSPLUS_EXT_CNID:
56 tree = HFSPLUS_SB(sb).ext_tree; 65 tree = HFSPLUS_SB(sb)->ext_tree;
57 break; 66 break;
58 case HFSPLUS_CAT_CNID: 67 case HFSPLUS_CAT_CNID:
59 tree = HFSPLUS_SB(sb).cat_tree; 68 tree = HFSPLUS_SB(sb)->cat_tree;
60 break; 69 break;
61 case HFSPLUS_ATTR_CNID: 70 case HFSPLUS_ATTR_CNID:
62 tree = HFSPLUS_SB(sb).attr_tree; 71 tree = HFSPLUS_SB(sb)->attr_tree;
63 break; 72 break;
64 default: 73 default:
65 BUG(); 74 BUG();
@@ -105,9 +114,24 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
105{ 114{
106 struct file *file = iocb->ki_filp; 115 struct file *file = iocb->ki_filp;
107 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 116 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
117 ssize_t ret;
108 118
109 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 119 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
110 offset, nr_segs, hfsplus_get_block, NULL); 120 offset, nr_segs, hfsplus_get_block, NULL);
121
122 /*
 123 * If an extending write fails, it may have instantiated a few
 124 * blocks outside i_size. Trim these off again.
125 */
126 if (unlikely((rw & WRITE) && ret < 0)) {
127 loff_t isize = i_size_read(inode);
128 loff_t end = offset + iov_length(iov, nr_segs);
129
130 if (end > isize)
131 vmtruncate(inode, isize);
132 }
133
134 return ret;
111} 135}
112 136
113static int hfsplus_writepages(struct address_space *mapping, 137static int hfsplus_writepages(struct address_space *mapping,
@@ -148,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
148 struct hfs_find_data fd; 172 struct hfs_find_data fd;
149 struct super_block *sb = dir->i_sb; 173 struct super_block *sb = dir->i_sb;
150 struct inode *inode = NULL; 174 struct inode *inode = NULL;
175 struct hfsplus_inode_info *hip;
151 int err; 176 int err;
152 177
153 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 178 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
154 goto out; 179 goto out;
155 180
156 inode = HFSPLUS_I(dir).rsrc_inode; 181 inode = HFSPLUS_I(dir)->rsrc_inode;
157 if (inode) 182 if (inode)
158 goto out; 183 goto out;
159 184
@@ -161,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
161 if (!inode) 186 if (!inode)
162 return ERR_PTR(-ENOMEM); 187 return ERR_PTR(-ENOMEM);
163 188
189 hip = HFSPLUS_I(inode);
164 inode->i_ino = dir->i_ino; 190 inode->i_ino = dir->i_ino;
165 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 191 INIT_LIST_HEAD(&hip->open_dir_list);
166 mutex_init(&HFSPLUS_I(inode).extents_lock); 192 mutex_init(&hip->extents_lock);
167 HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; 193 hip->flags = HFSPLUS_FLG_RSRC;
168 194
169 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 195 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
170 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 196 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
171 if (!err) 197 if (!err)
172 err = hfsplus_cat_read_inode(inode, &fd); 198 err = hfsplus_cat_read_inode(inode, &fd);
@@ -175,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
175 iput(inode); 201 iput(inode);
176 return ERR_PTR(err); 202 return ERR_PTR(err);
177 } 203 }
178 HFSPLUS_I(inode).rsrc_inode = dir; 204 hip->rsrc_inode = dir;
179 HFSPLUS_I(dir).rsrc_inode = inode; 205 HFSPLUS_I(dir)->rsrc_inode = inode;
180 igrab(dir); 206 igrab(dir);
181 hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); 207
208 /*
209 * __mark_inode_dirty expects inodes to be hashed. Since we don't
210 * want resource fork inodes in the regular inode space, we make them
211 * appear hashed, but do not put on any lists. hlist_del()
212 * will work fine and require no locking.
213 */
214 hlist_add_fake(&inode->i_hash);
215
182 mark_inode_dirty(inode); 216 mark_inode_dirty(inode);
183out: 217out:
184 d_add(dentry, inode); 218 d_add(dentry, inode);
@@ -187,30 +221,27 @@ out:
187 221
188static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) 222static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
189{ 223{
190 struct super_block *sb = inode->i_sb; 224 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
191 u16 mode; 225 u16 mode;
192 226
193 mode = be16_to_cpu(perms->mode); 227 mode = be16_to_cpu(perms->mode);
194 228
195 inode->i_uid = be32_to_cpu(perms->owner); 229 inode->i_uid = be32_to_cpu(perms->owner);
196 if (!inode->i_uid && !mode) 230 if (!inode->i_uid && !mode)
197 inode->i_uid = HFSPLUS_SB(sb).uid; 231 inode->i_uid = sbi->uid;
198 232
199 inode->i_gid = be32_to_cpu(perms->group); 233 inode->i_gid = be32_to_cpu(perms->group);
200 if (!inode->i_gid && !mode) 234 if (!inode->i_gid && !mode)
201 inode->i_gid = HFSPLUS_SB(sb).gid; 235 inode->i_gid = sbi->gid;
202 236
203 if (dir) { 237 if (dir) {
204 mode = mode ? (mode & S_IALLUGO) : 238 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
205 (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
206 mode |= S_IFDIR; 239 mode |= S_IFDIR;
207 } else if (!mode) 240 } else if (!mode)
208 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & 241 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
209 ~(HFSPLUS_SB(sb).umask));
210 inode->i_mode = mode; 242 inode->i_mode = mode;
211 243
212 HFSPLUS_I(inode).rootflags = perms->rootflags; 244 HFSPLUS_I(inode)->userflags = perms->userflags;
213 HFSPLUS_I(inode).userflags = perms->userflags;
214 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) 245 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
215 inode->i_flags |= S_IMMUTABLE; 246 inode->i_flags |= S_IMMUTABLE;
216 else 247 else
@@ -221,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
221 inode->i_flags &= ~S_APPEND; 252 inode->i_flags &= ~S_APPEND;
222} 253}
223 254
224static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
225{
226 if (inode->i_flags & S_IMMUTABLE)
227 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
228 else
229 perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
230 if (inode->i_flags & S_APPEND)
231 perms->rootflags |= HFSPLUS_FLG_APPEND;
232 else
233 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
234 perms->userflags = HFSPLUS_I(inode).userflags;
235 perms->mode = cpu_to_be16(inode->i_mode);
236 perms->owner = cpu_to_be32(inode->i_uid);
237 perms->group = cpu_to_be32(inode->i_gid);
238 perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
239}
240
241static int hfsplus_file_open(struct inode *inode, struct file *file) 255static int hfsplus_file_open(struct inode *inode, struct file *file)
242{ 256{
243 if (HFSPLUS_IS_RSRC(inode)) 257 if (HFSPLUS_IS_RSRC(inode))
244 inode = HFSPLUS_I(inode).rsrc_inode; 258 inode = HFSPLUS_I(inode)->rsrc_inode;
245 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 259 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
246 return -EOVERFLOW; 260 return -EOVERFLOW;
247 atomic_inc(&HFSPLUS_I(inode).opencnt); 261 atomic_inc(&HFSPLUS_I(inode)->opencnt);
248 return 0; 262 return 0;
249} 263}
250 264
@@ -253,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
253 struct super_block *sb = inode->i_sb; 267 struct super_block *sb = inode->i_sb;
254 268
255 if (HFSPLUS_IS_RSRC(inode)) 269 if (HFSPLUS_IS_RSRC(inode))
256 inode = HFSPLUS_I(inode).rsrc_inode; 270 inode = HFSPLUS_I(inode)->rsrc_inode;
257 if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { 271 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
258 mutex_lock(&inode->i_mutex); 272 mutex_lock(&inode->i_mutex);
259 hfsplus_file_truncate(inode); 273 hfsplus_file_truncate(inode);
260 if (inode->i_flags & S_DEAD) { 274 if (inode->i_flags & S_DEAD) {
261 hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); 275 hfsplus_delete_cat(inode->i_ino,
276 HFSPLUS_SB(sb)->hidden_dir, NULL);
262 hfsplus_delete_inode(inode); 277 hfsplus_delete_inode(inode);
263 } 278 }
264 mutex_unlock(&inode->i_mutex); 279 mutex_unlock(&inode->i_mutex);
@@ -266,9 +281,56 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
266 return 0; 281 return 0;
267} 282}
268 283
284static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
285{
286 struct inode *inode = dentry->d_inode;
287 int error;
288
289 error = inode_change_ok(inode, attr);
290 if (error)
291 return error;
292
293 if ((attr->ia_valid & ATTR_SIZE) &&
294 attr->ia_size != i_size_read(inode)) {
295 error = vmtruncate(inode, attr->ia_size);
296 if (error)
297 return error;
298 }
299
300 setattr_copy(inode, attr);
301 mark_inode_dirty(inode);
302 return 0;
303}
304
305static int hfsplus_file_fsync(struct file *filp, int datasync)
306{
307 struct inode *inode = filp->f_mapping->host;
 308 struct super_block *sb;
309 int ret, err;
310
311 /* sync the inode to buffers */
312 ret = write_inode_now(inode, 0);
313
314 /* sync the superblock to buffers */
315 sb = inode->i_sb;
316 if (sb->s_dirt) {
317 if (!(sb->s_flags & MS_RDONLY))
318 hfsplus_sync_fs(sb, 1);
319 else
320 sb->s_dirt = 0;
321 }
322
323 /* .. finally sync the buffers to disk */
324 err = sync_blockdev(sb->s_bdev);
325 if (!ret)
326 ret = err;
327 return ret;
328}
329
269static const struct inode_operations hfsplus_file_inode_operations = { 330static const struct inode_operations hfsplus_file_inode_operations = {
270 .lookup = hfsplus_file_lookup, 331 .lookup = hfsplus_file_lookup,
271 .truncate = hfsplus_file_truncate, 332 .truncate = hfsplus_file_truncate,
333 .setattr = hfsplus_setattr,
272 .setxattr = hfsplus_setxattr, 334 .setxattr = hfsplus_setxattr,
273 .getxattr = hfsplus_getxattr, 335 .getxattr = hfsplus_getxattr,
274 .listxattr = hfsplus_listxattr, 336 .listxattr = hfsplus_listxattr,
@@ -282,7 +344,7 @@ static const struct file_operations hfsplus_file_operations = {
282 .aio_write = generic_file_aio_write, 344 .aio_write = generic_file_aio_write,
283 .mmap = generic_file_mmap, 345 .mmap = generic_file_mmap,
284 .splice_read = generic_file_splice_read, 346 .splice_read = generic_file_splice_read,
285 .fsync = file_fsync, 347 .fsync = hfsplus_file_fsync,
286 .open = hfsplus_file_open, 348 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 349 .release = hfsplus_file_release,
288 .unlocked_ioctl = hfsplus_ioctl, 350 .unlocked_ioctl = hfsplus_ioctl,
@@ -290,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
290 352
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 353struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
292{ 354{
355 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
293 struct inode *inode = new_inode(sb); 356 struct inode *inode = new_inode(sb);
357 struct hfsplus_inode_info *hip;
358
294 if (!inode) 359 if (!inode)
295 return NULL; 360 return NULL;
296 361
297 inode->i_ino = HFSPLUS_SB(sb).next_cnid++; 362 inode->i_ino = sbi->next_cnid++;
298 inode->i_mode = mode; 363 inode->i_mode = mode;
299 inode->i_uid = current_fsuid(); 364 inode->i_uid = current_fsuid();
300 inode->i_gid = current_fsgid(); 365 inode->i_gid = current_fsgid();
301 inode->i_nlink = 1; 366 inode->i_nlink = 1;
302 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 367 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
303 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 368
304 mutex_init(&HFSPLUS_I(inode).extents_lock); 369 hip = HFSPLUS_I(inode);
305 atomic_set(&HFSPLUS_I(inode).opencnt, 0); 370 INIT_LIST_HEAD(&hip->open_dir_list);
306 HFSPLUS_I(inode).flags = 0; 371 mutex_init(&hip->extents_lock);
307 memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); 372 atomic_set(&hip->opencnt, 0);
308 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 373 hip->flags = 0;
309 HFSPLUS_I(inode).alloc_blocks = 0; 374 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
310 HFSPLUS_I(inode).first_blocks = 0; 375 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
311 HFSPLUS_I(inode).cached_start = 0; 376 hip->alloc_blocks = 0;
312 HFSPLUS_I(inode).cached_blocks = 0; 377 hip->first_blocks = 0;
313 HFSPLUS_I(inode).phys_size = 0; 378 hip->cached_start = 0;
314 HFSPLUS_I(inode).fs_blocks = 0; 379 hip->cached_blocks = 0;
315 HFSPLUS_I(inode).rsrc_inode = NULL; 380 hip->phys_size = 0;
381 hip->fs_blocks = 0;
382 hip->rsrc_inode = NULL;
316 if (S_ISDIR(inode->i_mode)) { 383 if (S_ISDIR(inode->i_mode)) {
317 inode->i_size = 2; 384 inode->i_size = 2;
318 HFSPLUS_SB(sb).folder_count++; 385 sbi->folder_count++;
319 inode->i_op = &hfsplus_dir_inode_operations; 386 inode->i_op = &hfsplus_dir_inode_operations;
320 inode->i_fop = &hfsplus_dir_operations; 387 inode->i_fop = &hfsplus_dir_operations;
321 } else if (S_ISREG(inode->i_mode)) { 388 } else if (S_ISREG(inode->i_mode)) {
322 HFSPLUS_SB(sb).file_count++; 389 sbi->file_count++;
323 inode->i_op = &hfsplus_file_inode_operations; 390 inode->i_op = &hfsplus_file_inode_operations;
324 inode->i_fop = &hfsplus_file_operations; 391 inode->i_fop = &hfsplus_file_operations;
325 inode->i_mapping->a_ops = &hfsplus_aops; 392 inode->i_mapping->a_ops = &hfsplus_aops;
326 HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; 393 hip->clump_blocks = sbi->data_clump_blocks;
327 } else if (S_ISLNK(inode->i_mode)) { 394 } else if (S_ISLNK(inode->i_mode)) {
328 HFSPLUS_SB(sb).file_count++; 395 sbi->file_count++;
329 inode->i_op = &page_symlink_inode_operations; 396 inode->i_op = &page_symlink_inode_operations;
330 inode->i_mapping->a_ops = &hfsplus_aops; 397 inode->i_mapping->a_ops = &hfsplus_aops;
331 HFSPLUS_I(inode).clump_blocks = 1; 398 hip->clump_blocks = 1;
332 } else 399 } else
333 HFSPLUS_SB(sb).file_count++; 400 sbi->file_count++;
334 insert_inode_hash(inode); 401 insert_inode_hash(inode);
335 mark_inode_dirty(inode); 402 mark_inode_dirty(inode);
336 sb->s_dirt = 1; 403 sb->s_dirt = 1;
@@ -343,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
343 struct super_block *sb = inode->i_sb; 410 struct super_block *sb = inode->i_sb;
344 411
345 if (S_ISDIR(inode->i_mode)) { 412 if (S_ISDIR(inode->i_mode)) {
346 HFSPLUS_SB(sb).folder_count--; 413 HFSPLUS_SB(sb)->folder_count--;
347 sb->s_dirt = 1; 414 sb->s_dirt = 1;
348 return; 415 return;
349 } 416 }
350 HFSPLUS_SB(sb).file_count--; 417 HFSPLUS_SB(sb)->file_count--;
351 if (S_ISREG(inode->i_mode)) { 418 if (S_ISREG(inode->i_mode)) {
352 if (!inode->i_nlink) { 419 if (!inode->i_nlink) {
353 inode->i_size = 0; 420 inode->i_size = 0;
@@ -363,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
363void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 430void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
364{ 431{
365 struct super_block *sb = inode->i_sb; 432 struct super_block *sb = inode->i_sb;
433 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
434 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
366 u32 count; 435 u32 count;
367 int i; 436 int i;
368 437
369 memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, 438 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
370 sizeof(hfsplus_extent_rec));
371 for (count = 0, i = 0; i < 8; i++) 439 for (count = 0, i = 0; i < 8; i++)
372 count += be32_to_cpu(fork->extents[i].block_count); 440 count += be32_to_cpu(fork->extents[i].block_count);
373 HFSPLUS_I(inode).first_blocks = count; 441 hip->first_blocks = count;
374 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 442 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
375 HFSPLUS_I(inode).cached_start = 0; 443 hip->cached_start = 0;
376 HFSPLUS_I(inode).cached_blocks = 0; 444 hip->cached_blocks = 0;
377 445
378 HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); 446 hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
379 inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); 447 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
380 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 448 hip->fs_blocks =
381 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 449 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
382 HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; 450 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
383 if (!HFSPLUS_I(inode).clump_blocks) 451 hip->clump_blocks =
384 HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : 452 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
385 HFSPLUS_SB(sb).data_clump_blocks; 453 if (!hip->clump_blocks) {
454 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
455 sbi->rsrc_clump_blocks :
456 sbi->data_clump_blocks;
457 }
386} 458}
387 459
388void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 460void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
389{ 461{
390 memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, 462 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
391 sizeof(hfsplus_extent_rec)); 463 sizeof(hfsplus_extent_rec));
392 fork->total_size = cpu_to_be64(inode->i_size); 464 fork->total_size = cpu_to_be64(inode->i_size);
393 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); 465 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
394} 466}
395 467
396int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) 468int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -401,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
401 473
402 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 474 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
403 475
404 HFSPLUS_I(inode).dev = 0; 476 HFSPLUS_I(inode)->linkid = 0;
405 if (type == HFSPLUS_FOLDER) { 477 if (type == HFSPLUS_FOLDER) {
406 struct hfsplus_cat_folder *folder = &entry.folder; 478 struct hfsplus_cat_folder *folder = &entry.folder;
407 479
@@ -415,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
415 inode->i_atime = hfsp_mt2ut(folder->access_date); 487 inode->i_atime = hfsp_mt2ut(folder->access_date);
416 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 488 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
417 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 489 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
418 HFSPLUS_I(inode).create_date = folder->create_date; 490 HFSPLUS_I(inode)->create_date = folder->create_date;
419 HFSPLUS_I(inode).fs_blocks = 0; 491 HFSPLUS_I(inode)->fs_blocks = 0;
420 inode->i_op = &hfsplus_dir_inode_operations; 492 inode->i_op = &hfsplus_dir_inode_operations;
421 inode->i_fop = &hfsplus_dir_operations; 493 inode->i_fop = &hfsplus_dir_operations;
422 } else if (type == HFSPLUS_FILE) { 494 } else if (type == HFSPLUS_FILE) {
@@ -447,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
447 inode->i_atime = hfsp_mt2ut(file->access_date); 519 inode->i_atime = hfsp_mt2ut(file->access_date);
448 inode->i_mtime = hfsp_mt2ut(file->content_mod_date); 520 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
449 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); 521 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
450 HFSPLUS_I(inode).create_date = file->create_date; 522 HFSPLUS_I(inode)->create_date = file->create_date;
451 } else { 523 } else {
452 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); 524 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
453 res = -EIO; 525 res = -EIO;
@@ -462,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
462 hfsplus_cat_entry entry; 534 hfsplus_cat_entry entry;
463 535
464 if (HFSPLUS_IS_RSRC(inode)) 536 if (HFSPLUS_IS_RSRC(inode))
465 main_inode = HFSPLUS_I(inode).rsrc_inode; 537 main_inode = HFSPLUS_I(inode)->rsrc_inode;
466 538
467 if (!main_inode->i_nlink) 539 if (!main_inode->i_nlink)
468 return 0; 540 return 0;
469 541
470 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) 542 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
471 /* panic? */ 543 /* panic? */
472 return -EIO; 544 return -EIO;
473 545
@@ -483,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
483 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 555 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
484 sizeof(struct hfsplus_cat_folder)); 556 sizeof(struct hfsplus_cat_folder));
485 /* simple node checks? */ 557 /* simple node checks? */
486 hfsplus_set_perms(inode, &folder->permissions); 558 hfsplus_cat_set_perms(inode, &folder->permissions);
487 folder->access_date = hfsp_ut2mt(inode->i_atime); 559 folder->access_date = hfsp_ut2mt(inode->i_atime);
488 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 560 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
489 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 561 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -505,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
505 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 577 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
506 sizeof(struct hfsplus_cat_file)); 578 sizeof(struct hfsplus_cat_file));
507 hfsplus_inode_write_fork(inode, &file->data_fork); 579 hfsplus_inode_write_fork(inode, &file->data_fork);
508 if (S_ISREG(inode->i_mode)) 580 hfsplus_cat_set_perms(inode, &file->permissions);
509 HFSPLUS_I(inode).dev = inode->i_nlink;
510 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
511 HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
512 hfsplus_set_perms(inode, &file->permissions);
513 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 581 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
514 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 582 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
515 else 583 else
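The inode.c hunks add three pieces of plumbing: hfsplus_setattr() implements the setattr sequence (inode_change_ok, vmtruncate on size changes, setattr_copy, mark_inode_dirty), hfsplus_file_fsync() replaces the generic file_fsync, and both hfsplus_write_begin() and hfsplus_direct_IO() gain the same error handling, since a failed extending write may already have instantiated blocks past i_size that must be trimmed back. A user-space model of that trim-on-failure rule, with ftruncate standing in for vmtruncate:

#include <sys/types.h>
#include <unistd.h>

/*
 * If an extending write fails, truncate the file back to the size it
 * had before the attempt so no blocks survive past the old EOF.
 */
static ssize_t write_and_trim(int fd, const void *buf, size_t len, off_t pos)
{
	off_t isize = lseek(fd, 0, SEEK_END);	/* old file size */
	ssize_t ret = pwrite(fd, buf, len, pos);

	if (ret < 0 && pos + (off_t)len > isize)
		ftruncate(fd, isize);	/* drop blocks past the old size */
	return ret;
}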
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f09902..40a85a3ded6 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 23static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
25{ 24{
26 struct inode *inode = filp->f_path.dentry->d_inode; 25 struct inode *inode = file->f_path.dentry->d_inode;
26 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags = 0;
28
29 if (inode->i_flags & S_IMMUTABLE)
30 flags |= FS_IMMUTABLE_FL;
 31 if (inode->i_flags & S_APPEND)
32 flags |= FS_APPEND_FL;
33 if (hip->userflags & HFSPLUS_FLG_NODUMP)
34 flags |= FS_NODUMP_FL;
35
36 return put_user(flags, user_flags);
37}
38
39static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
40{
41 struct inode *inode = file->f_path.dentry->d_inode;
42 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags; 43 unsigned int flags;
44 int err = 0;
28 45
29 lock_kernel(); 46 err = mnt_want_write(file->f_path.mnt);
30 switch (cmd) { 47 if (err)
31 case HFSPLUS_IOC_EXT2_GETFLAGS: 48 goto out;
32 flags = 0;
33 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
34 flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
35 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
36 flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
37 if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
38 flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
39 return put_user(flags, (int __user *)arg);
40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
41 int err = 0;
42 err = mnt_want_write(filp->f_path.mnt);
43 if (err) {
44 unlock_kernel();
45 return err;
46 }
47 49
48 if (!is_owner_or_cap(inode)) { 50 if (!is_owner_or_cap(inode)) {
49 err = -EACCES; 51 err = -EACCES;
50 goto setflags_out; 52 goto out_drop_write;
51 } 53 }
52 if (get_user(flags, (int __user *)arg)) {
53 err = -EFAULT;
54 goto setflags_out;
55 }
56 if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
57 HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
58 if (!capable(CAP_LINUX_IMMUTABLE)) {
59 err = -EPERM;
60 goto setflags_out;
61 }
62 }
63 54
64 /* don't silently ignore unsupported ext2 flags */ 55 if (get_user(flags, user_flags)) {
65 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { 56 err = -EFAULT;
66 err = -EOPNOTSUPP; 57 goto out_drop_write;
67 goto setflags_out; 58 }
68 } 59
69 if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ 60 mutex_lock(&inode->i_mutex);
70 inode->i_flags |= S_IMMUTABLE; 61
71 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; 62 if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
72 } else { 63 inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
73 inode->i_flags &= ~S_IMMUTABLE; 64 if (!capable(CAP_LINUX_IMMUTABLE)) {
74 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; 65 err = -EPERM;
75 } 66 goto out_unlock_inode;
76 if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
77 inode->i_flags |= S_APPEND;
78 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
79 } else {
80 inode->i_flags &= ~S_APPEND;
81 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
82 } 67 }
83 if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
84 HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
85 else
86 HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
87
88 inode->i_ctime = CURRENT_TIME_SEC;
89 mark_inode_dirty(inode);
90setflags_out:
91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
93 return err;
94 } 68 }
69
70 /* don't silently ignore unsupported ext2 flags */
71 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
72 err = -EOPNOTSUPP;
73 goto out_unlock_inode;
74 }
75
76 if (flags & FS_IMMUTABLE_FL)
77 inode->i_flags |= S_IMMUTABLE;
78 else
79 inode->i_flags &= ~S_IMMUTABLE;
80
81 if (flags & FS_APPEND_FL)
82 inode->i_flags |= S_APPEND;
83 else
84 inode->i_flags &= ~S_APPEND;
85
86 if (flags & FS_NODUMP_FL)
87 hip->userflags |= HFSPLUS_FLG_NODUMP;
88 else
89 hip->userflags &= ~HFSPLUS_FLG_NODUMP;
90
91 inode->i_ctime = CURRENT_TIME_SEC;
92 mark_inode_dirty(inode);
93
94out_unlock_inode:
95 mutex_unlock(&inode->i_mutex);
96out_drop_write:
97 mnt_drop_write(file->f_path.mnt);
98out:
99 return err;
100}
101
102long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
103{
104 void __user *argp = (void __user *)arg;
105
106 switch (cmd) {
107 case HFSPLUS_IOC_EXT2_GETFLAGS:
108 return hfsplus_ioctl_getflags(file, argp);
109 case HFSPLUS_IOC_EXT2_SETFLAGS:
110 return hfsplus_ioctl_setflags(file, argp);
95 default: 111 default:
96 unlock_kernel();
97 return -ENOTTY; 112 return -ENOTTY;
98 } 113 }
99} 114}
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
110 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) 125 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
111 return -EOPNOTSUPP; 126 return -EOPNOTSUPP;
112 127
113 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 128 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
114 if (res) 129 if (res)
115 return res; 130 return res;
116 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 131 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
153 return -EOPNOTSUPP; 168 return -EOPNOTSUPP;
154 169
155 if (size) { 170 if (size) {
156 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 171 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
157 if (res) 172 if (res)
158 return res; 173 return res;
159 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 174 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
177 } else 192 } else
178 res = size ? -ERANGE : 4; 193 res = size ? -ERANGE : 4;
179 } else 194 } else
180 res = -ENODATA; 195 res = -EOPNOTSUPP;
181out: 196out:
182 if (size) 197 if (size)
183 hfs_find_exit(&fd); 198 hfs_find_exit(&fd);
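The ioctl.c rewrite does three things: it drops the big kernel lock (lock_kernel()/unlock_kernel()) the old monolithic handler relied on, splits each command into its own helper dispatched from a small switch, and gives setflags a clear acquire/release order (mnt_want_write() first, then the permission checks, then i_mutex around the flag updates), all unwound in reverse through labelled exits. A skeleton of that goto-unwind shape, with stand-in lock and write-access functions rather than the kernel API:

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

static int  want_write(void) { return 0; }	/* stand-in for mnt_want_write() */
static void drop_write(void) { }		/* stand-in for mnt_drop_write() */

static int setflags(int is_owner, int may_set_immutable, int wants_immutable)
{
	int err;

	err = want_write();
	if (err)
		goto out;

	if (!is_owner) {
		err = -EACCES;
		goto out_drop_write;
	}

	pthread_mutex_lock(&i_mutex);

	if (wants_immutable && !may_set_immutable) {
		err = -EPERM;
		goto out_unlock;
	}

	/* ... apply the new flags to the inode ... */
	err = 0;

out_unlock:
	pthread_mutex_unlock(&i_mutex);
out_drop_write:
	drop_write();
out:
	return err;
}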
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07..f9ab276a4d8 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
143 kfree(p); 143 kfree(p);
144 break; 144 break;
145 case opt_decompose: 145 case opt_decompose:
146 sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; 146 clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
147 break; 147 break;
148 case opt_nodecompose: 148 case opt_nodecompose:
149 sbi->flags |= HFSPLUS_SB_NODECOMPOSE; 149 set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
150 break; 150 break;
151 case opt_force: 151 case opt_force:
152 sbi->flags |= HFSPLUS_SB_FORCE; 152 set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
153 break; 153 break;
154 default: 154 default:
155 return 0; 155 return 0;
@@ -171,7 +171,7 @@ done:
171 171
172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) 172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
173{ 173{
174 struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); 174 struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
175 175
176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE) 176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
184 seq_printf(seq, ",session=%u", sbi->session); 184 seq_printf(seq, ",session=%u", sbi->session);
185 if (sbi->nls) 185 if (sbi->nls)
186 seq_printf(seq, ",nls=%s", sbi->nls->charset); 186 seq_printf(seq, ",nls=%s", sbi->nls->charset);
187 if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) 187 if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
188 seq_printf(seq, ",nodecompose"); 188 seq_printf(seq, ",nodecompose");
189 return 0; 189 return 0;
190} 190}
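
The options.c hunk is part of a tree-wide change of sbi->flags from an open-coded bitmask updated with |= and &= to an unsigned long manipulated with the atomic set_bit()/clear_bit()/test_bit() helpers, so concurrent updaters cannot lose bits to a non-atomic read-modify-write. Below is a userspace analogue of the same idiom using C11 atomics; the HFSPLUS_SB_* bit numbers are assumptions for illustration.

#include <stdatomic.h>
#include <stdio.h>

#define HFSPLUS_SB_NODECOMPOSE	1	/* bit numbers assumed for the demo */
#define HFSPLUS_SB_FORCE	2

static atomic_ulong flags;	/* analogue of sbi->flags */

/* plain "flags |= mask" is a non-atomic RMW; fetch_or cannot lose bits */
static void set_bit_atomic(int nr)   { atomic_fetch_or(&flags, 1UL << nr); }
static void clear_bit_atomic(int nr) { atomic_fetch_and(&flags, ~(1UL << nr)); }
static int  test_bit_atomic(int nr)  { return !!(atomic_load(&flags) & (1UL << nr)); }

int main(void)
{
	set_bit_atomic(HFSPLUS_SB_FORCE);	/* opt_force */
	clear_bit_atomic(HFSPLUS_SB_NODECOMPOSE);	/* opt_decompose */
	printf("force=%d nodecompose=%d\n",
	       test_bit_atomic(HFSPLUS_SB_FORCE),
	       test_bit_atomic(HFSPLUS_SB_NODECOMPOSE));
	return 0;
}
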
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd029..208b16c645c 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
74int hfs_part_find(struct super_block *sb, 74int hfs_part_find(struct super_block *sb,
75 sector_t *part_start, sector_t *part_size) 75 sector_t *part_start, sector_t *part_size)
76{ 76{
77 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
77 struct buffer_head *bh; 78 struct buffer_head *bh;
78 __be16 *data; 79 __be16 *data;
79 int i, size, res; 80 int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
95 for (i = 0; i < size; p++, i++) { 96 for (i = 0; i < size; p++, i++) {
96 if (p->pdStart && p->pdSize && 97 if (p->pdStart && p->pdSize &&
97 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && 98 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
98 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 99 (sbi->part < 0 || sbi->part == i)) {
99 *part_start += be32_to_cpu(p->pdStart); 100 *part_start += be32_to_cpu(p->pdStart);
100 *part_size = be32_to_cpu(p->pdSize); 101 *part_size = be32_to_cpu(p->pdSize);
101 res = 0; 102 res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
111 size = be32_to_cpu(pm->pmMapBlkCnt); 112 size = be32_to_cpu(pm->pmMapBlkCnt);
112 for (i = 0; i < size;) { 113 for (i = 0; i < size;) {
113 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && 114 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
114 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 115 (sbi->part < 0 || sbi->part == i)) {
115 *part_start += be32_to_cpu(pm->pmPyPartStart); 116 *part_start += be32_to_cpu(pm->pmPyPartStart);
116 *part_size = be32_to_cpu(pm->pmPartBlkCnt); 117 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
117 res = 0; 118 res = 0;
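
The sbi-> conversions here and throughout follow from HFSPLUS_SB() changing shape: the old macro dereferenced s_fs_info and yielded the struct itself (dot access), while the new helper returns the pointer (arrow access), and callers now cache it once in a local sbi instead of re-evaluating the macro. A standalone sketch of the two accessor styles, with dummy struct contents:

#include <stdio.h>

struct hfsplus_sb_info { int part; };
struct super_block { void *s_fs_info; };

/* old style: macro yields the struct itself, accessed with '.' */
#define HFSPLUS_SB_OLD(sb)	(*(struct hfsplus_sb_info *)(sb)->s_fs_info)

/* new style: inline helper yields a pointer, accessed with '->' */
static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

int main(void)
{
	struct hfsplus_sb_info info = { .part = 3 };
	struct super_block sb = { .s_fs_info = &info };
	struct hfsplus_sb_info *sbi = HFSPLUS_SB(&sb);	/* cached once */

	printf("%d %d\n", HFSPLUS_SB_OLD(&sb).part, sbi->part);
	return 0;
}
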
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 74b473a8ef9..52cc746d3ba 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/vfs.h> 15#include <linux/vfs.h>
17#include <linux/nls.h> 16#include <linux/nls.h>
18 17
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
21 20
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) 23static int hfsplus_system_read_inode(struct inode *inode)
25{ 24{
26 struct hfs_find_data fd; 25 struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
27 struct hfsplus_vh *vhdr;
28 struct inode *inode;
29 long err = -EIO;
30
31 inode = iget_locked(sb, ino);
32 if (!inode)
33 return ERR_PTR(-ENOMEM);
34 if (!(inode->i_state & I_NEW))
35 return inode;
36 26
37 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 27 switch (inode->i_ino) {
38 mutex_init(&HFSPLUS_I(inode).extents_lock);
39 HFSPLUS_I(inode).flags = 0;
40 HFSPLUS_I(inode).rsrc_inode = NULL;
41 atomic_set(&HFSPLUS_I(inode).opencnt, 0);
42
43 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
44 read_inode:
45 hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
46 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
47 if (!err)
48 err = hfsplus_cat_read_inode(inode, &fd);
49 hfs_find_exit(&fd);
50 if (err)
51 goto bad_inode;
52 goto done;
53 }
54 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
55 switch(inode->i_ino) {
56 case HFSPLUS_ROOT_CNID:
57 goto read_inode;
58 case HFSPLUS_EXT_CNID: 28 case HFSPLUS_EXT_CNID:
59 hfsplus_inode_read_fork(inode, &vhdr->ext_file); 29 hfsplus_inode_read_fork(inode, &vhdr->ext_file);
60 inode->i_mapping->a_ops = &hfsplus_btree_aops; 30 inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,125 +45,156 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
75 inode->i_mapping->a_ops = &hfsplus_btree_aops; 45 inode->i_mapping->a_ops = &hfsplus_btree_aops;
76 break; 46 break;
77 default: 47 default:
78 goto bad_inode; 48 return -EIO;
49 }
50
51 return 0;
52}
53
54struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
55{
56 struct hfs_find_data fd;
57 struct inode *inode;
58 int err;
59
60 inode = iget_locked(sb, ino);
61 if (!inode)
62 return ERR_PTR(-ENOMEM);
63 if (!(inode->i_state & I_NEW))
64 return inode;
65
66 INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
67 mutex_init(&HFSPLUS_I(inode)->extents_lock);
68 HFSPLUS_I(inode)->flags = 0;
69 HFSPLUS_I(inode)->rsrc_inode = NULL;
70 atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
71
72 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
73 inode->i_ino == HFSPLUS_ROOT_CNID) {
74 hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
75 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
76 if (!err)
77 err = hfsplus_cat_read_inode(inode, &fd);
78 hfs_find_exit(&fd);
79 } else {
80 err = hfsplus_system_read_inode(inode);
81 }
82
83 if (err) {
84 iget_failed(inode);
85 return ERR_PTR(err);
79 } 86 }
80 87
81done:
82 unlock_new_inode(inode); 88 unlock_new_inode(inode);
83 return inode; 89 return inode;
84
85bad_inode:
86 iget_failed(inode);
87 return ERR_PTR(err);
88} 90}
89 91
90static int hfsplus_write_inode(struct inode *inode, 92static int hfsplus_system_write_inode(struct inode *inode)
91 struct writeback_control *wbc)
92{ 93{
93 struct hfsplus_vh *vhdr; 94 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
94 int ret = 0; 95 struct hfsplus_vh *vhdr = sbi->s_vhdr;
96 struct hfsplus_fork_raw *fork;
97 struct hfs_btree *tree = NULL;
95 98
96 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
97 hfsplus_ext_write_extent(inode);
98 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
99 return hfsplus_cat_write_inode(inode);
100 }
101 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
102 switch (inode->i_ino) { 99 switch (inode->i_ino) {
103 case HFSPLUS_ROOT_CNID:
104 ret = hfsplus_cat_write_inode(inode);
105 break;
106 case HFSPLUS_EXT_CNID: 100 case HFSPLUS_EXT_CNID:
107 if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) { 101 fork = &vhdr->ext_file;
108 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 102 tree = sbi->ext_tree;
109 inode->i_sb->s_dirt = 1;
110 }
111 hfsplus_inode_write_fork(inode, &vhdr->ext_file);
112 hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
113 break; 103 break;
114 case HFSPLUS_CAT_CNID: 104 case HFSPLUS_CAT_CNID:
115 if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) { 105 fork = &vhdr->cat_file;
116 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 106 tree = sbi->cat_tree;
117 inode->i_sb->s_dirt = 1;
118 }
119 hfsplus_inode_write_fork(inode, &vhdr->cat_file);
120 hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
121 break; 107 break;
122 case HFSPLUS_ALLOC_CNID: 108 case HFSPLUS_ALLOC_CNID:
123 if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) { 109 fork = &vhdr->alloc_file;
124 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
125 inode->i_sb->s_dirt = 1;
126 }
127 hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
128 break; 110 break;
129 case HFSPLUS_START_CNID: 111 case HFSPLUS_START_CNID:
130 if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) { 112 fork = &vhdr->start_file;
131 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
132 inode->i_sb->s_dirt = 1;
133 }
134 hfsplus_inode_write_fork(inode, &vhdr->start_file);
135 break; 113 break;
136 case HFSPLUS_ATTR_CNID: 114 case HFSPLUS_ATTR_CNID:
137 if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) { 115 fork = &vhdr->attr_file;
138 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 116 tree = sbi->attr_tree;
139 inode->i_sb->s_dirt = 1; 117 default:
140 } 118 return -EIO;
141 hfsplus_inode_write_fork(inode, &vhdr->attr_file); 119 }
142 hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree); 120
143 break; 121 if (fork->total_size != cpu_to_be64(inode->i_size)) {
122 set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
123 inode->i_sb->s_dirt = 1;
144 } 124 }
145 return ret; 125 hfsplus_inode_write_fork(inode, fork);
126 if (tree)
127 hfs_btree_write(tree);
128 return 0;
146} 129}
147 130
148static void hfsplus_clear_inode(struct inode *inode) 131static int hfsplus_write_inode(struct inode *inode,
132 struct writeback_control *wbc)
149{ 133{
150 dprint(DBG_INODE, "hfsplus_clear_inode: %lu\n", inode->i_ino); 134 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
135
136 hfsplus_ext_write_extent(inode);
137
138 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
139 inode->i_ino == HFSPLUS_ROOT_CNID)
140 return hfsplus_cat_write_inode(inode);
141 else
142 return hfsplus_system_write_inode(inode);
143}
144
145static void hfsplus_evict_inode(struct inode *inode)
146{
147 dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
148 truncate_inode_pages(&inode->i_data, 0);
149 end_writeback(inode);
151 if (HFSPLUS_IS_RSRC(inode)) { 150 if (HFSPLUS_IS_RSRC(inode)) {
152 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; 151 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
153 iput(HFSPLUS_I(inode).rsrc_inode); 152 iput(HFSPLUS_I(inode)->rsrc_inode);
154 } 153 }
155} 154}
156 155
157static int hfsplus_sync_fs(struct super_block *sb, int wait) 156int hfsplus_sync_fs(struct super_block *sb, int wait)
158{ 157{
159 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 158 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
159 struct hfsplus_vh *vhdr = sbi->s_vhdr;
160 160
161 dprint(DBG_SUPER, "hfsplus_write_super\n"); 161 dprint(DBG_SUPER, "hfsplus_write_super\n");
162 162
163 lock_super(sb); 163 mutex_lock(&sbi->vh_mutex);
164 mutex_lock(&sbi->alloc_mutex);
164 sb->s_dirt = 0; 165 sb->s_dirt = 0;
165 166
166 vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); 167 vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
167 vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); 168 vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
168 vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid); 169 vhdr->folder_count = cpu_to_be32(sbi->folder_count);
169 vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count); 170 vhdr->file_count = cpu_to_be32(sbi->file_count);
170 vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
171 171
172 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 172 mark_buffer_dirty(sbi->s_vhbh);
173 if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) { 173 if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
174 if (HFSPLUS_SB(sb).sect_count) { 174 if (sbi->sect_count) {
175 struct buffer_head *bh; 175 struct buffer_head *bh;
176 u32 block, offset; 176 u32 block, offset;
177 177
178 block = HFSPLUS_SB(sb).blockoffset; 178 block = sbi->blockoffset;
179 block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9); 179 block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
180 offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1); 180 offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
181 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset, 181 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
182 HFSPLUS_SB(sb).sect_count, block, offset); 182 sbi->blockoffset, sbi->sect_count,
183 block, offset);
183 bh = sb_bread(sb, block); 184 bh = sb_bread(sb, block);
184 if (bh) { 185 if (bh) {
185 vhdr = (struct hfsplus_vh *)(bh->b_data + offset); 186 vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
186 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { 187 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
187 memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr)); 188 memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
188 mark_buffer_dirty(bh); 189 mark_buffer_dirty(bh);
189 brelse(bh); 190 brelse(bh);
190 } else 191 } else
191 printk(KERN_WARNING "hfs: backup not found!\n"); 192 printk(KERN_WARNING "hfs: backup not found!\n");
192 } 193 }
193 } 194 }
194 HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
195 } 195 }
196 unlock_super(sb); 196 mutex_unlock(&sbi->alloc_mutex);
197 mutex_unlock(&sbi->vh_mutex);
197 return 0; 198 return 0;
198} 199}
199 200
@@ -207,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
207 208
208static void hfsplus_put_super(struct super_block *sb) 209static void hfsplus_put_super(struct super_block *sb)
209{ 210{
211 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
212
210 dprint(DBG_SUPER, "hfsplus_put_super\n"); 213 dprint(DBG_SUPER, "hfsplus_put_super\n");
214
211 if (!sb->s_fs_info) 215 if (!sb->s_fs_info)
212 return; 216 return;
213 217
214 lock_kernel();
215
216 if (sb->s_dirt) 218 if (sb->s_dirt)
217 hfsplus_write_super(sb); 219 hfsplus_write_super(sb);
218 if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { 220 if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
219 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 221 struct hfsplus_vh *vhdr = sbi->s_vhdr;
220 222
221 vhdr->modify_date = hfsp_now2mt(); 223 vhdr->modify_date = hfsp_now2mt();
222 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); 224 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
223 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); 225 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
224 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 226 mark_buffer_dirty(sbi->s_vhbh);
225 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 227 sync_dirty_buffer(sbi->s_vhbh);
226 } 228 }
227 229
228 hfs_btree_close(HFSPLUS_SB(sb).cat_tree); 230 hfs_btree_close(sbi->cat_tree);
229 hfs_btree_close(HFSPLUS_SB(sb).ext_tree); 231 hfs_btree_close(sbi->ext_tree);
230 iput(HFSPLUS_SB(sb).alloc_file); 232 iput(sbi->alloc_file);
231 iput(HFSPLUS_SB(sb).hidden_dir); 233 iput(sbi->hidden_dir);
232 brelse(HFSPLUS_SB(sb).s_vhbh); 234 brelse(sbi->s_vhbh);
233 unload_nls(HFSPLUS_SB(sb).nls); 235 unload_nls(sbi->nls);
234 kfree(sb->s_fs_info); 236 kfree(sb->s_fs_info);
235 sb->s_fs_info = NULL; 237 sb->s_fs_info = NULL;
236
237 unlock_kernel();
238} 238}
239 239
240static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) 240static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
241{ 241{
242 struct super_block *sb = dentry->d_sb; 242 struct super_block *sb = dentry->d_sb;
243 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
243 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 244 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
244 245
245 buf->f_type = HFSPLUS_SUPER_MAGIC; 246 buf->f_type = HFSPLUS_SUPER_MAGIC;
246 buf->f_bsize = sb->s_blocksize; 247 buf->f_bsize = sb->s_blocksize;
247 buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; 248 buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
248 buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift; 249 buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
249 buf->f_bavail = buf->f_bfree; 250 buf->f_bavail = buf->f_bfree;
250 buf->f_files = 0xFFFFFFFF; 251 buf->f_files = 0xFFFFFFFF;
251 buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; 252 buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
252 buf->f_fsid.val[0] = (u32)id; 253 buf->f_fsid.val[0] = (u32)id;
253 buf->f_fsid.val[1] = (u32)(id >> 32); 254 buf->f_fsid.val[1] = (u32)(id >> 32);
254 buf->f_namelen = HFSPLUS_MAX_STRLEN; 255 buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -261,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
261 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 262 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
262 return 0; 263 return 0;
263 if (!(*flags & MS_RDONLY)) { 264 if (!(*flags & MS_RDONLY)) {
264 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 265 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
265 struct hfsplus_sb_info sbi; 266 struct hfsplus_sb_info sbi;
266 267
267 memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); 268 memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
268 sbi.nls = HFSPLUS_SB(sb).nls; 269 sbi.nls = HFSPLUS_SB(sb)->nls;
269 if (!hfsplus_parse_options(data, &sbi)) 270 if (!hfsplus_parse_options(data, &sbi))
270 return -EINVAL; 271 return -EINVAL;
271 272
@@ -274,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
274 "running fsck.hfsplus is recommended. leaving read-only.\n"); 275 "running fsck.hfsplus is recommended. leaving read-only.\n");
275 sb->s_flags |= MS_RDONLY; 276 sb->s_flags |= MS_RDONLY;
276 *flags |= MS_RDONLY; 277 *flags |= MS_RDONLY;
277 } else if (sbi.flags & HFSPLUS_SB_FORCE) { 278 } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
278 /* nothing */ 279 /* nothing */
279 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 280 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
280 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); 281 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -293,7 +294,7 @@ static const struct super_operations hfsplus_sops = {
293 .alloc_inode = hfsplus_alloc_inode, 294 .alloc_inode = hfsplus_alloc_inode,
294 .destroy_inode = hfsplus_destroy_inode, 295 .destroy_inode = hfsplus_destroy_inode,
295 .write_inode = hfsplus_write_inode, 296 .write_inode = hfsplus_write_inode,
296 .clear_inode = hfsplus_clear_inode, 297 .evict_inode = hfsplus_evict_inode,
297 .put_super = hfsplus_put_super, 298 .put_super = hfsplus_put_super,
298 .write_super = hfsplus_write_super, 299 .write_super = hfsplus_write_super,
299 .sync_fs = hfsplus_sync_fs, 300 .sync_fs = hfsplus_sync_fs,
@@ -318,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
318 return -ENOMEM; 319 return -ENOMEM;
319 320
320 sb->s_fs_info = sbi; 321 sb->s_fs_info = sbi;
321 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 322 mutex_init(&sbi->alloc_mutex);
323 mutex_init(&sbi->vh_mutex);
322 hfsplus_fill_defaults(sbi); 324 hfsplus_fill_defaults(sbi);
323 if (!hfsplus_parse_options(data, sbi)) { 325 if (!hfsplus_parse_options(data, sbi)) {
324 printk(KERN_ERR "hfs: unable to parse mount options\n"); 326 printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -342,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
342 err = -EINVAL; 344 err = -EINVAL;
343 goto cleanup; 345 goto cleanup;
344 } 346 }
345 vhdr = HFSPLUS_SB(sb).s_vhdr; 347 vhdr = sbi->s_vhdr;
346 348
347 /* Copy parts of the volume header into the superblock */ 349 /* Copy parts of the volume header into the superblock */
348 sb->s_magic = HFSPLUS_VOLHEAD_SIG; 350 sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -351,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
351 printk(KERN_ERR "hfs: wrong filesystem version\n"); 353 printk(KERN_ERR "hfs: wrong filesystem version\n");
352 goto cleanup; 354 goto cleanup;
353 } 355 }
354 HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks); 356 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
355 HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks); 357 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
356 HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc); 358 sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
357 HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid); 359 sbi->file_count = be32_to_cpu(vhdr->file_count);
358 HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count); 360 sbi->folder_count = be32_to_cpu(vhdr->folder_count);
359 HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count); 361 sbi->data_clump_blocks =
360 HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 362 be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
361 if (!HFSPLUS_SB(sb).data_clump_blocks) 363 if (!sbi->data_clump_blocks)
362 HFSPLUS_SB(sb).data_clump_blocks = 1; 364 sbi->data_clump_blocks = 1;
363 HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 365 sbi->rsrc_clump_blocks =
364 if (!HFSPLUS_SB(sb).rsrc_clump_blocks) 366 be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
365 HFSPLUS_SB(sb).rsrc_clump_blocks = 1; 367 if (!sbi->rsrc_clump_blocks)
368 sbi->rsrc_clump_blocks = 1;
366 369
367 /* Set up operations so we can load metadata */ 370 /* Set up operations so we can load metadata */
368 sb->s_op = &hfsplus_sops; 371 sb->s_op = &hfsplus_sops;
@@ -372,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
372 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " 375 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
373 "running fsck.hfsplus is recommended. mounting read-only.\n"); 376 "running fsck.hfsplus is recommended. mounting read-only.\n");
374 sb->s_flags |= MS_RDONLY; 377 sb->s_flags |= MS_RDONLY;
375 } else if (sbi->flags & HFSPLUS_SB_FORCE) { 378 } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
376 /* nothing */ 379 /* nothing */
377 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 380 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
378 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); 381 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -382,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
382 "use the force option at your own risk, mounting read-only.\n"); 385 "use the force option at your own risk, mounting read-only.\n");
383 sb->s_flags |= MS_RDONLY; 386 sb->s_flags |= MS_RDONLY;
384 } 387 }
385 sbi->flags &= ~HFSPLUS_SB_FORCE;
386 388
387 /* Load metadata objects (B*Trees) */ 389 /* Load metadata objects (B*Trees) */
388 HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 390 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
389 if (!HFSPLUS_SB(sb).ext_tree) { 391 if (!sbi->ext_tree) {
390 printk(KERN_ERR "hfs: failed to load extents file\n"); 392 printk(KERN_ERR "hfs: failed to load extents file\n");
391 goto cleanup; 393 goto cleanup;
392 } 394 }
393 HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 395 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
394 if (!HFSPLUS_SB(sb).cat_tree) { 396 if (!sbi->cat_tree) {
395 printk(KERN_ERR "hfs: failed to load catalog file\n"); 397 printk(KERN_ERR "hfs: failed to load catalog file\n");
396 goto cleanup; 398 goto cleanup;
397 } 399 }
@@ -402,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
402 err = PTR_ERR(inode); 404 err = PTR_ERR(inode);
403 goto cleanup; 405 goto cleanup;
404 } 406 }
405 HFSPLUS_SB(sb).alloc_file = inode; 407 sbi->alloc_file = inode;
406 408
407 /* Load the root directory */ 409 /* Load the root directory */
408 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); 410 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -421,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
421 423
422 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 424 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
423 str.name = HFSP_HIDDENDIR_NAME; 425 str.name = HFSP_HIDDENDIR_NAME;
424 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 426 hfs_find_init(sbi->cat_tree, &fd);
425 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); 427 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
426 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 428 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
427 hfs_find_exit(&fd); 429 hfs_find_exit(&fd);
@@ -432,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
432 err = PTR_ERR(inode); 434 err = PTR_ERR(inode);
433 goto cleanup; 435 goto cleanup;
434 } 436 }
435 HFSPLUS_SB(sb).hidden_dir = inode; 437 sbi->hidden_dir = inode;
436 } else 438 } else
437 hfs_find_exit(&fd); 439 hfs_find_exit(&fd);
438 440
@@ -447,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
447 be32_add_cpu(&vhdr->write_count, 1); 449 be32_add_cpu(&vhdr->write_count, 1);
448 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 450 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
449 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 451 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
450 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 452 mark_buffer_dirty(sbi->s_vhbh);
451 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 453 sync_dirty_buffer(sbi->s_vhbh);
452 454
453 if (!HFSPLUS_SB(sb).hidden_dir) { 455 if (!sbi->hidden_dir) {
454 printk(KERN_DEBUG "hfs: create hidden dir...\n"); 456 printk(KERN_DEBUG "hfs: create hidden dir...\n");
455 HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR); 457
456 hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode, 458 mutex_lock(&sbi->vh_mutex);
457 &str, HFSPLUS_SB(sb).hidden_dir); 459 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
458 mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir); 460 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
461 &str, sbi->hidden_dir);
462 mutex_unlock(&sbi->vh_mutex);
463
464 mark_inode_dirty(sbi->hidden_dir);
459 } 465 }
460out: 466out:
461 unload_nls(sbi->nls); 467 unload_nls(sbi->nls);
@@ -484,23 +490,21 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
484 490
485static void hfsplus_destroy_inode(struct inode *inode) 491static void hfsplus_destroy_inode(struct inode *inode)
486{ 492{
487 kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode)); 493 kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
488} 494}
489 495
490#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) 496#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
491 497
492static int hfsplus_get_sb(struct file_system_type *fs_type, 498static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
493 int flags, const char *dev_name, void *data, 499 int flags, const char *dev_name, void *data)
494 struct vfsmount *mnt)
495{ 500{
496 return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super, 501 return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
497 mnt);
498} 502}
499 503
500static struct file_system_type hfsplus_fs_type = { 504static struct file_system_type hfsplus_fs_type = {
501 .owner = THIS_MODULE, 505 .owner = THIS_MODULE,
502 .name = "hfsplus", 506 .name = "hfsplus",
503 .get_sb = hfsplus_get_sb, 507 .mount = hfsplus_mount,
504 .kill_sb = kill_block_super, 508 .kill_sb = kill_block_super,
505 .fs_flags = FS_REQUIRES_DEV, 509 .fs_flags = FS_REQUIRES_DEV,
506}; 510};
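
The bottom of super.c converts hfsplus from the old .get_sb entry point, which filled in a caller-provided vfsmount and returned 0 or an errno, to the new .mount entry point, which returns the root dentry (or an ERR_PTR) directly, with mount_bdev() doing the block-device boilerplate. A stubbed side-by-side of the two contracts; the *_sketch functions are hypothetical stand-ins so this compiles on its own, not the kernel symbols:

#include <stdio.h>

struct super_block; struct vfsmount; struct dentry;
struct file_system_type { const char *name; };

typedef int fill_super_t(struct super_block *, void *, int);

/* old contract: fill the pre-allocated vfsmount, return 0 or -errno */
static int get_sb_bdev_sketch(struct file_system_type *t, int flags,
			      const char *dev, void *data,
			      fill_super_t *fill, struct vfsmount *mnt)
{
	(void)t; (void)flags; (void)dev; (void)data; (void)fill; (void)mnt;
	return 0;
}

/* new contract: return the root dentry, or ERR_PTR() on failure */
static struct dentry *mount_bdev_sketch(struct file_system_type *t, int flags,
					const char *dev, void *data,
					fill_super_t *fill)
{
	(void)t; (void)flags; (void)dev; (void)data; (void)fill;
	return NULL;
}

int main(void)
{
	struct file_system_type hfsplus = { "hfsplus" };

	printf("%s: get_sb=%d mount=%p\n", hfsplus.name,
	       get_sb_bdev_sketch(&hfsplus, 0, "/dev/sdb1", NULL, NULL, NULL),
	       (void *)mount_bdev_sketch(&hfsplus, 0, "/dev/sdb1", NULL, NULL));
	return 0;
}
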
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa40..b66d67de882 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
121int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) 121int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
122{ 122{
123 const hfsplus_unichr *ip; 123 const hfsplus_unichr *ip;
124 struct nls_table *nls = HFSPLUS_SB(sb).nls; 124 struct nls_table *nls = HFSPLUS_SB(sb)->nls;
125 u8 *op; 125 u8 *op;
126 u16 cc, c0, c1; 126 u16 cc, c0, c1;
127 u16 *ce1, *ce2; 127 u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
132 ustrlen = be16_to_cpu(ustr->length); 132 ustrlen = be16_to_cpu(ustr->length);
133 len = *len_p; 133 len = *len_p;
134 ce1 = NULL; 134 ce1 = NULL;
135 compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 135 compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
136 136
137 while (ustrlen > 0) { 137 while (ustrlen > 0) {
138 c0 = be16_to_cpu(*ip++); 138 c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
246static inline int asc2unichar(struct super_block *sb, const char *astr, int len, 246static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
247 wchar_t *uc) 247 wchar_t *uc)
248{ 248{
249 int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc); 249 int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
250 if (size <= 0) { 250 if (size <= 0) {
251 *uc = '?'; 251 *uc = '?';
252 size = 1; 252 size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
293 u16 *dstr, outlen = 0; 293 u16 *dstr, outlen = 0;
294 wchar_t c; 294 wchar_t c;
295 295
296 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 296 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { 297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
298 size = asc2unichar(sb, astr, len, &c); 298 size = asc2unichar(sb, astr, len, &c);
299 299
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
330 wchar_t c; 330 wchar_t c;
331 u16 c2; 331 u16 c2;
332 332
333 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 333 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
334 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 334 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
335 hash = init_name_hash(); 335 hash = init_name_hash();
336 astr = str->name; 336 astr = str->name;
337 len = str->len; 337 len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
373 u16 c1, c2; 373 u16 c1, c2;
374 wchar_t c; 374 wchar_t c;
375 375
376 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 376 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
377 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 377 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
378 astr1 = s1->name; 378 astr1 = s1->name;
379 len1 = s1->len; 379 len1 = s1->len;
380 astr2 = s2->name; 380 astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d..8972c20b321 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
65 *start = 0; 65 *start = 0;
66 *size = sb->s_bdev->bd_inode->i_size >> 9; 66 *size = sb->s_bdev->bd_inode->i_size >> 9;
67 67
68 if (HFSPLUS_SB(sb).session >= 0) { 68 if (HFSPLUS_SB(sb)->session >= 0) {
69 te.cdte_track = HFSPLUS_SB(sb).session; 69 te.cdte_track = HFSPLUS_SB(sb)->session;
70 te.cdte_format = CDROM_LBA; 70 te.cdte_format = CDROM_LBA;
71 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); 71 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
72 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { 72 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
87/* Takes in super block, returns true if good data read */ 87/* Takes in super block, returns true if good data read */
88int hfsplus_read_wrapper(struct super_block *sb) 88int hfsplus_read_wrapper(struct super_block *sb)
89{ 89{
90 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
90 struct buffer_head *bh; 91 struct buffer_head *bh;
91 struct hfsplus_vh *vhdr; 92 struct hfsplus_vh *vhdr;
92 struct hfsplus_wd wd; 93 struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
122 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) 123 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
123 break; 124 break;
124 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) { 125 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
125 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX; 126 set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
126 break; 127 break;
127 } 128 }
128 brelse(bh); 129 brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
143 if (blocksize < HFSPLUS_SECTOR_SIZE || 144 if (blocksize < HFSPLUS_SECTOR_SIZE ||
144 ((blocksize - 1) & blocksize)) 145 ((blocksize - 1) & blocksize))
145 return -EINVAL; 146 return -EINVAL;
146 HFSPLUS_SB(sb).alloc_blksz = blocksize; 147 sbi->alloc_blksz = blocksize;
147 HFSPLUS_SB(sb).alloc_blksz_shift = 0; 148 sbi->alloc_blksz_shift = 0;
148 while ((blocksize >>= 1) != 0) 149 while ((blocksize >>= 1) != 0)
149 HFSPLUS_SB(sb).alloc_blksz_shift++; 150 sbi->alloc_blksz_shift++;
150 blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE); 151 blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
151 152
152 /* align block size to block offset */ 153 /* align block size to block offset */
153 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) 154 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
158 return -EINVAL; 159 return -EINVAL;
159 } 160 }
160 161
161 HFSPLUS_SB(sb).blockoffset = part_start >> 162 sbi->blockoffset =
162 (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); 163 part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
163 HFSPLUS_SB(sb).sect_count = part_size; 164 sbi->sect_count = part_size;
164 HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift - 165 sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
165 sb->s_blocksize_bits;
166 166
167 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); 167 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
168 if (!bh) 168 if (!bh)
169 return -EIO; 169 return -EIO;
170 170
171 /* should still be the same... */ 171 /* should still be the same... */
172 if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ? 172 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
173 cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) : 173 if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
174 cpu_to_be16(HFSPLUS_VOLHEAD_SIG))) 174 goto error;
175 goto error; 175 } else {
176 HFSPLUS_SB(sb).s_vhbh = bh; 176 if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
177 HFSPLUS_SB(sb).s_vhdr = vhdr; 177 goto error;
178 }
179
180 sbi->s_vhbh = bh;
181 sbi->s_vhdr = vhdr;
178 182
179 return 0; 183 return 0;
180 error: 184 error:
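
In hfsplus_read_wrapper() above, the while loop computes log2 of the (power-of-two) allocation block size, and fs_shift is its difference from the VFS block size, so a count of allocation blocks converts to fs blocks with a left shift (as statfs does with total_blocks << fs_shift). A standalone demo with made-up sizes:

#include <stdio.h>

int main(void)
{
	unsigned int alloc_blksz = 8192;	/* from the volume header */
	unsigned int s_blocksize_bits = 12;	/* 4096-byte fs blocks */
	unsigned int blocksize = alloc_blksz;
	unsigned int alloc_blksz_shift = 0;

	/* log2 of a power of two, exactly as in the hunk */
	while ((blocksize >>= 1) != 0)
		alloc_blksz_shift++;

	printf("alloc_blksz_shift=%u fs_shift=%u\n",	/* prints 13 and 1 */
	       alloc_blksz_shift, alloc_blksz_shift - s_blocksize_bits);
	return 0;
}
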
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 2f34f8f2134..bf15a43016b 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -28,12 +28,7 @@
28 * #define ATTR_KILL_SUID 2048 28 * #define ATTR_KILL_SUID 2048
29 * #define ATTR_KILL_SGID 4096 29 * #define ATTR_KILL_SGID 4096
30 * 30 *
31 * and this is because they were added in 2.5 development in this patch: 31 * and this is because they were added in 2.5 development.
32 *
33 * http://linux.bkbits.net:8080/linux-2.5/
34 * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html
35 * |src/.|src/include|src/include/linux|related/include/linux/fs.h
36 *
37 * Actually, they are not needed by most ->setattr() methods - they are set by 32 * Actually, they are not needed by most ->setattr() methods - they are set by
38 * callers of notify_change() to notify that the setuid/setgid bits must be 33 * callers of notify_change() to notify that the setuid/setgid bits must be
39 * dropped. 34 * dropped.
@@ -53,18 +48,28 @@ struct hostfs_iattr {
53 struct timespec ia_ctime; 48 struct timespec ia_ctime;
54}; 49};
55 50
56extern int stat_file(const char *path, unsigned long long *inode_out, 51struct hostfs_stat {
57 int *mode_out, int *nlink_out, int *uid_out, int *gid_out, 52 unsigned long long ino;
58 unsigned long long *size_out, struct timespec *atime_out, 53 unsigned int mode;
59 struct timespec *mtime_out, struct timespec *ctime_out, 54 unsigned int nlink;
60 int *blksize_out, unsigned long long *blocks_out, int fd); 55 unsigned int uid;
56 unsigned int gid;
57 unsigned long long size;
58 struct timespec atime, mtime, ctime;
59 unsigned int blksize;
60 unsigned long long blocks;
61 unsigned int maj;
62 unsigned int min;
63};
64
65extern int stat_file(const char *path, struct hostfs_stat *p, int fd);
61extern int access_file(char *path, int r, int w, int x); 66extern int access_file(char *path, int r, int w, int x);
62extern int open_file(char *path, int r, int w, int append); 67extern int open_file(char *path, int r, int w, int append);
63extern int file_type(const char *path, int *maj, int *min);
64extern void *open_dir(char *path, int *err_out); 68extern void *open_dir(char *path, int *err_out);
65extern char *read_dir(void *stream, unsigned long long *pos, 69extern char *read_dir(void *stream, unsigned long long *pos,
66 unsigned long long *ino_out, int *len_out); 70 unsigned long long *ino_out, int *len_out);
67extern void close_file(void *stream); 71extern void close_file(void *stream);
72extern int replace_file(int oldfd, int fd);
68extern void close_dir(void *stream); 73extern void close_dir(void *stream);
69extern int read_file(int fd, unsigned long long *offset, char *buf, int len); 74extern int read_file(int fd, unsigned long long *offset, char *buf, int len);
70extern int write_file(int fd, unsigned long long *offset, const char *buf, 75extern int write_file(int fd, unsigned long long *offset, const char *buf,
@@ -86,7 +91,6 @@ extern int rename_file(char *from, char *to);
86extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 91extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
87 long long *bfree_out, long long *bavail_out, 92 long long *bfree_out, long long *bavail_out,
88 long long *files_out, long long *ffree_out, 93 long long *files_out, long long *ffree_out,
89 void *fsid_out, int fsid_size, long *namelen_out, 94 void *fsid_out, int fsid_size, long *namelen_out);
90 long *spare_out);
91 95
92#endif 96#endif
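
The header change replaces stat_file()'s dozen scalar out-parameters with a single hostfs_stat struct, which also carries the device numbers that the removed file_type() helper used to report. A userspace sketch of the new contract, backed directly by lstat(2); note the real helper returns -errno rather than -1, and only a subset of the fields is filled here:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

struct hostfs_stat_sketch {
	unsigned long long ino;
	unsigned int mode, nlink, uid, gid;
	unsigned long long size;
	unsigned int maj, min;
};

static int stat_file_sketch(const char *path, struct hostfs_stat_sketch *p)
{
	struct stat st;

	if (lstat(path, &st) < 0)
		return -1;	/* the kernel helper returns -errno */
	p->ino = st.st_ino;
	p->mode = st.st_mode;
	p->nlink = st.st_nlink;
	p->uid = st.st_uid;
	p->gid = st.st_gid;
	p->size = st.st_size;
	p->maj = major(st.st_rdev);	/* what file_type() used to return */
	p->min = minor(st.st_rdev);
	return 0;
}

int main(void)
{
	struct hostfs_stat_sketch st;

	if (!stat_file_sketch("/", &st))
		printf("ino=%llu mode=%o\n", st.ino, st.mode);
	return 0;
}
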
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 87ac1891a18..2c0f148a49e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -14,12 +14,12 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h>
17#include "hostfs.h" 18#include "hostfs.h"
18#include "init.h" 19#include "init.h"
19#include "kern.h" 20#include "kern.h"
20 21
21struct hostfs_inode_info { 22struct hostfs_inode_info {
22 char *host_filename;
23 int fd; 23 int fd;
24 fmode_t mode; 24 fmode_t mode;
25 struct inode vfs_inode; 25 struct inode vfs_inode;
@@ -49,7 +49,7 @@ static int append = 0;
49 49
50static const struct inode_operations hostfs_iops; 50static const struct inode_operations hostfs_iops;
51static const struct inode_operations hostfs_dir_iops; 51static const struct inode_operations hostfs_dir_iops;
52static const struct address_space_operations hostfs_link_aops; 52static const struct inode_operations hostfs_link_iops;
53 53
54#ifndef MODULE 54#ifndef MODULE
55static int __init hostfs_args(char *options, int *add) 55static int __init hostfs_args(char *options, int *add)
@@ -90,71 +90,58 @@ __uml_setup("hostfs=", hostfs_args,
90); 90);
91#endif 91#endif
92 92
93static char *dentry_name(struct dentry *dentry, int extra) 93static char *__dentry_name(struct dentry *dentry, char *name)
94{ 94{
95 struct dentry *parent; 95 char *p = __dentry_path(dentry, name, PATH_MAX);
96 char *root, *name; 96 char *root;
97 int len; 97 size_t len;
98
99 len = 0;
100 parent = dentry;
101 while (parent->d_parent != parent) {
102 len += parent->d_name.len + 1;
103 parent = parent->d_parent;
104 }
105 98
106 root = HOSTFS_I(parent->d_inode)->host_filename; 99 spin_unlock(&dcache_lock);
107 len += strlen(root);
108 name = kmalloc(len + extra + 1, GFP_KERNEL);
109 if (name == NULL)
110 return NULL;
111 100
112 name[len] = '\0'; 101 root = dentry->d_sb->s_fs_info;
113 parent = dentry; 102 len = strlen(root);
114 while (parent->d_parent != parent) { 103 if (IS_ERR(p)) {
115 len -= parent->d_name.len + 1; 104 __putname(name);
116 name[len] = '/'; 105 return NULL;
117 strncpy(&name[len + 1], parent->d_name.name, 106 }
118 parent->d_name.len); 107 strlcpy(name, root, PATH_MAX);
119 parent = parent->d_parent; 108 if (len > p - name) {
109 __putname(name);
110 return NULL;
111 }
112 if (p > name + len) {
113 char *s = name + len;
114 while ((*s++ = *p++) != '\0')
115 ;
120 } 116 }
121 strncpy(name, root, strlen(root));
122 return name; 117 return name;
123} 118}
124 119
125static char *inode_name(struct inode *ino, int extra) 120static char *dentry_name(struct dentry *dentry)
126{ 121{
127 struct dentry *dentry; 122 char *name = __getname();
123 if (!name)
124 return NULL;
128 125
129 dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias); 126 spin_lock(&dcache_lock);
130 return dentry_name(dentry, extra); 127 return __dentry_name(dentry, name); /* will unlock */
131} 128}
132 129
133static int read_name(struct inode *ino, char *name) 130static char *inode_name(struct inode *ino)
134{ 131{
135 /* 132 struct dentry *dentry;
136 * The non-int inode fields are copied into ints by stat_file and 133 char *name = __getname();
137 * then copied into the inode because passing the actual pointers 134 if (!name)
138 * in and having them treated as int * breaks on big-endian machines 135 return NULL;
139 */
140 int err;
141 int i_mode, i_nlink, i_blksize;
142 unsigned long long i_size;
143 unsigned long long i_ino;
144 unsigned long long i_blocks;
145
146 err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid,
147 &ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime,
148 &ino->i_ctime, &i_blksize, &i_blocks, -1);
149 if (err)
150 return err;
151 136
152 ino->i_ino = i_ino; 137 spin_lock(&dcache_lock);
153 ino->i_mode = i_mode; 138 if (list_empty(&ino->i_dentry)) {
154 ino->i_nlink = i_nlink; 139 spin_unlock(&dcache_lock);
155 ino->i_size = i_size; 140 __putname(name);
156 ino->i_blocks = i_blocks; 141 return NULL;
157 return 0; 142 }
143 dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias);
144 return __dentry_name(dentry, name); /* will unlock */
158} 145}
159 146
160static char *follow_link(char *link) 147static char *follow_link(char *link)
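
__dentry_name() above leans on __dentry_path(), which builds the path backwards into the end of a PATH_MAX buffer and returns a pointer into it; the mount root (now stashed in s_fs_info) is then copied to the front and the tail slid down behind it, with the two length checks rejecting anything that will not fit. A standalone model of that join, with __getname()/__putname() reduced to malloc/free and the buffer contents faked:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PATH_MAX_SKETCH 4096

static char *join_root(const char *root, const char *p, char *name)
{
	size_t len = strlen(root);

	/* p points at the relative path near the end of name[] */
	if (len > (size_t)(p - name))	/* root would overrun the tail */
		return NULL;
	memcpy(name, root, len + 1);
	if (p > name + len) {		/* slide the tail down behind root */
		char *s = name + len;
		while ((*s++ = *p++) != '\0')
			;
	}
	return name;
}

int main(void)
{
	char *name = malloc(PATH_MAX_SKETCH);
	char *p;

	/* fake what __dentry_path() would hand back: a tail-anchored path */
	p = name + PATH_MAX_SKETCH - sizeof("/dir/file");
	strcpy(p, "/dir/file");

	puts(join_root("/srv/hostfs-root", p, name));
	free(name);
	return 0;
}
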
@@ -205,53 +192,11 @@ static char *follow_link(char *link)
205 return ERR_PTR(n); 192 return ERR_PTR(n);
206} 193}
207 194
208static int hostfs_read_inode(struct inode *ino)
209{
210 char *name;
211 int err = 0;
212
213 /*
214 * Unfortunately, we are called from iget() when we don't have a dentry
215 * allocated yet.
216 */
217 if (list_empty(&ino->i_dentry))
218 goto out;
219
220 err = -ENOMEM;
221 name = inode_name(ino, 0);
222 if (name == NULL)
223 goto out;
224
225 if (file_type(name, NULL, NULL) == OS_TYPE_SYMLINK) {
226 name = follow_link(name);
227 if (IS_ERR(name)) {
228 err = PTR_ERR(name);
229 goto out;
230 }
231 }
232
233 err = read_name(ino, name);
234 kfree(name);
235 out:
236 return err;
237}
238
239static struct inode *hostfs_iget(struct super_block *sb) 195static struct inode *hostfs_iget(struct super_block *sb)
240{ 196{
241 struct inode *inode; 197 struct inode *inode = new_inode(sb);
242 long ret;
243
244 inode = iget_locked(sb, 0);
245 if (!inode) 198 if (!inode)
246 return ERR_PTR(-ENOMEM); 199 return ERR_PTR(-ENOMEM);
247 if (inode->i_state & I_NEW) {
248 ret = hostfs_read_inode(inode);
249 if (ret < 0) {
250 iget_failed(inode);
251 return ERR_PTR(ret);
252 }
253 unlock_new_inode(inode);
254 }
255 return inode; 200 return inode;
256} 201}
257 202
@@ -269,10 +214,10 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
269 long long f_files; 214 long long f_files;
270 long long f_ffree; 215 long long f_ffree;
271 216
272 err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename, 217 err = do_statfs(dentry->d_sb->s_fs_info,
273 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, 218 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
274 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), 219 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
275 &sf->f_namelen, sf->f_spare); 220 &sf->f_namelen);
276 if (err) 221 if (err)
277 return err; 222 return err;
278 sf->f_blocks = f_blocks; 223 sf->f_blocks = f_blocks;
@@ -288,47 +233,32 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
288{ 233{
289 struct hostfs_inode_info *hi; 234 struct hostfs_inode_info *hi;
290 235
291 hi = kmalloc(sizeof(*hi), GFP_KERNEL); 236 hi = kzalloc(sizeof(*hi), GFP_KERNEL);
292 if (hi == NULL) 237 if (hi == NULL)
293 return NULL; 238 return NULL;
294 239 hi->fd = -1;
295 *hi = ((struct hostfs_inode_info) { .host_filename = NULL,
296 .fd = -1,
297 .mode = 0 });
298 inode_init_once(&hi->vfs_inode); 240 inode_init_once(&hi->vfs_inode);
299 return &hi->vfs_inode; 241 return &hi->vfs_inode;
300} 242}
301 243
302static void hostfs_delete_inode(struct inode *inode) 244static void hostfs_evict_inode(struct inode *inode)
303{ 245{
304 truncate_inode_pages(&inode->i_data, 0); 246 truncate_inode_pages(&inode->i_data, 0);
247 end_writeback(inode);
305 if (HOSTFS_I(inode)->fd != -1) { 248 if (HOSTFS_I(inode)->fd != -1) {
306 close_file(&HOSTFS_I(inode)->fd); 249 close_file(&HOSTFS_I(inode)->fd);
307 HOSTFS_I(inode)->fd = -1; 250 HOSTFS_I(inode)->fd = -1;
308 } 251 }
309 clear_inode(inode);
310} 252}
311 253
312static void hostfs_destroy_inode(struct inode *inode) 254static void hostfs_destroy_inode(struct inode *inode)
313{ 255{
314 kfree(HOSTFS_I(inode)->host_filename);
315
316 /*
317 * XXX: This should not happen, probably. The check is here for
318 * additional safety.
319 */
320 if (HOSTFS_I(inode)->fd != -1) {
321 close_file(&HOSTFS_I(inode)->fd);
322 printk(KERN_DEBUG "Closing host fd in .destroy_inode\n");
323 }
324
325 kfree(HOSTFS_I(inode)); 256 kfree(HOSTFS_I(inode));
326} 257}
327 258
328static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 259static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
329{ 260{
330 struct inode *root = vfs->mnt_sb->s_root->d_inode; 261 const char *root_path = vfs->mnt_sb->s_fs_info;
331 const char *root_path = HOSTFS_I(root)->host_filename;
332 size_t offset = strlen(root_ino) + 1; 262 size_t offset = strlen(root_ino) + 1;
333 263
334 if (strlen(root_path) > offset) 264 if (strlen(root_path) > offset)
@@ -339,9 +269,8 @@ static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
339 269
340static const struct super_operations hostfs_sbops = { 270static const struct super_operations hostfs_sbops = {
341 .alloc_inode = hostfs_alloc_inode, 271 .alloc_inode = hostfs_alloc_inode,
342 .drop_inode = generic_delete_inode,
343 .delete_inode = hostfs_delete_inode,
344 .destroy_inode = hostfs_destroy_inode, 272 .destroy_inode = hostfs_destroy_inode,
273 .evict_inode = hostfs_evict_inode,
345 .statfs = hostfs_statfs, 274 .statfs = hostfs_statfs,
346 .show_options = hostfs_show_options, 275 .show_options = hostfs_show_options,
347}; 276};
@@ -353,11 +282,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
353 unsigned long long next, ino; 282 unsigned long long next, ino;
354 int error, len; 283 int error, len;
355 284
356 name = dentry_name(file->f_path.dentry, 0); 285 name = dentry_name(file->f_path.dentry);
357 if (name == NULL) 286 if (name == NULL)
358 return -ENOMEM; 287 return -ENOMEM;
359 dir = open_dir(name, &error); 288 dir = open_dir(name, &error);
360 kfree(name); 289 __putname(name);
361 if (dir == NULL) 290 if (dir == NULL)
362 return -error; 291 return -error;
363 next = file->f_pos; 292 next = file->f_pos;
@@ -373,40 +302,59 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
373 302
374int hostfs_file_open(struct inode *ino, struct file *file) 303int hostfs_file_open(struct inode *ino, struct file *file)
375{ 304{
305 static DEFINE_MUTEX(open_mutex);
376 char *name; 306 char *name;
377 fmode_t mode = 0; 307 fmode_t mode = 0;
308 int err;
378 int r = 0, w = 0, fd; 309 int r = 0, w = 0, fd;
379 310
380 mode = file->f_mode & (FMODE_READ | FMODE_WRITE); 311 mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
381 if ((mode & HOSTFS_I(ino)->mode) == mode) 312 if ((mode & HOSTFS_I(ino)->mode) == mode)
382 return 0; 313 return 0;
383 314
384 /* 315 mode |= HOSTFS_I(ino)->mode;
385 * The file may already have been opened, but with the wrong access,
386 * so this resets things and reopens the file with the new access.
387 */
388 if (HOSTFS_I(ino)->fd != -1) {
389 close_file(&HOSTFS_I(ino)->fd);
390 HOSTFS_I(ino)->fd = -1;
391 }
392 316
393 HOSTFS_I(ino)->mode |= mode; 317retry:
394 if (HOSTFS_I(ino)->mode & FMODE_READ) 318 if (mode & FMODE_READ)
395 r = 1; 319 r = 1;
396 if (HOSTFS_I(ino)->mode & FMODE_WRITE) 320 if (mode & FMODE_WRITE)
397 w = 1; 321 w = 1;
398 if (w) 322 if (w)
399 r = 1; 323 r = 1;
400 324
401 name = dentry_name(file->f_path.dentry, 0); 325 name = dentry_name(file->f_path.dentry);
402 if (name == NULL) 326 if (name == NULL)
403 return -ENOMEM; 327 return -ENOMEM;
404 328
405 fd = open_file(name, r, w, append); 329 fd = open_file(name, r, w, append);
406 kfree(name); 330 __putname(name);
407 if (fd < 0) 331 if (fd < 0)
408 return fd; 332 return fd;
409 FILE_HOSTFS_I(file)->fd = fd; 333
334 mutex_lock(&open_mutex);
335 /* did somebody else handle it first? */
336 if ((mode & HOSTFS_I(ino)->mode) == mode) {
337 mutex_unlock(&open_mutex);
338 return 0;
339 }
340 if ((mode | HOSTFS_I(ino)->mode) != mode) {
341 mode |= HOSTFS_I(ino)->mode;
342 mutex_unlock(&open_mutex);
343 close_file(&fd);
344 goto retry;
345 }
346 if (HOSTFS_I(ino)->fd == -1) {
347 HOSTFS_I(ino)->fd = fd;
348 } else {
349 err = replace_file(fd, HOSTFS_I(ino)->fd);
350 close_file(&fd);
351 if (err < 0) {
352 mutex_unlock(&open_mutex);
353 return err;
354 }
355 }
356 HOSTFS_I(ino)->mode = mode;
357 mutex_unlock(&open_mutex);
410 358
411 return 0; 359 return 0;
412} 360}
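
The rewritten hostfs_file_open() widens an already-cached host fd instead of blindly reopening: the host open happens outside the lock, and under open_mutex the code either finds that a racing opener already installed a wide-enough fd, notices that the required mode grew and retries, or installs the new descriptor (replace_file() dup2()s it over the old one so existing users keep working). Below is a single-process model of just the mode bookkeeping, with pthreads standing in for the kernel mutex and the host open simulated:

#include <pthread.h>
#include <stdio.h>

#define FMODE_READ  1
#define FMODE_WRITE 2

static pthread_mutex_t open_mutex = PTHREAD_MUTEX_INITIALIZER;
static int cur_mode;		/* access mode of the cached host fd */

static int file_open(int mode)
{
	if ((mode & cur_mode) == mode)	/* fast path: already wide enough */
		return 0;
	mode |= cur_mode;
retry:
	/* (the host file would be opened here with 'mode', lock not held) */
	pthread_mutex_lock(&open_mutex);
	if ((mode & cur_mode) == mode) {	/* somebody beat us to it */
		pthread_mutex_unlock(&open_mutex);
		return 0;
	}
	if ((mode | cur_mode) != mode) {	/* mode grew meanwhile */
		mode |= cur_mode;
		pthread_mutex_unlock(&open_mutex);
		goto retry;			/* close and reopen wider */
	}
	cur_mode = mode;	/* install (replace_file() in the kernel) */
	pthread_mutex_unlock(&open_mutex);
	return 0;
}

int main(void)
{
	file_open(FMODE_READ);
	file_open(FMODE_READ | FMODE_WRITE);
	printf("cached mode: %d\n", cur_mode);	/* prints 3 */
	return 0;
}
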
@@ -544,54 +492,50 @@ static const struct address_space_operations hostfs_aops = {
544 .write_end = hostfs_write_end, 492 .write_end = hostfs_write_end,
545}; 493};
546 494
547static int init_inode(struct inode *inode, struct dentry *dentry) 495static int read_name(struct inode *ino, char *name)
548{ 496{
549 char *name; 497 dev_t rdev;
550 int type, err = -ENOMEM; 498 struct hostfs_stat st;
551 int maj, min; 499 int err = stat_file(name, &st, -1);
552 dev_t rdev = 0; 500 if (err)
501 return err;
553 502
554 if (dentry) { 503 /* Reencode maj and min with the kernel encoding.*/
555 name = dentry_name(dentry, 0); 504 rdev = MKDEV(st.maj, st.min);
556 if (name == NULL)
557 goto out;
558 type = file_type(name, &maj, &min);
559 /* Reencode maj and min with the kernel encoding.*/
560 rdev = MKDEV(maj, min);
561 kfree(name);
562 }
563 else type = OS_TYPE_DIR;
564 505
565 err = 0; 506 switch (st.mode & S_IFMT) {
566 if (type == OS_TYPE_SYMLINK) 507 case S_IFLNK:
567 inode->i_op = &page_symlink_inode_operations; 508 ino->i_op = &hostfs_link_iops;
568 else if (type == OS_TYPE_DIR)
569 inode->i_op = &hostfs_dir_iops;
570 else inode->i_op = &hostfs_iops;
571
572 if (type == OS_TYPE_DIR) inode->i_fop = &hostfs_dir_fops;
573 else inode->i_fop = &hostfs_file_fops;
574
575 if (type == OS_TYPE_SYMLINK)
576 inode->i_mapping->a_ops = &hostfs_link_aops;
577 else inode->i_mapping->a_ops = &hostfs_aops;
578
579 switch (type) {
580 case OS_TYPE_CHARDEV:
581 init_special_inode(inode, S_IFCHR, rdev);
582 break; 509 break;
583 case OS_TYPE_BLOCKDEV: 510 case S_IFDIR:
584 init_special_inode(inode, S_IFBLK, rdev); 511 ino->i_op = &hostfs_dir_iops;
512 ino->i_fop = &hostfs_dir_fops;
585 break; 513 break;
586 case OS_TYPE_FIFO: 514 case S_IFCHR:
587 init_special_inode(inode, S_IFIFO, 0); 515 case S_IFBLK:
516 case S_IFIFO:
517 case S_IFSOCK:
518 init_special_inode(ino, st.mode & S_IFMT, rdev);
519 ino->i_op = &hostfs_iops;
588 break; 520 break;
589 case OS_TYPE_SOCK: 521
590 init_special_inode(inode, S_IFSOCK, 0); 522 default:
591 break; 523 ino->i_op = &hostfs_iops;
592 } 524 ino->i_fop = &hostfs_file_fops;
593 out: 525 ino->i_mapping->a_ops = &hostfs_aops;
594 return err; 526 }
527
528 ino->i_ino = st.ino;
529 ino->i_mode = st.mode;
530 ino->i_nlink = st.nlink;
531 ino->i_uid = st.uid;
532 ino->i_gid = st.gid;
533 ino->i_atime = st.atime;
534 ino->i_mtime = st.mtime;
535 ino->i_ctime = st.ctime;
536 ino->i_size = st.size;
537 ino->i_blocks = st.blocks;
538 return 0;
595} 539}
596 540
597int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, 541int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
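
The new read_name() above derives the inode operations from st.mode & S_IFMT instead of the removed file_type() helper, so one host stat call classifies symlinks, directories, the special types, and regular files. The same dispatch in plain userspace:

#include <stdio.h>
#include <sys/stat.h>

static const char *classify(unsigned int mode)
{
	switch (mode & S_IFMT) {
	case S_IFLNK:  return "symlink";
	case S_IFDIR:  return "directory";
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK: return "special";
	default:       return "regular file";
	}
}

int main(void)
{
	struct stat st;

	if (lstat("/", &st) == 0)
		printf("/: %s\n", classify(st.st_mode));
	return 0;
}
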
@@ -607,12 +551,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
607 goto out; 551 goto out;
608 } 552 }
609 553
610 error = init_inode(inode, dentry);
611 if (error)
612 goto out_put;
613
614 error = -ENOMEM; 554 error = -ENOMEM;
615 name = dentry_name(dentry, 0); 555 name = dentry_name(dentry);
616 if (name == NULL) 556 if (name == NULL)
617 goto out_put; 557 goto out_put;
618 558
@@ -622,9 +562,10 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
622 mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); 562 mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH);
623 if (fd < 0) 563 if (fd < 0)
624 error = fd; 564 error = fd;
625 else error = read_name(inode, name); 565 else
566 error = read_name(inode, name);
626 567
627 kfree(name); 568 __putname(name);
628 if (error) 569 if (error)
629 goto out_put; 570 goto out_put;
630 571
@@ -652,17 +593,14 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
652 goto out; 593 goto out;
653 } 594 }
654 595
655 err = init_inode(inode, dentry);
656 if (err)
657 goto out_put;
658
659 err = -ENOMEM; 596 err = -ENOMEM;
660 name = dentry_name(dentry, 0); 597 name = dentry_name(dentry);
661 if (name == NULL) 598 if (name == NULL)
662 goto out_put; 599 goto out_put;
663 600
664 err = read_name(inode, name); 601 err = read_name(inode, name);
665 kfree(name); 602
603 __putname(name);
666 if (err == -ENOENT) { 604 if (err == -ENOENT) {
667 iput(inode); 605 iput(inode);
668 inode = NULL; 606 inode = NULL;
@@ -680,36 +618,21 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
680 return ERR_PTR(err); 618 return ERR_PTR(err);
681} 619}
682 620
683static char *inode_dentry_name(struct inode *ino, struct dentry *dentry)
684{
685 char *file;
686 int len;
687
688 file = inode_name(ino, dentry->d_name.len + 1);
689 if (file == NULL)
690 return NULL;
691 strcat(file, "/");
692 len = strlen(file);
693 strncat(file, dentry->d_name.name, dentry->d_name.len);
694 file[len + dentry->d_name.len] = '\0';
695 return file;
696}
697
698int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) 621int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
699{ 622{
700 char *from_name, *to_name; 623 char *from_name, *to_name;
701 int err; 624 int err;
702 625
703 if ((from_name = inode_dentry_name(ino, from)) == NULL) 626 if ((from_name = dentry_name(from)) == NULL)
704 return -ENOMEM; 627 return -ENOMEM;
705 to_name = dentry_name(to, 0); 628 to_name = dentry_name(to);
706 if (to_name == NULL) { 629 if (to_name == NULL) {
707 kfree(from_name); 630 __putname(from_name);
708 return -ENOMEM; 631 return -ENOMEM;
709 } 632 }
710 err = link_file(to_name, from_name); 633 err = link_file(to_name, from_name);
711 kfree(from_name); 634 __putname(from_name);
712 kfree(to_name); 635 __putname(to_name);
713 return err; 636 return err;
714} 637}
715 638
@@ -718,13 +641,14 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry)
718 char *file; 641 char *file;
719 int err; 642 int err;
720 643
721 if ((file = inode_dentry_name(ino, dentry)) == NULL)
722 return -ENOMEM;
723 if (append) 644 if (append)
724 return -EPERM; 645 return -EPERM;
725 646
647 if ((file = dentry_name(dentry)) == NULL)
648 return -ENOMEM;
649
726 err = unlink_file(file); 650 err = unlink_file(file);
727 kfree(file); 651 __putname(file);
728 return err; 652 return err;
729} 653}
730 654
@@ -733,10 +657,10 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
733 char *file; 657 char *file;
734 int err; 658 int err;
735 659
736 if ((file = inode_dentry_name(ino, dentry)) == NULL) 660 if ((file = dentry_name(dentry)) == NULL)
737 return -ENOMEM; 661 return -ENOMEM;
738 err = make_symlink(file, to); 662 err = make_symlink(file, to);
739 kfree(file); 663 __putname(file);
740 return err; 664 return err;
741} 665}
742 666
@@ -745,10 +669,10 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
745 char *file; 669 char *file;
746 int err; 670 int err;
747 671
748 if ((file = inode_dentry_name(ino, dentry)) == NULL) 672 if ((file = dentry_name(dentry)) == NULL)
749 return -ENOMEM; 673 return -ENOMEM;
750 err = do_mkdir(file, mode); 674 err = do_mkdir(file, mode);
751 kfree(file); 675 __putname(file);
752 return err; 676 return err;
753} 677}
754 678
@@ -757,10 +681,10 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
757 char *file; 681 char *file;
758 int err; 682 int err;
759 683
760 if ((file = inode_dentry_name(ino, dentry)) == NULL) 684 if ((file = dentry_name(dentry)) == NULL)
761 return -ENOMEM; 685 return -ENOMEM;
762 err = do_rmdir(file); 686 err = do_rmdir(file);
763 kfree(file); 687 __putname(file);
764 return err; 688 return err;
765} 689}
766 690
@@ -776,22 +700,18 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
776 goto out; 700 goto out;
777 } 701 }
778 702
779 err = init_inode(inode, dentry);
780 if (err)
781 goto out_put;
782
783 err = -ENOMEM; 703 err = -ENOMEM;
784 name = dentry_name(dentry, 0); 704 name = dentry_name(dentry);
785 if (name == NULL) 705 if (name == NULL)
786 goto out_put; 706 goto out_put;
787 707
788 init_special_inode(inode, mode, dev); 708 init_special_inode(inode, mode, dev);
789 err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); 709 err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
790 if (err) 710 if (err)
791 goto out_free; 711 goto out_free;
792 712
793 err = read_name(inode, name); 713 err = read_name(inode, name);
794 kfree(name); 714 __putname(name);
795 if (err) 715 if (err)
796 goto out_put; 716 goto out_put;
797 717
@@ -799,7 +721,7 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
799 return 0; 721 return 0;
800 722
801 out_free: 723 out_free:
802 kfree(name); 724 __putname(name);
803 out_put: 725 out_put:
804 iput(inode); 726 iput(inode);
805 out: 727 out:
@@ -812,15 +734,15 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
812 char *from_name, *to_name; 734 char *from_name, *to_name;
813 int err; 735 int err;
814 736
815 if ((from_name = inode_dentry_name(from_ino, from)) == NULL) 737 if ((from_name = dentry_name(from)) == NULL)
816 return -ENOMEM; 738 return -ENOMEM;
817 if ((to_name = inode_dentry_name(to_ino, to)) == NULL) { 739 if ((to_name = dentry_name(to)) == NULL) {
818 kfree(from_name); 740 __putname(from_name);
819 return -ENOMEM; 741 return -ENOMEM;
820 } 742 }
821 err = rename_file(from_name, to_name); 743 err = rename_file(from_name, to_name);
822 kfree(from_name); 744 __putname(from_name);
823 kfree(to_name); 745 __putname(to_name);
824 return err; 746 return err;
825} 747}
826 748
@@ -832,7 +754,7 @@ int hostfs_permission(struct inode *ino, int desired)
832 if (desired & MAY_READ) r = 1; 754 if (desired & MAY_READ) r = 1;
833 if (desired & MAY_WRITE) w = 1; 755 if (desired & MAY_WRITE) w = 1;
834 if (desired & MAY_EXEC) x = 1; 756 if (desired & MAY_EXEC) x = 1;
835 name = inode_name(ino, 0); 757 name = inode_name(ino);
836 if (name == NULL) 758 if (name == NULL)
837 return -ENOMEM; 759 return -ENOMEM;
838 760
@@ -841,7 +763,7 @@ int hostfs_permission(struct inode *ino, int desired)
841 err = 0; 763 err = 0;
842 else 764 else
843 err = access_file(name, r, w, x); 765 err = access_file(name, r, w, x);
844 kfree(name); 766 __putname(name);
845 if (!err) 767 if (!err)
846 err = generic_permission(ino, desired, NULL); 768 err = generic_permission(ino, desired, NULL);
847 return err; 769 return err;
@@ -849,13 +771,14 @@ int hostfs_permission(struct inode *ino, int desired)
849 771
850int hostfs_setattr(struct dentry *dentry, struct iattr *attr) 772int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
851{ 773{
774 struct inode *inode = dentry->d_inode;
852 struct hostfs_iattr attrs; 775 struct hostfs_iattr attrs;
853 char *name; 776 char *name;
854 int err; 777 int err;
855 778
856 int fd = HOSTFS_I(dentry->d_inode)->fd; 779 int fd = HOSTFS_I(inode)->fd;
857 780
858 err = inode_change_ok(dentry->d_inode, attr); 781 err = inode_change_ok(inode, attr);
859 if (err) 782 if (err)
860 return err; 783 return err;
861 784
@@ -897,15 +820,26 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
897 if (attr->ia_valid & ATTR_MTIME_SET) { 820 if (attr->ia_valid & ATTR_MTIME_SET) {
898 attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; 821 attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;
899 } 822 }
900 name = dentry_name(dentry, 0); 823 name = dentry_name(dentry);
901 if (name == NULL) 824 if (name == NULL)
902 return -ENOMEM; 825 return -ENOMEM;
903 err = set_attr(name, &attrs, fd); 826 err = set_attr(name, &attrs, fd);
904 kfree(name); 827 __putname(name);
905 if (err) 828 if (err)
906 return err; 829 return err;
907 830
908 return inode_setattr(dentry->d_inode, attr); 831 if ((attr->ia_valid & ATTR_SIZE) &&
832 attr->ia_size != i_size_read(inode)) {
833 int error;
834
835 error = vmtruncate(inode, attr->ia_size);
836 if (error)
837 return error;
838 }
839
840 setattr_copy(inode, attr);
841 mark_inode_dirty(inode);
842 return 0;
909} 843}
910 844
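
The hostfs, hpfs and hugetlbfs hunks in this series all replace inode_setattr() with the same open-coded sequence. For reference, the resulting ->setattr shape boils down to roughly the following (a sketch only; "myfs" and the absence of any on-disk attribute work are assumptions):

	#include <linux/fs.h>

	static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, attr);	/* generic permission checks */
		if (error)
			return error;

		/* size changes are handled explicitly now... */
		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size != i_size_read(inode)) {
			error = vmtruncate(inode, attr->ia_size);
			if (error)
				return error;
		}

		/* ...then the remaining attributes are copied into the inode */
		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}
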
911static const struct inode_operations hostfs_iops = { 845static const struct inode_operations hostfs_iops = {
@@ -935,32 +869,41 @@ static const struct inode_operations hostfs_dir_iops = {
935 .setattr = hostfs_setattr, 869 .setattr = hostfs_setattr,
936}; 870};
937 871
938int hostfs_link_readpage(struct file *file, struct page *page) 872static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
939{ 873{
940 char *buffer, *name; 874 char *link = __getname();
941 int err; 875 if (link) {
942 876 char *path = dentry_name(dentry);
943 buffer = kmap(page); 877 int err = -ENOMEM;
944 name = inode_name(page->mapping->host, 0); 878 if (path) {
945 if (name == NULL) 879 err = hostfs_do_readlink(path, link, PATH_MAX);
946 return -ENOMEM; 880 if (err == PATH_MAX)
947 err = hostfs_do_readlink(name, buffer, PAGE_CACHE_SIZE); 881 err = -E2BIG;
948 kfree(name); 882 __putname(path);
949 if (err == PAGE_CACHE_SIZE) 883 }
950 err = -E2BIG; 884 if (err < 0) {
951 else if (err > 0) { 885 __putname(link);
952 flush_dcache_page(page); 886 link = ERR_PTR(err);
953 SetPageUptodate(page); 887 }
954 if (PageError(page)) ClearPageError(page); 888 } else {
955 err = 0; 889 link = ERR_PTR(-ENOMEM);
956 } 890 }
957 kunmap(page); 891
958 unlock_page(page); 892 nd_set_link(nd, link);
959 return err; 893 return NULL;
960} 894}
961 895
962static const struct address_space_operations hostfs_link_aops = { 896static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
963 .readpage = hostfs_link_readpage, 897{
898 char *s = nd_get_link(nd);
899 if (!IS_ERR(s))
900 __putname(s);
901}
902
903static const struct inode_operations hostfs_link_iops = {
904 .readlink = generic_readlink,
905 .follow_link = hostfs_follow_link,
906 .put_link = hostfs_put_link,
964}; 907};
965 908
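
The symlink conversion above moves hostfs from a readpage-based symlink to the ->follow_link/->put_link pair. The contract, sketched with a hypothetical filesystem (the target string is invented; __getname()/__putname() allocate and release a PATH_MAX-sized buffer from the names cache):

	#include <linux/fs.h>
	#include <linux/namei.h>
	#include <linux/string.h>

	static void *myfs_follow_link(struct dentry *dentry, struct nameidata *nd)
	{
		char *link = __getname();		/* PATH_MAX-sized buffer */

		if (!link)
			link = ERR_PTR(-ENOMEM);
		else
			strcpy(link, "/invented/target");	/* resolve the target here */
		nd_set_link(nd, link);	/* the VFS walks this string... */
		return NULL;		/* ...and hands this cookie to ->put_link */
	}

	static void myfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
	{
		char *s = nd_get_link(nd);

		if (!IS_ERR(s))
			__putname(s);	/* freed only after the walk is done */
	}
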
966static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) 909static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
@@ -980,65 +923,63 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
980 req_root = ""; 923 req_root = "";
981 924
982 err = -ENOMEM; 925 err = -ENOMEM;
983 host_root_path = kmalloc(strlen(root_ino) + 1 926 sb->s_fs_info = host_root_path =
984 + strlen(req_root) + 1, GFP_KERNEL); 927 kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL);
985 if (host_root_path == NULL) 928 if (host_root_path == NULL)
986 goto out; 929 goto out;
987 930
988 sprintf(host_root_path, "%s/%s", root_ino, req_root); 931 sprintf(host_root_path, "%s/%s", root_ino, req_root);
989 932
990 root_inode = hostfs_iget(sb); 933 root_inode = new_inode(sb);
991 if (IS_ERR(root_inode)) { 934 if (!root_inode)
992 err = PTR_ERR(root_inode); 935 goto out;
993 goto out_free;
994 }
995 936
996 err = init_inode(root_inode, NULL); 937 err = read_name(root_inode, host_root_path);
997 if (err) 938 if (err)
998 goto out_put; 939 goto out_put;
999 940
1000 HOSTFS_I(root_inode)->host_filename = host_root_path; 941 if (S_ISLNK(root_inode->i_mode)) {
1001 /* 942 char *name = follow_link(host_root_path);
1002 * Avoid that in the error path, iput(root_inode) frees again 943 if (IS_ERR(name))
1003 * host_root_path through hostfs_destroy_inode! 944 err = PTR_ERR(name);
1004 */ 945 else
1005 host_root_path = NULL; 946 err = read_name(root_inode, name);
947 kfree(name);
948 if (err)
949 goto out_put;
950 }
1006 951
1007 err = -ENOMEM; 952 err = -ENOMEM;
1008 sb->s_root = d_alloc_root(root_inode); 953 sb->s_root = d_alloc_root(root_inode);
1009 if (sb->s_root == NULL) 954 if (sb->s_root == NULL)
1010 goto out_put; 955 goto out_put;
1011 956
1012 err = hostfs_read_inode(root_inode);
1013 if (err) {
1014 /* No iput in this case because the dput does that for us */
1015 dput(sb->s_root);
1016 sb->s_root = NULL;
1017 goto out;
1018 }
1019
1020 return 0; 957 return 0;
1021 958
1022out_put: 959out_put:
1023 iput(root_inode); 960 iput(root_inode);
1024out_free:
1025 kfree(host_root_path);
1026out: 961out:
1027 return err; 962 return err;
1028} 963}
1029 964
1030static int hostfs_read_sb(struct file_system_type *type, 965static struct dentry *hostfs_read_sb(struct file_system_type *type,
1031 int flags, const char *dev_name, 966 int flags, const char *dev_name,
1032 void *data, struct vfsmount *mnt) 967 void *data)
968{
969 return mount_nodev(type, flags, data, hostfs_fill_sb_common);
970}
971
972static void hostfs_kill_sb(struct super_block *s)
1033{ 973{
1034 return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); 974 kill_anon_super(s);
975 kfree(s->s_fs_info);
1035} 976}
1036 977
1037static struct file_system_type hostfs_type = { 978static struct file_system_type hostfs_type = {
1038 .owner = THIS_MODULE, 979 .owner = THIS_MODULE,
1039 .name = "hostfs", 980 .name = "hostfs",
1040 .get_sb = hostfs_read_sb, 981 .mount = hostfs_read_sb,
1041 .kill_sb = kill_anon_super, 982 .kill_sb = hostfs_kill_sb,
1042 .fs_flags = 0, 983 .fs_flags = 0,
1043}; 984};
1044 985
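
This is one instance of the tree-wide .get_sb to .mount conversion: the mount callback now returns the root dentry instead of filling in a vfsmount, and per-sb data that used to ride on the root inode is freed from ->kill_sb. A minimal sketch of the new registration, assuming a nodev filesystem with a hypothetical myfs_fill_super():

	static struct dentry *myfs_mount(struct file_system_type *type,
					 int flags, const char *dev_name, void *data)
	{
		return mount_nodev(type, flags, data, myfs_fill_super);
	}

	static void myfs_kill_sb(struct super_block *s)
	{
		kill_anon_super(s);	/* generic teardown first... */
		kfree(s->s_fs_info);	/* ...then the private per-sb data */
	}

	static struct file_system_type myfs_type = {
		.owner	 = THIS_MODULE,
		.name	 = "myfs",
		.mount	 = myfs_mount,
		.kill_sb = myfs_kill_sb,
	};
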
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index b79424f9328..d51a98384bc 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -19,11 +19,27 @@
19#include "user.h" 19#include "user.h"
20#include <utime.h> 20#include <utime.h>
21 21
22int stat_file(const char *path, unsigned long long *inode_out, int *mode_out, 22static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
23 int *nlink_out, int *uid_out, int *gid_out, 23{
24 unsigned long long *size_out, struct timespec *atime_out, 24 p->ino = buf->st_ino;
25 struct timespec *mtime_out, struct timespec *ctime_out, 25 p->mode = buf->st_mode;
26 int *blksize_out, unsigned long long *blocks_out, int fd) 26 p->nlink = buf->st_nlink;
27 p->uid = buf->st_uid;
28 p->gid = buf->st_gid;
29 p->size = buf->st_size;
30 p->atime.tv_sec = buf->st_atime;
31 p->atime.tv_nsec = 0;
32 p->ctime.tv_sec = buf->st_ctime;
33 p->ctime.tv_nsec = 0;
34 p->mtime.tv_sec = buf->st_mtime;
35 p->mtime.tv_nsec = 0;
36 p->blksize = buf->st_blksize;
37 p->blocks = buf->st_blocks;
38 p->maj = os_major(buf->st_rdev);
39 p->min = os_minor(buf->st_rdev);
40}
41
42int stat_file(const char *path, struct hostfs_stat *p, int fd)
27{ 43{
28 struct stat64 buf; 44 struct stat64 buf;
29 45
@@ -33,68 +49,10 @@ int stat_file(const char *path, unsigned long long *inode_out, int *mode_out,
33 } else if (lstat64(path, &buf) < 0) { 49 } else if (lstat64(path, &buf) < 0) {
34 return -errno; 50 return -errno;
35 } 51 }
36 52 stat64_to_hostfs(&buf, p);
37 if (inode_out != NULL)
38 *inode_out = buf.st_ino;
39 if (mode_out != NULL)
40 *mode_out = buf.st_mode;
41 if (nlink_out != NULL)
42 *nlink_out = buf.st_nlink;
43 if (uid_out != NULL)
44 *uid_out = buf.st_uid;
45 if (gid_out != NULL)
46 *gid_out = buf.st_gid;
47 if (size_out != NULL)
48 *size_out = buf.st_size;
49 if (atime_out != NULL) {
50 atime_out->tv_sec = buf.st_atime;
51 atime_out->tv_nsec = 0;
52 }
53 if (mtime_out != NULL) {
54 mtime_out->tv_sec = buf.st_mtime;
55 mtime_out->tv_nsec = 0;
56 }
57 if (ctime_out != NULL) {
58 ctime_out->tv_sec = buf.st_ctime;
59 ctime_out->tv_nsec = 0;
60 }
61 if (blksize_out != NULL)
62 *blksize_out = buf.st_blksize;
63 if (blocks_out != NULL)
64 *blocks_out = buf.st_blocks;
65 return 0; 53 return 0;
66} 54}
67 55
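
Folding the dozen out-parameters into struct hostfs_stat also simplifies every caller. A hypothetical one, for illustration (a negative fd means "stat by path", matching the fstat64/lstat64 split above):

	static long long example_file_size(const char *path)
	{
		struct hostfs_stat st;
		int err;

		err = stat_file(path, &st, -1);	/* -1: no open fd, use lstat64() */
		if (err)
			return err;		/* negative errno */
		return st.size;
	}
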
68int file_type(const char *path, int *maj, int *min)
69{
70 struct stat64 buf;
71
72 if (lstat64(path, &buf) < 0)
73 return -errno;
74 /*
75 * We cannot pass rdev as is because glibc and the kernel disagree
76 * about its definition.
77 */
78 if (maj != NULL)
79 *maj = major(buf.st_rdev);
80 if (min != NULL)
81 *min = minor(buf.st_rdev);
82
83 if (S_ISDIR(buf.st_mode))
84 return OS_TYPE_DIR;
85 else if (S_ISLNK(buf.st_mode))
86 return OS_TYPE_SYMLINK;
87 else if (S_ISCHR(buf.st_mode))
88 return OS_TYPE_CHARDEV;
89 else if (S_ISBLK(buf.st_mode))
90 return OS_TYPE_BLOCKDEV;
91 else if (S_ISFIFO(buf.st_mode))
92 return OS_TYPE_FIFO;
93 else if (S_ISSOCK(buf.st_mode))
94 return OS_TYPE_SOCK;
95 else return OS_TYPE_FILE;
96}
97
98int access_file(char *path, int r, int w, int x) 56int access_file(char *path, int r, int w, int x)
99{ 57{
100 int mode = 0; 58 int mode = 0;
@@ -136,8 +94,7 @@ void *open_dir(char *path, int *err_out)
136 94
137 dir = opendir(path); 95 dir = opendir(path);
138 *err_out = errno; 96 *err_out = errno;
139 if (dir == NULL) 97
140 return NULL;
141 return dir; 98 return dir;
142} 99}
143 100
@@ -202,6 +159,11 @@ int fsync_file(int fd, int datasync)
202 return 0; 159 return 0;
203} 160}
204 161
162int replace_file(int oldfd, int fd)
163{
164 return dup2(oldfd, fd);
165}
166
205void close_file(void *stream) 167void close_file(void *stream)
206{ 168{
207 close(*((int *) stream)); 169 close(*((int *) stream));
@@ -235,14 +197,14 @@ int file_create(char *name, int ur, int uw, int ux, int gr,
235 197
236int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) 198int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
237{ 199{
200 struct hostfs_stat st;
238 struct timeval times[2]; 201 struct timeval times[2];
239 struct timespec atime_ts, mtime_ts;
240 int err, ma; 202 int err, ma;
241 203
242 if (attrs->ia_valid & HOSTFS_ATTR_MODE) { 204 if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
243 if (fd >= 0) { 205 if (fd >= 0) {
244 if (fchmod(fd, attrs->ia_mode) != 0) 206 if (fchmod(fd, attrs->ia_mode) != 0)
245 return (-errno); 207 return -errno;
246 } else if (chmod(file, attrs->ia_mode) != 0) { 208 } else if (chmod(file, attrs->ia_mode) != 0) {
247 return -errno; 209 return -errno;
248 } 210 }
@@ -279,15 +241,14 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
279 */ 241 */
280 ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET); 242 ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET);
281 if (attrs->ia_valid & ma) { 243 if (attrs->ia_valid & ma) {
282 err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, 244 err = stat_file(file, &st, fd);
283 &atime_ts, &mtime_ts, NULL, NULL, NULL, fd);
284 if (err != 0) 245 if (err != 0)
285 return err; 246 return err;
286 247
287 times[0].tv_sec = atime_ts.tv_sec; 248 times[0].tv_sec = st.atime.tv_sec;
288 times[0].tv_usec = atime_ts.tv_nsec / 1000; 249 times[0].tv_usec = st.atime.tv_nsec / 1000;
289 times[1].tv_sec = mtime_ts.tv_sec; 250 times[1].tv_sec = st.mtime.tv_sec;
290 times[1].tv_usec = mtime_ts.tv_nsec / 1000; 251 times[1].tv_usec = st.mtime.tv_nsec / 1000;
291 252
292 if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) { 253 if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) {
293 times[0].tv_sec = attrs->ia_atime.tv_sec; 254 times[0].tv_sec = attrs->ia_atime.tv_sec;
@@ -308,9 +269,9 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
308 269
309 /* Note: ctime is not handled */ 270 /* Note: ctime is not handled */
310 if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) { 271 if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) {
311 err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, 272 err = stat_file(file, &st, fd);
312 &attrs->ia_atime, &attrs->ia_mtime, NULL, 273 if (err != 0)
313 NULL, NULL, fd); 274 return err;
314 if (err != 0) 275 attrs->ia_atime = st.atime;
315 return err; 276 attrs->ia_mtime = st.mtime;
316 } 277 }
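
The *_SET handling above exists because utimes() always sets both timestamps at once: to change only one of them, set_attr() first reads the current pair back via stat_file(). The same dance in plain libc terms (a standalone sketch, not hostfs code):

	#include <sys/stat.h>
	#include <sys/time.h>
	#include <time.h>

	static int set_mtime_only(const char *path, time_t mtime)
	{
		struct stat st;
		struct timeval tv[2];

		if (stat(path, &st) < 0)
			return -1;
		tv[0].tv_sec = st.st_atime;	/* keep the current atime */
		tv[0].tv_usec = 0;
		tv[1].tv_sec = mtime;		/* replace only the mtime */
		tv[1].tv_usec = 0;
		return utimes(path, tv);
	}
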
@@ -361,7 +322,7 @@ int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor)
361{ 322{
362 int err; 323 int err;
363 324
364 err = mknod(file, mode, makedev(major, minor)); 325 err = mknod(file, mode, os_makedev(major, minor));
365 if (err) 326 if (err)
366 return -errno; 327 return -errno;
367 return 0; 328 return 0;
@@ -402,8 +363,7 @@ int rename_file(char *from, char *to)
402int do_statfs(char *root, long *bsize_out, long long *blocks_out, 363int do_statfs(char *root, long *bsize_out, long long *blocks_out,
403 long long *bfree_out, long long *bavail_out, 364 long long *bfree_out, long long *bavail_out,
404 long long *files_out, long long *ffree_out, 365 long long *files_out, long long *ffree_out,
405 void *fsid_out, int fsid_size, long *namelen_out, 366 void *fsid_out, int fsid_size, long *namelen_out)
406 long *spare_out)
407{ 367{
408 struct statfs64 buf; 368 struct statfs64 buf;
409 int err; 369 int err;
@@ -422,10 +382,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
422 sizeof(buf.f_fsid) > fsid_size ? fsid_size : 382 sizeof(buf.f_fsid) > fsid_size ? fsid_size :
423 sizeof(buf.f_fsid)); 383 sizeof(buf.f_fsid));
424 *namelen_out = buf.f_namelen; 384 *namelen_out = buf.f_namelen;
425 spare_out[0] = buf.f_spare[0]; 385
426 spare_out[1] = buf.f_spare[1];
427 spare_out[2] = buf.f_spare[2];
428 spare_out[3] = buf.f_spare[3];
429 spare_out[4] = buf.f_spare[4];
430 return 0; 386 return 0;
431} 387}
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6..63b6f563231 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix
4 help 5 help
5 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
6 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index eac5f96323e..793cb9d943d 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s)
14#ifdef DEBUG_LOCKS 14#ifdef DEBUG_LOCKS
15 printk("lock creation\n"); 15 printk("lock creation\n");
16#endif 16#endif
17 down(&hpfs_sb(s)->hpfs_creation_de); 17 mutex_lock(&hpfs_sb(s)->hpfs_creation_de);
18} 18}
19 19
20void hpfs_unlock_creation(struct super_block *s) 20void hpfs_unlock_creation(struct super_block *s)
@@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s)
22#ifdef DEBUG_LOCKS 22#ifdef DEBUG_LOCKS
23 printk("unlock creation\n"); 23 printk("unlock creation\n");
24#endif 24#endif
25 up(&hpfs_sb(s)->hpfs_creation_de); 25 mutex_unlock(&hpfs_sb(s)->hpfs_creation_de);
26} 26}
27 27
28/* Map a sector into a buffer and return pointers to it and to the buffer. */ 28/* Map a sector into a buffer and return pointers to it and to the buffer. */
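
The buffer.c change is the mechanical half of the semaphore-to-mutex conversion; the other half, in super.c below, swaps init_MUTEX() for mutex_init(). The full pattern, sketched with hypothetical names:

	#include <linux/mutex.h>

	struct myfs_sb_info {
		struct mutex creation_lock;	/* was: struct semaphore */
	};

	/* at fill_super time, replacing init_MUTEX():
	 *	mutex_init(&sbi->creation_lock);
	 */

	static void myfs_lock_creation(struct myfs_sb_info *sbi)
	{
		mutex_lock(&sbi->creation_lock);	/* was: down() */
	}

	static void myfs_unlock_creation(struct myfs_sb_info *sbi)
	{
		mutex_unlock(&sbi->creation_lock);	/* was: up() */
	}
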
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index a9ae9bfa752..c0340887c7e 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -97,10 +97,19 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
97 loff_t pos, unsigned len, unsigned flags, 97 loff_t pos, unsigned len, unsigned flags,
98 struct page **pagep, void **fsdata) 98 struct page **pagep, void **fsdata)
99{ 99{
100 int ret;
101
100 *pagep = NULL; 102 *pagep = NULL;
101 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 103 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
102 hpfs_get_block, 104 hpfs_get_block,
103 &hpfs_i(mapping->host)->mmu_private); 105 &hpfs_i(mapping->host)->mmu_private);
106 if (unlikely(ret)) {
107 loff_t isize = mapping->host->i_size;
108 if (pos + len > isize)
109 vmtruncate(mapping->host, isize);
110 }
111
112 return ret;
104} 113}
105 114
106static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) 115static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
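
The hunk above adds the now-standard write_begin error convention: if the helper fails after instantiating blocks past the old end of file, the caller-visible i_size has not moved, so the over-allocation is trimmed back with vmtruncate(). Generic shape of the pattern (a sketch; myfs_get_block is hypothetical):

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned flags,
				    struct page **pagep, void **fsdata)
	{
		int ret;

		*pagep = NULL;
		ret = block_write_begin(mapping, pos, len, flags, pagep,
					myfs_get_block);
		if (unlikely(ret)) {
			loff_t isize = mapping->host->i_size;
			if (pos + len > isize)	/* blocks were instantiated past EOF */
				vmtruncate(mapping->host, isize);
		}
		return ret;
	}
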
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 75f9d432485..2fee17d0d9a 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -87,7 +87,7 @@ struct hpfs_sb_info {
87 unsigned *sb_bmp_dir; /* main bitmap directory */ 87 unsigned *sb_bmp_dir; /* main bitmap directory */
88 unsigned sb_c_bitmap; /* current bitmap */ 88 unsigned sb_c_bitmap; /* current bitmap */
89 unsigned sb_max_fwd_alloc; /* max forward allocation */ 89 unsigned sb_max_fwd_alloc; /* max forward allocation */
90 struct semaphore hpfs_creation_de; /* when creating dirents, nobody else 90 struct mutex hpfs_creation_de; /* when creating dirents, nobody else
91 can alloc blocks */ 91 can alloc blocks */
92 /*unsigned sb_mounting : 1;*/ 92 /*unsigned sb_mounting : 1;*/
93 int sb_timeshift; 93 int sb_timeshift;
@@ -281,7 +281,7 @@ void hpfs_write_inode(struct inode *);
281void hpfs_write_inode_nolock(struct inode *); 281void hpfs_write_inode_nolock(struct inode *);
282int hpfs_setattr(struct dentry *, struct iattr *); 282int hpfs_setattr(struct dentry *, struct iattr *);
283void hpfs_write_if_changed(struct inode *); 283void hpfs_write_if_changed(struct inode *);
284void hpfs_delete_inode(struct inode *); 284void hpfs_evict_inode(struct inode *);
285 285
286/* map.c */ 286/* map.c */
287 287
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1042a9bc97f..56f0da1cfd1 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -277,9 +277,15 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
277 if (error) 277 if (error)
278 goto out_unlock; 278 goto out_unlock;
279 279
280 error = inode_setattr(inode, attr); 280 if ((attr->ia_valid & ATTR_SIZE) &&
281 if (error) 281 attr->ia_size != i_size_read(inode)) {
282 goto out_unlock; 282 error = vmtruncate(inode, attr->ia_size);
283 if (error)
284 goto out_unlock;
285 }
286
287 setattr_copy(inode, attr);
288 mark_inode_dirty(inode);
283 289
284 hpfs_write_inode(inode); 290 hpfs_write_inode(inode);
285 291
@@ -296,11 +302,13 @@ void hpfs_write_if_changed(struct inode *inode)
296 hpfs_write_inode(inode); 302 hpfs_write_inode(inode);
297} 303}
298 304
299void hpfs_delete_inode(struct inode *inode) 305void hpfs_evict_inode(struct inode *inode)
300{ 306{
301 truncate_inode_pages(&inode->i_data, 0); 307 truncate_inode_pages(&inode->i_data, 0);
302 lock_kernel(); 308 end_writeback(inode);
303 hpfs_remove_fnode(inode->i_sb, inode->i_ino); 309 if (!inode->i_nlink) {
304 unlock_kernel(); 310 lock_kernel();
305 clear_inode(inode); 311 hpfs_remove_fnode(inode->i_sb, inode->i_ino);
312 unlock_kernel();
313 }
306} 314}
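
This is the shape every ->delete_inode/->clear_inode pair collapses into during this series: a single ->evict_inode that runs for every inode being torn down, with on-disk deletion guarded by i_nlink. Sketched with hypothetical names:

	static void myfs_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);	/* drop the pagecache */
		end_writeback(inode);		/* wait out writeback, mark I_CLEAR */
		if (!inode->i_nlink)
			myfs_free_on_disk(inode);	/* hypothetical: unlink path only */
	}
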
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index aa53842c599..6c5f01597c3 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -450,7 +450,7 @@ static const struct super_operations hpfs_sops =
450{ 450{
451 .alloc_inode = hpfs_alloc_inode, 451 .alloc_inode = hpfs_alloc_inode,
452 .destroy_inode = hpfs_destroy_inode, 452 .destroy_inode = hpfs_destroy_inode,
453 .delete_inode = hpfs_delete_inode, 453 .evict_inode = hpfs_evict_inode,
454 .put_super = hpfs_put_super, 454 .put_super = hpfs_put_super,
455 .statfs = hpfs_statfs, 455 .statfs = hpfs_statfs,
456 .remount_fs = hpfs_remount_fs, 456 .remount_fs = hpfs_remount_fs,
@@ -477,17 +477,21 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
477 477
478 int o; 478 int o;
479 479
480 lock_kernel();
481
480 save_mount_options(s, options); 482 save_mount_options(s, options);
481 483
482 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 484 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
483 if (!sbi) 485 if (!sbi) {
486 unlock_kernel();
484 return -ENOMEM; 487 return -ENOMEM;
488 }
485 s->s_fs_info = sbi; 489 s->s_fs_info = sbi;
486 490
487 sbi->sb_bmp_dir = NULL; 491 sbi->sb_bmp_dir = NULL;
488 sbi->sb_cp_table = NULL; 492 sbi->sb_cp_table = NULL;
489 493
490 init_MUTEX(&sbi->hpfs_creation_de); 494 mutex_init(&sbi->hpfs_creation_de);
491 495
492 uid = current_uid(); 496 uid = current_uid();
493 gid = current_gid(); 497 gid = current_gid();
@@ -666,6 +670,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
666 root->i_blocks = 5; 670 root->i_blocks = 5;
667 hpfs_brelse4(&qbh); 671 hpfs_brelse4(&qbh);
668 } 672 }
673 unlock_kernel();
669 return 0; 674 return 0;
670 675
671bail4: brelse(bh2); 676bail4: brelse(bh2);
@@ -677,20 +682,20 @@ bail0:
677 kfree(sbi->sb_cp_table); 682 kfree(sbi->sb_cp_table);
678 s->s_fs_info = NULL; 683 s->s_fs_info = NULL;
679 kfree(sbi); 684 kfree(sbi);
685 unlock_kernel();
680 return -EINVAL; 686 return -EINVAL;
681} 687}
682 688
683static int hpfs_get_sb(struct file_system_type *fs_type, 689static struct dentry *hpfs_mount(struct file_system_type *fs_type,
684 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 690 int flags, const char *dev_name, void *data)
685{ 691{
686 return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super, 692 return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
687 mnt);
688} 693}
689 694
690static struct file_system_type hpfs_fs_type = { 695static struct file_system_type hpfs_fs_type = {
691 .owner = THIS_MODULE, 696 .owner = THIS_MODULE,
692 .name = "hpfs", 697 .name = "hpfs",
693 .get_sb = hpfs_get_sb, 698 .mount = hpfs_mount,
694 .kill_sb = kill_block_super, 699 .kill_sb = kill_block_super,
695 .fs_flags = FS_REQUIRES_DEV, 700 .fs_flags = FS_REQUIRES_DEV,
696}; 701};
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 826c3f9d29a..f702b5f713f 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/statfs.h> 16#include <linux/statfs.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/pid_namespace.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include "os.h" 20#include "os.h"
20 21
@@ -597,6 +598,7 @@ static const struct file_operations hppfs_dir_fops = {
597 .readdir = hppfs_readdir, 598 .readdir = hppfs_readdir,
598 .open = hppfs_dir_open, 599 .open = hppfs_dir_open,
599 .fsync = hppfs_fsync, 600 .fsync = hppfs_fsync,
601 .llseek = default_llseek,
600}; 602};
601 603
602static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) 604static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
@@ -623,12 +625,11 @@ static struct inode *hppfs_alloc_inode(struct super_block *sb)
623 return &hi->vfs_inode; 625 return &hi->vfs_inode;
624} 626}
625 627
626void hppfs_delete_inode(struct inode *ino) 628void hppfs_evict_inode(struct inode *ino)
627{ 629{
630 end_writeback(ino);
628 dput(HPPFS_I(ino)->proc_dentry); 631 dput(HPPFS_I(ino)->proc_dentry);
629 mntput(ino->i_sb->s_fs_info); 632 mntput(ino->i_sb->s_fs_info);
630
631 clear_inode(ino);
632} 633}
633 634
634static void hppfs_destroy_inode(struct inode *inode) 635static void hppfs_destroy_inode(struct inode *inode)
@@ -639,7 +640,7 @@ static void hppfs_destroy_inode(struct inode *inode)
639static const struct super_operations hppfs_sbops = { 640static const struct super_operations hppfs_sbops = {
640 .alloc_inode = hppfs_alloc_inode, 641 .alloc_inode = hppfs_alloc_inode,
641 .destroy_inode = hppfs_destroy_inode, 642 .destroy_inode = hppfs_destroy_inode,
642 .delete_inode = hppfs_delete_inode, 643 .evict_inode = hppfs_evict_inode,
643 .statfs = hppfs_statfs, 644 .statfs = hppfs_statfs,
644}; 645};
645 646
@@ -747,17 +748,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
747 return(err); 748 return(err);
748} 749}
749 750
750static int hppfs_read_super(struct file_system_type *type, 751static struct dentry *hppfs_read_super(struct file_system_type *type,
751 int flags, const char *dev_name, 752 int flags, const char *dev_name,
752 void *data, struct vfsmount *mnt) 753 void *data)
753{ 754{
754 return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); 755 return mount_nodev(type, flags, data, hppfs_fill_super);
755} 756}
756 757
757static struct file_system_type hppfs_type = { 758static struct file_system_type hppfs_type = {
758 .owner = THIS_MODULE, 759 .owner = THIS_MODULE,
759 .name = "hppfs", 760 .name = "hppfs",
760 .get_sb = hppfs_read_super, 761 .mount = hppfs_read_super,
761 .kill_sb = kill_anon_super, 762 .kill_sb = kill_anon_super,
762 .fs_flags = 0, 763 .fs_flags = 0,
763}; 764};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a4e9a7ec369..d6cfac1f0a4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/statfs.h> 31#include <linux/statfs.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/magic.h> 33#include <linux/magic.h>
34#include <linux/migrate.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
@@ -371,27 +372,10 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
371 hugetlb_unreserve_pages(inode, start, freed); 372 hugetlb_unreserve_pages(inode, start, freed);
372} 373}
373 374
374static void hugetlbfs_delete_inode(struct inode *inode) 375static void hugetlbfs_evict_inode(struct inode *inode)
375{ 376{
376 truncate_hugepages(inode, 0); 377 truncate_hugepages(inode, 0);
377 clear_inode(inode); 378 end_writeback(inode);
378}
379
380static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
381{
382 if (generic_detach_inode(inode)) {
383 truncate_hugepages(inode, 0);
384 clear_inode(inode);
385 destroy_inode(inode);
386 }
387}
388
389static void hugetlbfs_drop_inode(struct inode *inode)
390{
391 if (!inode->i_nlink)
392 generic_delete_inode(inode);
393 else
394 hugetlbfs_forget_inode(inode);
395} 379}
396 380
397static inline void 381static inline void
@@ -448,19 +432,20 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
448 432
449 error = inode_change_ok(inode, attr); 433 error = inode_change_ok(inode, attr);
450 if (error) 434 if (error)
451 goto out; 435 return error;
452 436
453 if (ia_valid & ATTR_SIZE) { 437 if (ia_valid & ATTR_SIZE) {
454 error = -EINVAL; 438 error = -EINVAL;
455 if (!(attr->ia_size & ~huge_page_mask(h))) 439 if (attr->ia_size & ~huge_page_mask(h))
456 error = hugetlb_vmtruncate(inode, attr->ia_size); 440 return -EINVAL;
441 error = hugetlb_vmtruncate(inode, attr->ia_size);
457 if (error) 442 if (error)
458 goto out; 443 return error;
459 attr->ia_valid &= ~ATTR_SIZE;
460 } 444 }
461 error = inode_setattr(inode, attr); 445
462out: 446 setattr_copy(inode, attr);
463 return error; 447 mark_inode_dirty(inode);
448 return 0;
464} 449}
465 450
466static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 451static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
@@ -471,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
471 inode = new_inode(sb); 456 inode = new_inode(sb);
472 if (inode) { 457 if (inode) {
473 struct hugetlbfs_inode_info *info; 458 struct hugetlbfs_inode_info *info;
459 inode->i_ino = get_next_ino();
474 inode->i_mode = mode; 460 inode->i_mode = mode;
475 inode->i_uid = uid; 461 inode->i_uid = uid;
476 inode->i_gid = gid; 462 inode->i_gid = gid;
@@ -589,6 +575,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
589 return 0; 575 return 0;
590} 576}
591 577
578static int hugetlbfs_migrate_page(struct address_space *mapping,
579 struct page *newpage, struct page *page)
580{
581 int rc;
582
583 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
584 if (rc)
585 return rc;
586 migrate_page_copy(newpage, page);
587
588 return 0;
589}
590
592static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 591static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
593{ 592{
594 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 593 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -675,6 +674,7 @@ static const struct address_space_operations hugetlbfs_aops = {
675 .write_begin = hugetlbfs_write_begin, 674 .write_begin = hugetlbfs_write_begin,
676 .write_end = hugetlbfs_write_end, 675 .write_end = hugetlbfs_write_end,
677 .set_page_dirty = hugetlbfs_set_page_dirty, 676 .set_page_dirty = hugetlbfs_set_page_dirty,
677 .migratepage = hugetlbfs_migrate_page,
678}; 678};
679 679
680 680
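
hugetlbfs_migrate_page() above mirrors the generic migrate_page() in mm/migrate.c, just with the huge-page-aware mapping move. For comparison, the generic non-huge form of the callback looks roughly like this (a sketch against this era's API):

	static int myfs_migrate_page(struct address_space *mapping,
				     struct page *newpage, struct page *page)
	{
		int rc;

		/* repoint the pagecache slot at the new page */
		rc = migrate_page_move_mapping(mapping, newpage, page);
		if (rc)
			return rc;
		migrate_page_copy(newpage, page);	/* copy data and page state */
		return 0;
	}
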
@@ -690,6 +690,7 @@ const struct file_operations hugetlbfs_file_operations = {
690 .mmap = hugetlbfs_file_mmap, 690 .mmap = hugetlbfs_file_mmap,
691 .fsync = noop_fsync, 691 .fsync = noop_fsync,
692 .get_unmapped_area = hugetlb_get_unmapped_area, 692 .get_unmapped_area = hugetlb_get_unmapped_area,
693 .llseek = default_llseek,
693}; 694};
694 695
695static const struct inode_operations hugetlbfs_dir_inode_operations = { 696static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -712,9 +713,8 @@ static const struct inode_operations hugetlbfs_inode_operations = {
712static const struct super_operations hugetlbfs_ops = { 713static const struct super_operations hugetlbfs_ops = {
713 .alloc_inode = hugetlbfs_alloc_inode, 714 .alloc_inode = hugetlbfs_alloc_inode,
714 .destroy_inode = hugetlbfs_destroy_inode, 715 .destroy_inode = hugetlbfs_destroy_inode,
716 .evict_inode = hugetlbfs_evict_inode,
715 .statfs = hugetlbfs_statfs, 717 .statfs = hugetlbfs_statfs,
716 .delete_inode = hugetlbfs_delete_inode,
717 .drop_inode = hugetlbfs_drop_inode,
718 .put_super = hugetlbfs_put_super, 718 .put_super = hugetlbfs_put_super,
719 .show_options = generic_show_options, 719 .show_options = generic_show_options,
720}; 720};
@@ -896,15 +896,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta)
896 } 896 }
897} 897}
898 898
899static int hugetlbfs_get_sb(struct file_system_type *fs_type, 899static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
900 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 900 int flags, const char *dev_name, void *data)
901{ 901{
902 return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); 902 return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
903} 903}
904 904
905static struct file_system_type hugetlbfs_fs_type = { 905static struct file_system_type hugetlbfs_fs_type = {
906 .name = "hugetlbfs", 906 .name = "hugetlbfs",
907 .get_sb = hugetlbfs_get_sb, 907 .mount = hugetlbfs_mount,
908 .kill_sb = kill_litter_super, 908 .kill_sb = kill_litter_super,
909}; 909};
910 910
diff --git a/fs/inode.c b/fs/inode.c
index 722860b323a..ae2727ab0c3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,16 +20,15 @@
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/cdev.h> 21#include <linux/cdev.h>
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/inotify.h>
24#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
25#include <linux/mount.h> 24#include <linux/mount.h>
26#include <linux/async.h> 25#include <linux/async.h>
27#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/ima.h>
28 28
29/* 29/*
30 * This is needed for the following functions: 30 * This is needed for the following functions:
31 * - inode_has_buffers 31 * - inode_has_buffers
32 * - invalidate_inode_buffers
33 * - invalidate_bdev 32 * - invalidate_bdev
34 * 33 *
35 * FIXME: remove all knowledge of the buffer layer from this file 34 * FIXME: remove all knowledge of the buffer layer from this file
@@ -73,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
73 * allowing for low-overhead inode sync() operations. 72 * allowing for low-overhead inode sync() operations.
74 */ 73 */
75 74
76LIST_HEAD(inode_in_use); 75static LIST_HEAD(inode_lru);
77LIST_HEAD(inode_unused);
78static struct hlist_head *inode_hashtable __read_mostly; 76static struct hlist_head *inode_hashtable __read_mostly;
79 77
80/* 78/*
@@ -104,8 +102,41 @@ static DECLARE_RWSEM(iprune_sem);
104 */ 102 */
105struct inodes_stat_t inodes_stat; 103struct inodes_stat_t inodes_stat;
106 104
105static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
106static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
107
107static struct kmem_cache *inode_cachep __read_mostly; 108static struct kmem_cache *inode_cachep __read_mostly;
108 109
110static inline int get_nr_inodes(void)
111{
112 return percpu_counter_sum_positive(&nr_inodes);
113}
114
115static inline int get_nr_inodes_unused(void)
116{
117 return percpu_counter_sum_positive(&nr_inodes_unused);
118}
119
120int get_nr_dirty_inodes(void)
121{
122 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
123 return nr_dirty > 0 ? nr_dirty : 0;
124
125}
126
127/*
128 * Handle nr_inode sysctl
129 */
130#ifdef CONFIG_SYSCTL
131int proc_nr_inodes(ctl_table *table, int write,
132 void __user *buffer, size_t *lenp, loff_t *ppos)
133{
134 inodes_stat.nr_inodes = get_nr_inodes();
135 inodes_stat.nr_unused = get_nr_inodes_unused();
136 return proc_dointvec(table, write, buffer, lenp, ppos);
137}
138#endif
139
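
nr_inodes and nr_inodes_unused trade the old global inodes_stat fields for per-CPU counters: increments stay CPU-local, and an exact sum is folded only when somebody reads the statistic. The pattern in isolation (a sketch; "widgets" is invented):

	#include <linux/percpu_counter.h>

	static struct percpu_counter nr_widgets;

	static int __init widgets_init(void)
	{
		return percpu_counter_init(&nr_widgets, 0);
	}

	static void widget_created(void)   { percpu_counter_inc(&nr_widgets); }
	static void widget_destroyed(void) { percpu_counter_dec(&nr_widgets); }

	static int widgets_in_use(void)
	{
		/* folds every CPU's delta; clamps transient negatives to 0 */
		return percpu_counter_sum_positive(&nr_widgets);
	}
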
109static void wake_up_inode(struct inode *inode) 140static void wake_up_inode(struct inode *inode)
110{ 141{
111 /* 142 /*
@@ -193,6 +224,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
193 inode->i_fsnotify_mask = 0; 224 inode->i_fsnotify_mask = 0;
194#endif 225#endif
195 226
227 percpu_counter_inc(&nr_inodes);
228
196 return 0; 229 return 0;
197out: 230out:
198 return -ENOMEM; 231 return -ENOMEM;
@@ -233,11 +266,13 @@ void __destroy_inode(struct inode *inode)
233 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 266 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
234 posix_acl_release(inode->i_default_acl); 267 posix_acl_release(inode->i_default_acl);
235#endif 268#endif
269 percpu_counter_dec(&nr_inodes);
236} 270}
237EXPORT_SYMBOL(__destroy_inode); 271EXPORT_SYMBOL(__destroy_inode);
238 272
239void destroy_inode(struct inode *inode) 273static void destroy_inode(struct inode *inode)
240{ 274{
275 BUG_ON(!list_empty(&inode->i_lru));
241 __destroy_inode(inode); 276 __destroy_inode(inode);
242 if (inode->i_sb->s_op->destroy_inode) 277 if (inode->i_sb->s_op->destroy_inode)
243 inode->i_sb->s_op->destroy_inode(inode); 278 inode->i_sb->s_op->destroy_inode(inode);
@@ -256,6 +291,8 @@ void inode_init_once(struct inode *inode)
256 INIT_HLIST_NODE(&inode->i_hash); 291 INIT_HLIST_NODE(&inode->i_hash);
257 INIT_LIST_HEAD(&inode->i_dentry); 292 INIT_LIST_HEAD(&inode->i_dentry);
258 INIT_LIST_HEAD(&inode->i_devices); 293 INIT_LIST_HEAD(&inode->i_devices);
294 INIT_LIST_HEAD(&inode->i_wb_list);
295 INIT_LIST_HEAD(&inode->i_lru);
259 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 296 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
260 spin_lock_init(&inode->i_data.tree_lock); 297 spin_lock_init(&inode->i_data.tree_lock);
261 spin_lock_init(&inode->i_data.i_mmap_lock); 298 spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -264,12 +301,8 @@ void inode_init_once(struct inode *inode)
264 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); 301 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
265 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); 302 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
266 i_size_ordered_init(inode); 303 i_size_ordered_init(inode);
267#ifdef CONFIG_INOTIFY
268 INIT_LIST_HEAD(&inode->inotify_watches);
269 mutex_init(&inode->inotify_mutex);
270#endif
271#ifdef CONFIG_FSNOTIFY 304#ifdef CONFIG_FSNOTIFY
272 INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries); 305 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
273#endif 306#endif
274} 307}
275EXPORT_SYMBOL(inode_init_once); 308EXPORT_SYMBOL(inode_init_once);
@@ -286,40 +319,137 @@ static void init_once(void *foo)
286 */ 319 */
287void __iget(struct inode *inode) 320void __iget(struct inode *inode)
288{ 321{
289 if (atomic_inc_return(&inode->i_count) != 1) 322 atomic_inc(&inode->i_count);
290 return; 323}
324
325/*
326 * get additional reference to inode; caller must already hold one.
327 */
328void ihold(struct inode *inode)
329{
330 WARN_ON(atomic_inc_return(&inode->i_count) < 2);
331}
332EXPORT_SYMBOL(ihold);
333
334static void inode_lru_list_add(struct inode *inode)
335{
336 if (list_empty(&inode->i_lru)) {
337 list_add(&inode->i_lru, &inode_lru);
338 percpu_counter_inc(&nr_inodes_unused);
339 }
340}
341
342static void inode_lru_list_del(struct inode *inode)
343{
344 if (!list_empty(&inode->i_lru)) {
345 list_del_init(&inode->i_lru);
346 percpu_counter_dec(&nr_inodes_unused);
347 }
348}
349
350static inline void __inode_sb_list_add(struct inode *inode)
351{
352 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
353}
291 354
292 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 355/**
293 list_move(&inode->i_list, &inode_in_use); 356 * inode_sb_list_add - add inode to the superblock list of inodes
294 inodes_stat.nr_unused--; 357 * @inode: inode to add
358 */
359void inode_sb_list_add(struct inode *inode)
360{
361 spin_lock(&inode_lock);
362 __inode_sb_list_add(inode);
363 spin_unlock(&inode_lock);
364}
365EXPORT_SYMBOL_GPL(inode_sb_list_add);
366
367static inline void __inode_sb_list_del(struct inode *inode)
368{
369 list_del_init(&inode->i_sb_list);
370}
371
372static unsigned long hash(struct super_block *sb, unsigned long hashval)
373{
374 unsigned long tmp;
375
376 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
377 L1_CACHE_BYTES;
378 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
379 return tmp & I_HASHMASK;
295} 380}
296 381
297/** 382/**
298 * clear_inode - clear an inode 383 * __insert_inode_hash - hash an inode
299 * @inode: inode to clear 384 * @inode: unhashed inode
385 * @hashval: unsigned long value used to locate this object in the
386 * inode_hashtable.
300 * 387 *
301 * This is called by the filesystem to tell us 388 * Add an inode to the inode hash for this superblock.
302 * that the inode is no longer useful. We just
303 * terminate it with extreme prejudice.
304 */ 389 */
305void clear_inode(struct inode *inode) 390void __insert_inode_hash(struct inode *inode, unsigned long hashval)
306{ 391{
307 might_sleep(); 392 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
308 invalidate_inode_buffers(inode); 393
394 spin_lock(&inode_lock);
395 hlist_add_head(&inode->i_hash, b);
396 spin_unlock(&inode_lock);
397}
398EXPORT_SYMBOL(__insert_inode_hash);
309 399
400/**
401 * __remove_inode_hash - remove an inode from the hash
402 * @inode: inode to unhash
403 *
404 * Remove an inode from the superblock.
405 */
406static void __remove_inode_hash(struct inode *inode)
407{
408 hlist_del_init(&inode->i_hash);
409}
410
411/**
412 * remove_inode_hash - remove an inode from the hash
413 * @inode: inode to unhash
414 *
415 * Remove an inode from the superblock.
416 */
417void remove_inode_hash(struct inode *inode)
418{
419 spin_lock(&inode_lock);
420 hlist_del_init(&inode->i_hash);
421 spin_unlock(&inode_lock);
422}
423EXPORT_SYMBOL(remove_inode_hash);
424
425void end_writeback(struct inode *inode)
426{
427 might_sleep();
310 BUG_ON(inode->i_data.nrpages); 428 BUG_ON(inode->i_data.nrpages);
429 BUG_ON(!list_empty(&inode->i_data.private_list));
311 BUG_ON(!(inode->i_state & I_FREEING)); 430 BUG_ON(!(inode->i_state & I_FREEING));
312 BUG_ON(inode->i_state & I_CLEAR); 431 BUG_ON(inode->i_state & I_CLEAR);
313 inode_sync_wait(inode); 432 inode_sync_wait(inode);
314 if (inode->i_sb->s_op->clear_inode) 433 inode->i_state = I_FREEING | I_CLEAR;
315 inode->i_sb->s_op->clear_inode(inode); 434}
435EXPORT_SYMBOL(end_writeback);
436
437static void evict(struct inode *inode)
438{
439 const struct super_operations *op = inode->i_sb->s_op;
440
441 if (op->evict_inode) {
442 op->evict_inode(inode);
443 } else {
444 if (inode->i_data.nrpages)
445 truncate_inode_pages(&inode->i_data, 0);
446 end_writeback(inode);
447 }
316 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 448 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
317 bd_forget(inode); 449 bd_forget(inode);
318 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 450 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
319 cd_forget(inode); 451 cd_forget(inode);
320 inode->i_state = I_CLEAR;
321} 452}
322EXPORT_SYMBOL(clear_inode);
323 453
324/* 454/*
325 * dispose_list - dispose of the contents of a local list 455 * dispose_list - dispose of the contents of a local list
@@ -330,104 +460,113 @@ EXPORT_SYMBOL(clear_inode);
330 */ 460 */
331static void dispose_list(struct list_head *head) 461static void dispose_list(struct list_head *head)
332{ 462{
333 int nr_disposed = 0;
334
335 while (!list_empty(head)) { 463 while (!list_empty(head)) {
336 struct inode *inode; 464 struct inode *inode;
337 465
338 inode = list_first_entry(head, struct inode, i_list); 466 inode = list_first_entry(head, struct inode, i_lru);
339 list_del(&inode->i_list); 467 list_del_init(&inode->i_lru);
340 468
341 if (inode->i_data.nrpages) 469 evict(inode);
342 truncate_inode_pages(&inode->i_data, 0);
343 clear_inode(inode);
344 470
345 spin_lock(&inode_lock); 471 spin_lock(&inode_lock);
346 hlist_del_init(&inode->i_hash); 472 __remove_inode_hash(inode);
347 list_del_init(&inode->i_sb_list); 473 __inode_sb_list_del(inode);
348 spin_unlock(&inode_lock); 474 spin_unlock(&inode_lock);
349 475
350 wake_up_inode(inode); 476 wake_up_inode(inode);
351 destroy_inode(inode); 477 destroy_inode(inode);
352 nr_disposed++;
353 } 478 }
354 spin_lock(&inode_lock);
355 inodes_stat.nr_inodes -= nr_disposed;
356 spin_unlock(&inode_lock);
357} 479}
358 480
359/* 481/**
360 * Invalidate all inodes for a device. 482 * evict_inodes - evict all evictable inodes for a superblock
483 * @sb: superblock to operate on
484 *
485 * Make sure that no inodes with zero refcount are retained. This is
486 * called by superblock shutdown after having MS_ACTIVE flag removed,
487 * so any inode reaching zero refcount during or after that call will
488 * be immediately evicted.
361 */ 489 */
362static int invalidate_list(struct list_head *head, struct list_head *dispose) 490void evict_inodes(struct super_block *sb)
363{ 491{
364 struct list_head *next; 492 struct inode *inode, *next;
365 int busy = 0, count = 0; 493 LIST_HEAD(dispose);
366 494
367 next = head->next; 495 down_write(&iprune_sem);
368 for (;;) {
369 struct list_head *tmp = next;
370 struct inode *inode;
371
372 /*
373 * We can reschedule here without worrying about the list's
374 * consistency because the per-sb list of inodes must not
375 * change during umount anymore, and because iprune_sem keeps
376 * shrink_icache_memory() away.
377 */
378 cond_resched_lock(&inode_lock);
379 496
380 next = next->next; 497 spin_lock(&inode_lock);
381 if (tmp == head) 498 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
382 break; 499 if (atomic_read(&inode->i_count))
383 inode = list_entry(tmp, struct inode, i_sb_list);
384 if (inode->i_state & I_NEW)
385 continue; 500 continue;
386 invalidate_inode_buffers(inode); 501
387 if (!atomic_read(&inode->i_count)) { 502 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
388 list_move(&inode->i_list, dispose); 503 WARN_ON(1);
389 WARN_ON(inode->i_state & I_NEW);
390 inode->i_state |= I_FREEING;
391 count++;
392 continue; 504 continue;
393 } 505 }
394 busy = 1; 506
507 inode->i_state |= I_FREEING;
508
509 /*
510 * Move the inode off the IO lists and LRU once I_FREEING is
511 * set so that it won't get moved back on there if it is dirty.
512 */
513 list_move(&inode->i_lru, &dispose);
514 list_del_init(&inode->i_wb_list);
515 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
516 percpu_counter_dec(&nr_inodes_unused);
395 } 517 }
396 /* only unused inodes may be cached with i_count zero */ 518 spin_unlock(&inode_lock);
397 inodes_stat.nr_unused -= count; 519
398 return busy; 520 dispose_list(&dispose);
521 up_write(&iprune_sem);
399} 522}
400 523
401/** 524/**
402 * invalidate_inodes - discard the inodes on a device 525 * invalidate_inodes - attempt to free all inodes on a superblock
403 * @sb: superblock 526 * @sb: superblock to operate on
404 * 527 *
405 * Discard all of the inodes for a given superblock. If the discard 528 * Attempts to free all inodes for a given superblock. If there were any
406 * fails because there are busy inodes then a non zero value is returned. 529 * busy inodes return a non-zero value, else zero.
407 * If the discard is successful all the inodes have been discarded.
408 */ 530 */
409int invalidate_inodes(struct super_block *sb) 531int invalidate_inodes(struct super_block *sb)
410{ 532{
411 int busy; 533 int busy = 0;
412 LIST_HEAD(throw_away); 534 struct inode *inode, *next;
535 LIST_HEAD(dispose);
413 536
414 down_write(&iprune_sem); 537 down_write(&iprune_sem);
538
415 spin_lock(&inode_lock); 539 spin_lock(&inode_lock);
416 inotify_unmount_inodes(&sb->s_inodes); 540 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
417 fsnotify_unmount_inodes(&sb->s_inodes); 541 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
418 busy = invalidate_list(&sb->s_inodes, &throw_away); 542 continue;
543 if (atomic_read(&inode->i_count)) {
544 busy = 1;
545 continue;
546 }
547
548 inode->i_state |= I_FREEING;
549
550 /*
551 * Move the inode off the IO lists and LRU once I_FREEING is
552 * set so that it won't get moved back on there if it is dirty.
553 */
554 list_move(&inode->i_lru, &dispose);
555 list_del_init(&inode->i_wb_list);
556 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
557 percpu_counter_dec(&nr_inodes_unused);
558 }
419 spin_unlock(&inode_lock); 559 spin_unlock(&inode_lock);
420 560
421 dispose_list(&throw_away); 561 dispose_list(&dispose);
422 up_write(&iprune_sem); 562 up_write(&iprune_sem);
423 563
424 return busy; 564 return busy;
425} 565}
426EXPORT_SYMBOL(invalidate_inodes);
427 566
428static int can_unuse(struct inode *inode) 567static int can_unuse(struct inode *inode)
429{ 568{
430 if (inode->i_state) 569 if (inode->i_state & ~I_REFERENCED)
431 return 0; 570 return 0;
432 if (inode_has_buffers(inode)) 571 if (inode_has_buffers(inode))
433 return 0; 572 return 0;
@@ -439,22 +578,24 @@ static int can_unuse(struct inode *inode)
439} 578}
440 579
441/* 580/*
442 * Scan `goal' inodes on the unused list for freeable ones. They are moved to 581 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
443 * a temporary list and then are freed outside inode_lock by dispose_list(). 582 * temporary list and then are freed outside inode_lock by dispose_list().
444 * 583 *
445 * Any inodes which are pinned purely because of attached pagecache have their 584 * Any inodes which are pinned purely because of attached pagecache have their
446 * pagecache removed. We expect the final iput() on that inode to add it to 585 * pagecache removed. If the inode has metadata buffers attached to
447 * the front of the inode_unused list. So look for it there and if the 586 * mapping->private_list then try to remove them.
448 * inode is still freeable, proceed. The right inode is found 99.9% of the
449 * time in testing on a 4-way.
450 * 587 *
451 * If the inode has metadata buffers attached to mapping->private_list then 588 * If the inode has the I_REFERENCED flag set, then it means that it has been
452 * try to remove them. 589 * used recently - the flag is set in iput_final(). When we encounter such an
590 * inode, clear the flag and move it to the back of the LRU so it gets another
591 * pass through the LRU before it gets reclaimed. This is necessary because of
592 * the fact we are doing lazy LRU updates to minimise lock contention so the
593 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
594 * with this flag set because they are the inodes that are out of order.
453 */ 595 */
454static void prune_icache(int nr_to_scan) 596static void prune_icache(int nr_to_scan)
455{ 597{
456 LIST_HEAD(freeable); 598 LIST_HEAD(freeable);
457 int nr_pruned = 0;
458 int nr_scanned; 599 int nr_scanned;
459 unsigned long reap = 0; 600 unsigned long reap = 0;
460 601
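
The I_REFERENCED scheme described above is a second-chance (clock) policy: because LRU updates are lazy, a flagged inode is recycled around the list once with its flag cleared rather than reclaimed on first sight. A standalone illustration of that scan, outside any kernel types:

	struct entry {
		struct entry *next;
		int referenced;		/* set when the entry is used */
	};

	/* Pop candidates off the head; referenced ones get their flag
	 * cleared and one more trip around the list. Returns an entry
	 * cold enough to reclaim, or NULL if the list is empty. */
	static struct entry *reclaim_one(struct entry **head, struct entry **tail)
	{
		struct entry *e;

		while ((e = *head) != NULL) {
			*head = e->next;
			e->next = NULL;
			if (*head == NULL)
				*tail = NULL;
			if (!e->referenced)
				return e;	/* cold: reclaim it */
			e->referenced = 0;	/* second chance */
			if (*tail)
				(*tail)->next = e;
			else
				*head = e;
			*tail = e;
		}
		return NULL;
	}
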
@@ -463,13 +604,26 @@ static void prune_icache(int nr_to_scan)
463 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 604 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
464 struct inode *inode; 605 struct inode *inode;
465 606
466 if (list_empty(&inode_unused)) 607 if (list_empty(&inode_lru))
467 break; 608 break;
468 609
469 inode = list_entry(inode_unused.prev, struct inode, i_list); 610 inode = list_entry(inode_lru.prev, struct inode, i_lru);
470 611
471 if (inode->i_state || atomic_read(&inode->i_count)) { 612 /*
472 list_move(&inode->i_list, &inode_unused); 613 * Referenced or dirty inodes are still in use. Give them
614 * another pass through the LRU as we cannot reclaim them now.
615 */
616 if (atomic_read(&inode->i_count) ||
617 (inode->i_state & ~I_REFERENCED)) {
618 list_del_init(&inode->i_lru);
619 percpu_counter_dec(&nr_inodes_unused);
620 continue;
621 }
622
623 /* recently referenced inodes get one more pass */
624 if (inode->i_state & I_REFERENCED) {
625 list_move(&inode->i_lru, &inode_lru);
626 inode->i_state &= ~I_REFERENCED;
473 continue; 627 continue;
474 } 628 }
475 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 629 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -481,18 +635,23 @@ static void prune_icache(int nr_to_scan)
481 iput(inode); 635 iput(inode);
482 spin_lock(&inode_lock); 636 spin_lock(&inode_lock);
483 637
484 if (inode != list_entry(inode_unused.next, 638 if (inode != list_entry(inode_lru.next,
485 struct inode, i_list)) 639 struct inode, i_lru))
486 continue; /* wrong inode or list_empty */ 640 continue; /* wrong inode or list_empty */
487 if (!can_unuse(inode)) 641 if (!can_unuse(inode))
488 continue; 642 continue;
489 } 643 }
490 list_move(&inode->i_list, &freeable);
491 WARN_ON(inode->i_state & I_NEW); 644 WARN_ON(inode->i_state & I_NEW);
492 inode->i_state |= I_FREEING; 645 inode->i_state |= I_FREEING;
493 nr_pruned++; 646
647 /*
648 * Move the inode off the IO lists and LRU once I_FREEING is
649 * set so that it won't get moved back on there if it is dirty.
650 */
651 list_move(&inode->i_lru, &freeable);
652 list_del_init(&inode->i_wb_list);
653 percpu_counter_dec(&nr_inodes_unused);
494 } 654 }
495 inodes_stat.nr_unused -= nr_pruned;
496 if (current_is_kswapd()) 655 if (current_is_kswapd())
497 __count_vm_events(KSWAPD_INODESTEAL, reap); 656 __count_vm_events(KSWAPD_INODESTEAL, reap);
498 else 657 else
@@ -524,7 +683,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
524 return -1; 683 return -1;
525 prune_icache(nr); 684 prune_icache(nr);
526 } 685 }
527 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 686 return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
528} 687}
529 688
530static struct shrinker icache_shrinker = { 689static struct shrinker icache_shrinker = {
@@ -535,9 +694,6 @@ static struct shrinker icache_shrinker = {
535static void __wait_on_freeing_inode(struct inode *inode); 694static void __wait_on_freeing_inode(struct inode *inode);
536/* 695/*
537 * Called with the inode lock held. 696 * Called with the inode lock held.
538 * NOTE: we are not increasing the inode-refcount, you must call __iget()
539 * by hand after calling find_inode now! This simplifies iunique and won't
540 * add any additional branch in the common code.
541 */ 697 */
542static struct inode *find_inode(struct super_block *sb, 698static struct inode *find_inode(struct super_block *sb,
543 struct hlist_head *head, 699 struct hlist_head *head,
@@ -553,13 +709,14 @@ repeat:
553 continue; 709 continue;
554 if (!test(inode, data)) 710 if (!test(inode, data))
555 continue; 711 continue;
556 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { 712 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
557 __wait_on_freeing_inode(inode); 713 __wait_on_freeing_inode(inode);
558 goto repeat; 714 goto repeat;
559 } 715 }
560 break; 716 __iget(inode);
717 return inode;
561 } 718 }
562 return node ? inode : NULL; 719 return NULL;
563} 720}
564 721
565/* 722/*
@@ -578,57 +735,53 @@ repeat:
578 continue; 735 continue;
579 if (inode->i_sb != sb) 736 if (inode->i_sb != sb)
580 continue; 737 continue;
581 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { 738 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
582 __wait_on_freeing_inode(inode); 739 __wait_on_freeing_inode(inode);
583 goto repeat; 740 goto repeat;
584 } 741 }
585 break; 742 __iget(inode);
743 return inode;
586 } 744 }
587 return node ? inode : NULL; 745 return NULL;
588}
589
590static unsigned long hash(struct super_block *sb, unsigned long hashval)
591{
592 unsigned long tmp;
593
594 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
595 L1_CACHE_BYTES;
596 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
597 return tmp & I_HASHMASK;
598}
599
600static inline void
601__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
602 struct inode *inode)
603{
604 inodes_stat.nr_inodes++;
605 list_add(&inode->i_list, &inode_in_use);
606 list_add(&inode->i_sb_list, &sb->s_inodes);
607 if (head)
608 hlist_add_head(&inode->i_hash, head);
609} 746}
610 747
611/** 748/*
612 * inode_add_to_lists - add a new inode to relevant lists 749 * Each cpu owns a range of LAST_INO_BATCH numbers.
613 * @sb: superblock inode belongs to 750 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
614 * @inode: inode to mark in use 751 * to renew the exhausted range.
752 *
753 * This does not significantly increase overflow rate because every CPU can
754 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
755 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
756 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
757 * overflow rate by 2x, which does not seem too significant.
615 * 758 *
616 * When an inode is allocated it needs to be accounted for, added to the in use 759 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
617 * list, the owning superblock and the inode hash. This needs to be done under 760 * error if st_ino won't fit in target struct field. Use 32bit counter
618 * the inode_lock, so export a function to do this rather than the inode lock 761 * here to attempt to avoid that.
619 * itself. We calculate the hash list to add to here so it is all internal
620 * which requires the caller to have already set up the inode number in the
621 * inode to add.
622 */ 762 */
623void inode_add_to_lists(struct super_block *sb, struct inode *inode) 763#define LAST_INO_BATCH 1024
764static DEFINE_PER_CPU(unsigned int, last_ino);
765
766unsigned int get_next_ino(void)
624{ 767{
625 struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino); 768 unsigned int *p = &get_cpu_var(last_ino);
769 unsigned int res = *p;
626 770
627 spin_lock(&inode_lock); 771#ifdef CONFIG_SMP
628 __inode_add_to_lists(sb, head, inode); 772 if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
629 spin_unlock(&inode_lock); 773 static atomic_t shared_last_ino;
774 int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
775
776 res = next - LAST_INO_BATCH;
777 }
778#endif
779
780 *p = ++res;
781 put_cpu_var(last_ino);
782 return res;
630} 783}
631EXPORT_SYMBOL_GPL(inode_add_to_lists); 784EXPORT_SYMBOL(get_next_ino);
632 785
633/** 786/**
634 * new_inode - obtain an inode 787 * new_inode - obtain an inode
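
Note: get_next_ino() above replaces the single last_ino counter with per-cpu batches, so the shared cacheline is dirtied only once per LAST_INO_BATCH allocations. The same idea in portable C11, using a thread-local in place of the per-cpu variable (names are illustrative):

    #include <stdatomic.h>

    #define BATCH 1024

    static atomic_uint shared_last;             /* analogue of shared_last_ino */
    static _Thread_local unsigned int last;     /* analogue of per-cpu last_ino */

    unsigned int next_id(void)
    {
        unsigned int res = last;

        if ((res & (BATCH - 1)) == 0)           /* local range exhausted */
            res = atomic_fetch_add(&shared_last, BATCH);

        last = ++res;
        return res;
    }

Here atomic_fetch_add() returns the counter's old value, which is exactly the kernel's atomic_add_return(LAST_INO_BATCH, ...) minus LAST_INO_BATCH.
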
@@ -644,12 +797,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
644 */ 797 */
645struct inode *new_inode(struct super_block *sb) 798struct inode *new_inode(struct super_block *sb)
646{ 799{
647 /*
648 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
649 * error if st_ino won't fit in target struct field. Use 32bit counter
650 * here to attempt to avoid that.
651 */
652 static unsigned int last_ino;
653 struct inode *inode; 800 struct inode *inode;
654 801
655 spin_lock_prefetch(&inode_lock); 802 spin_lock_prefetch(&inode_lock);
@@ -657,8 +804,7 @@ struct inode *new_inode(struct super_block *sb)
657 inode = alloc_inode(sb); 804 inode = alloc_inode(sb);
658 if (inode) { 805 if (inode) {
659 spin_lock(&inode_lock); 806 spin_lock(&inode_lock);
660 __inode_add_to_lists(sb, NULL, inode); 807 __inode_sb_list_add(inode);
661 inode->i_ino = ++last_ino;
662 inode->i_state = 0; 808 inode->i_state = 0;
663 spin_unlock(&inode_lock); 809 spin_unlock(&inode_lock);
664 } 810 }
@@ -669,7 +815,7 @@ EXPORT_SYMBOL(new_inode);
669void unlock_new_inode(struct inode *inode) 815void unlock_new_inode(struct inode *inode)
670{ 816{
671#ifdef CONFIG_DEBUG_LOCK_ALLOC 817#ifdef CONFIG_DEBUG_LOCK_ALLOC
672 if (inode->i_mode & S_IFDIR) { 818 if (S_ISDIR(inode->i_mode)) {
673 struct file_system_type *type = inode->i_sb->s_type; 819 struct file_system_type *type = inode->i_sb->s_type;
674 820
675 /* Set new key only if filesystem hasn't already changed it */ 821 /* Set new key only if filesystem hasn't already changed it */
@@ -726,7 +872,8 @@ static struct inode *get_new_inode(struct super_block *sb,
726 if (set(inode, data)) 872 if (set(inode, data))
727 goto set_failed; 873 goto set_failed;
728 874
729 __inode_add_to_lists(sb, head, inode); 875 hlist_add_head(&inode->i_hash, head);
876 __inode_sb_list_add(inode);
730 inode->i_state = I_NEW; 877 inode->i_state = I_NEW;
731 spin_unlock(&inode_lock); 878 spin_unlock(&inode_lock);
732 879
@@ -741,7 +888,6 @@ static struct inode *get_new_inode(struct super_block *sb,
741 * us. Use the old inode instead of the one we just 888 * us. Use the old inode instead of the one we just
742 * allocated. 889 * allocated.
743 */ 890 */
744 __iget(old);
745 spin_unlock(&inode_lock); 891 spin_unlock(&inode_lock);
746 destroy_inode(inode); 892 destroy_inode(inode);
747 inode = old; 893 inode = old;
@@ -773,7 +919,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
773 old = find_inode_fast(sb, head, ino); 919 old = find_inode_fast(sb, head, ino);
774 if (!old) { 920 if (!old) {
775 inode->i_ino = ino; 921 inode->i_ino = ino;
776 __inode_add_to_lists(sb, head, inode); 922 hlist_add_head(&inode->i_hash, head);
923 __inode_sb_list_add(inode);
777 inode->i_state = I_NEW; 924 inode->i_state = I_NEW;
778 spin_unlock(&inode_lock); 925 spin_unlock(&inode_lock);
779 926
@@ -788,7 +935,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
788 * us. Use the old inode instead of the one we just 935 * us. Use the old inode instead of the one we just
789 * allocated. 936 * allocated.
790 */ 937 */
791 __iget(old);
792 spin_unlock(&inode_lock); 938 spin_unlock(&inode_lock);
793 destroy_inode(inode); 939 destroy_inode(inode);
794 inode = old; 940 inode = old;
@@ -797,6 +943,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
797 return inode; 943 return inode;
798} 944}
799 945
946/*
947 * search the inode cache for a matching inode number.
948 * If we find one, then the inode number we are trying to
949 * allocate is not unique and so we should not use it.
950 *
951 * Returns 1 if the inode number is unique, 0 if it is not.
952 */
953static int test_inode_iunique(struct super_block *sb, unsigned long ino)
954{
955 struct hlist_head *b = inode_hashtable + hash(sb, ino);
956 struct hlist_node *node;
957 struct inode *inode;
958
959 hlist_for_each_entry(inode, node, b, i_hash) {
960 if (inode->i_ino == ino && inode->i_sb == sb)
961 return 0;
962 }
963
964 return 1;
965}
966
800/** 967/**
801 * iunique - get a unique inode number 968 * iunique - get a unique inode number
802 * @sb: superblock 969 * @sb: superblock
@@ -818,19 +985,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
818 * error if st_ino won't fit in target struct field. Use 32bit counter 985 * error if st_ino won't fit in target struct field. Use 32bit counter
819 * here to attempt to avoid that. 986 * here to attempt to avoid that.
820 */ 987 */
988 static DEFINE_SPINLOCK(iunique_lock);
821 static unsigned int counter; 989 static unsigned int counter;
822 struct inode *inode;
823 struct hlist_head *head;
824 ino_t res; 990 ino_t res;
825 991
826 spin_lock(&inode_lock); 992 spin_lock(&inode_lock);
993 spin_lock(&iunique_lock);
827 do { 994 do {
828 if (counter <= max_reserved) 995 if (counter <= max_reserved)
829 counter = max_reserved + 1; 996 counter = max_reserved + 1;
830 res = counter++; 997 res = counter++;
831 head = inode_hashtable + hash(sb, res); 998 } while (!test_inode_iunique(sb, res));
832 inode = find_inode_fast(sb, head, res); 999 spin_unlock(&iunique_lock);
833 } while (inode != NULL);
834 spin_unlock(&inode_lock); 1000 spin_unlock(&inode_lock);
835 1001
836 return res; 1002 return res;
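
Note: iunique() now serializes on its own iunique_lock and probes each candidate with the lighter test_inode_iunique() instead of a full find_inode_fast(). The propose-and-verify loop, sketched with an assumed exists() predicate standing in for the hash probe:

    #include <pthread.h>

    extern int exists(unsigned long id);    /* nonzero if id is already in use */

    static pthread_mutex_t unique_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned int counter;

    unsigned long unique_id(unsigned long max_reserved)
    {
        unsigned long res;

        pthread_mutex_lock(&unique_lock);
        do {
            if (counter <= max_reserved)    /* skip the reserved range */
                counter = max_reserved + 1;
            res = counter++;
        } while (exists(res));
        pthread_mutex_unlock(&unique_lock);
        return res;
    }
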
@@ -840,7 +1006,7 @@ EXPORT_SYMBOL(iunique);
840struct inode *igrab(struct inode *inode) 1006struct inode *igrab(struct inode *inode)
841{ 1007{
842 spin_lock(&inode_lock); 1008 spin_lock(&inode_lock);
843 if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))) 1009 if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
844 __iget(inode); 1010 __iget(inode);
845 else 1011 else
846 /* 1012 /*
@@ -882,7 +1048,6 @@ static struct inode *ifind(struct super_block *sb,
882 spin_lock(&inode_lock); 1048 spin_lock(&inode_lock);
883 inode = find_inode(sb, head, test, data); 1049 inode = find_inode(sb, head, test, data);
884 if (inode) { 1050 if (inode) {
885 __iget(inode);
886 spin_unlock(&inode_lock); 1051 spin_unlock(&inode_lock);
887 if (likely(wait)) 1052 if (likely(wait))
888 wait_on_inode(inode); 1053 wait_on_inode(inode);
@@ -915,7 +1080,6 @@ static struct inode *ifind_fast(struct super_block *sb,
915 spin_lock(&inode_lock); 1080 spin_lock(&inode_lock);
916 inode = find_inode_fast(sb, head, ino); 1081 inode = find_inode_fast(sb, head, ino);
917 if (inode) { 1082 if (inode) {
918 __iget(inode);
919 spin_unlock(&inode_lock); 1083 spin_unlock(&inode_lock);
920 wait_on_inode(inode); 1084 wait_on_inode(inode);
921 return inode; 1085 return inode;
@@ -1089,7 +1253,7 @@ int insert_inode_locked(struct inode *inode)
1089 continue; 1253 continue;
1090 if (old->i_sb != sb) 1254 if (old->i_sb != sb)
1091 continue; 1255 continue;
1092 if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) 1256 if (old->i_state & (I_FREEING|I_WILL_FREE))
1093 continue; 1257 continue;
1094 break; 1258 break;
1095 } 1259 }
@@ -1101,7 +1265,7 @@ int insert_inode_locked(struct inode *inode)
1101 __iget(old); 1265 __iget(old);
1102 spin_unlock(&inode_lock); 1266 spin_unlock(&inode_lock);
1103 wait_on_inode(old); 1267 wait_on_inode(old);
1104 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1268 if (unlikely(!inode_unhashed(old))) {
1105 iput(old); 1269 iput(old);
1106 return -EBUSY; 1270 return -EBUSY;
1107 } 1271 }
@@ -1128,7 +1292,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1128 continue; 1292 continue;
1129 if (!test(old, data)) 1293 if (!test(old, data))
1130 continue; 1294 continue;
1131 if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) 1295 if (old->i_state & (I_FREEING|I_WILL_FREE))
1132 continue; 1296 continue;
1133 break; 1297 break;
1134 } 1298 }
@@ -1140,7 +1304,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1140 __iget(old); 1304 __iget(old);
1141 spin_unlock(&inode_lock); 1305 spin_unlock(&inode_lock);
1142 wait_on_inode(old); 1306 wait_on_inode(old);
1143 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1307 if (unlikely(!inode_unhashed(old))) {
1144 iput(old); 1308 iput(old);
1145 return -EBUSY; 1309 return -EBUSY;
1146 } 1310 }
@@ -1149,100 +1313,53 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1149} 1313}
1150EXPORT_SYMBOL(insert_inode_locked4); 1314EXPORT_SYMBOL(insert_inode_locked4);
1151 1315
1152/**
1153 * __insert_inode_hash - hash an inode
1154 * @inode: unhashed inode
1155 * @hashval: unsigned long value used to locate this object in the
1156 * inode_hashtable.
1157 *
1158 * Add an inode to the inode hash for this superblock.
1159 */
1160void __insert_inode_hash(struct inode *inode, unsigned long hashval)
1161{
1162 struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
1163 spin_lock(&inode_lock);
1164 hlist_add_head(&inode->i_hash, head);
1165 spin_unlock(&inode_lock);
1166}
1167EXPORT_SYMBOL(__insert_inode_hash);
1168 1316
1169/** 1317int generic_delete_inode(struct inode *inode)
1170 * remove_inode_hash - remove an inode from the hash
1171 * @inode: inode to unhash
1172 *
1173 * Remove an inode from the superblock.
1174 */
1175void remove_inode_hash(struct inode *inode)
1176{ 1318{
1177 spin_lock(&inode_lock); 1319 return 1;
1178 hlist_del_init(&inode->i_hash);
1179 spin_unlock(&inode_lock);
1180} 1320}
1181EXPORT_SYMBOL(remove_inode_hash); 1321EXPORT_SYMBOL(generic_delete_inode);
1182 1322
1183/* 1323/*
1184 * Tell the filesystem that this inode is no longer of any interest and should 1324 * Normal UNIX filesystem behaviour: delete the
1185 * be completely destroyed. 1325 * inode when the usage count drops to zero, and
1186 * 1326 * i_nlink is zero.
1187 * We leave the inode in the inode hash table until *after* the filesystem's
1188 * ->delete_inode completes. This ensures that an iget (such as nfsd might
1189 * instigate) will always find up-to-date information either in the hash or on
1190 * disk.
1191 *
1192 * I_FREEING is set so that no-one will take a new reference to the inode while
1193 * it is being deleted.
1194 */ 1327 */
1195void generic_delete_inode(struct inode *inode) 1328int generic_drop_inode(struct inode *inode)
1196{ 1329{
1197 const struct super_operations *op = inode->i_sb->s_op; 1330 return !inode->i_nlink || inode_unhashed(inode);
1198
1199 list_del_init(&inode->i_list);
1200 list_del_init(&inode->i_sb_list);
1201 WARN_ON(inode->i_state & I_NEW);
1202 inode->i_state |= I_FREEING;
1203 inodes_stat.nr_inodes--;
1204 spin_unlock(&inode_lock);
1205
1206 if (op->delete_inode) {
1207 void (*delete)(struct inode *) = op->delete_inode;
1208 /* Filesystems implementing their own
1209 * s_op->delete_inode are required to call
1210 * truncate_inode_pages and clear_inode()
1211 * internally */
1212 delete(inode);
1213 } else {
1214 truncate_inode_pages(&inode->i_data, 0);
1215 clear_inode(inode);
1216 }
1217 spin_lock(&inode_lock);
1218 hlist_del_init(&inode->i_hash);
1219 spin_unlock(&inode_lock);
1220 wake_up_inode(inode);
1221 BUG_ON(inode->i_state != I_CLEAR);
1222 destroy_inode(inode);
1223} 1331}
1224EXPORT_SYMBOL(generic_delete_inode); 1332EXPORT_SYMBOL_GPL(generic_drop_inode);
1225 1333
1226/** 1334/*
1227 * generic_detach_inode - remove inode from inode lists 1335 * Called when we're dropping the last reference
1228 * @inode: inode to remove 1336 * to an inode.
1229 *
1230 * Remove inode from inode lists, write it if it's dirty. This is just an
1231 * internal VFS helper exported for hugetlbfs. Do not use!
1232 * 1337 *
1233 * Returns 1 if inode should be completely destroyed. 1338 * Call the FS "drop_inode()" function, defaulting to
1339 * the legacy UNIX filesystem behaviour. If it tells
1340 * us to evict inode, do so. Otherwise, retain inode
1341 * in cache if fs is alive, sync and evict if fs is
1342 * shutting down.
1234 */ 1343 */
1235int generic_detach_inode(struct inode *inode) 1344static void iput_final(struct inode *inode)
1236{ 1345{
1237 struct super_block *sb = inode->i_sb; 1346 struct super_block *sb = inode->i_sb;
1347 const struct super_operations *op = inode->i_sb->s_op;
1348 int drop;
1238 1349
1239 if (!hlist_unhashed(&inode->i_hash)) { 1350 if (op && op->drop_inode)
1240 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 1351 drop = op->drop_inode(inode);
1241 list_move(&inode->i_list, &inode_unused); 1352 else
1242 inodes_stat.nr_unused++; 1353 drop = generic_drop_inode(inode);
1354
1355 if (!drop) {
1243 if (sb->s_flags & MS_ACTIVE) { 1356 if (sb->s_flags & MS_ACTIVE) {
1357 inode->i_state |= I_REFERENCED;
1358 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1359 inode_lru_list_add(inode);
1360 }
1244 spin_unlock(&inode_lock); 1361 spin_unlock(&inode_lock);
1245 return 0; 1362 return;
1246 } 1363 }
1247 WARN_ON(inode->i_state & I_NEW); 1364 WARN_ON(inode->i_state & I_NEW);
1248 inode->i_state |= I_WILL_FREE; 1365 inode->i_state |= I_WILL_FREE;
@@ -1251,65 +1368,28 @@ int generic_detach_inode(struct inode *inode)
1251 spin_lock(&inode_lock); 1368 spin_lock(&inode_lock);
1252 WARN_ON(inode->i_state & I_NEW); 1369 WARN_ON(inode->i_state & I_NEW);
1253 inode->i_state &= ~I_WILL_FREE; 1370 inode->i_state &= ~I_WILL_FREE;
1254 inodes_stat.nr_unused--; 1371 __remove_inode_hash(inode);
1255 hlist_del_init(&inode->i_hash);
1256 } 1372 }
1257 list_del_init(&inode->i_list); 1373
1258 list_del_init(&inode->i_sb_list);
1259 WARN_ON(inode->i_state & I_NEW); 1374 WARN_ON(inode->i_state & I_NEW);
1260 inode->i_state |= I_FREEING; 1375 inode->i_state |= I_FREEING;
1261 inodes_stat.nr_inodes--;
1262 spin_unlock(&inode_lock);
1263 return 1;
1264}
1265EXPORT_SYMBOL_GPL(generic_detach_inode);
1266 1376
1267static void generic_forget_inode(struct inode *inode) 1377 /*
1268{ 1378 * Move the inode off the IO lists and LRU once I_FREEING is
1269 if (!generic_detach_inode(inode)) 1379 * set so that it won't get moved back on there if it is dirty.
1270 return; 1380 */
1271 if (inode->i_data.nrpages) 1381 inode_lru_list_del(inode);
1272 truncate_inode_pages(&inode->i_data, 0); 1382 list_del_init(&inode->i_wb_list);
1273 clear_inode(inode); 1383
1384 __inode_sb_list_del(inode);
1385 spin_unlock(&inode_lock);
1386 evict(inode);
1387 remove_inode_hash(inode);
1274 wake_up_inode(inode); 1388 wake_up_inode(inode);
1389 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1275 destroy_inode(inode); 1390 destroy_inode(inode);
1276} 1391}
1277 1392
1278/*
1279 * Normal UNIX filesystem behaviour: delete the
1280 * inode when the usage count drops to zero, and
1281 * i_nlink is zero.
1282 */
1283void generic_drop_inode(struct inode *inode)
1284{
1285 if (!inode->i_nlink)
1286 generic_delete_inode(inode);
1287 else
1288 generic_forget_inode(inode);
1289}
1290EXPORT_SYMBOL_GPL(generic_drop_inode);
1291
1292/*
1293 * Called when we're dropping the last reference
1294 * to an inode.
1295 *
1296 * Call the FS "drop()" function, defaulting to
1297 * the legacy UNIX filesystem behaviour..
1298 *
1299 * NOTE! NOTE! NOTE! We're called with the inode lock
1300 * held, and the drop function is supposed to release
1301 * the lock!
1302 */
1303static inline void iput_final(struct inode *inode)
1304{
1305 const struct super_operations *op = inode->i_sb->s_op;
1306 void (*drop)(struct inode *) = generic_drop_inode;
1307
1308 if (op && op->drop_inode)
1309 drop = op->drop_inode;
1310 drop(inode);
1311}
1312
1313/** 1393/**
1314 * iput - put an inode 1394 * iput - put an inode
1315 * @inode: inode to put 1395 * @inode: inode to put
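
Note: the iput_final() rewrite above turns ->drop_inode() into a yes/no predicate (int, not void) and centralizes the policy: an undropped inode on an active fs is re-referenced and kept on the LRU, anything else is evicted. A compilable model of that decision flow (stand-in types; the sync-before-evict path taken when the fs is shutting down is elided):

    struct sb   { int active; };                /* analogue of MS_ACTIVE */
    struct node {
        struct sb *sb;
        int nlink, hashed, dirty, referenced;
        int (*drop)(struct node *);             /* ->drop_inode() analogue */
    };

    static int default_drop(struct node *n)     /* generic_drop_inode() shape */
    {
        return !n->nlink || !n->hashed;
    }

    extern void lru_add(struct node *n);        /* stubs for the example */
    extern void evict_now(struct node *n);

    static void put_final(struct node *n)
    {
        int drop = n->drop ? n->drop(n) : default_drop(n);

        if (!drop && n->sb->active) {
            n->referenced = 1;                  /* second chance on the LRU */
            if (!n->dirty)
                lru_add(n);
            return;                             /* keep it cached */
        }
        evict_now(n);                           /* tear down now */
    }
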
@@ -1322,7 +1402,7 @@ static inline void iput_final(struct inode *inode)
1322void iput(struct inode *inode) 1402void iput(struct inode *inode)
1323{ 1403{
1324 if (inode) { 1404 if (inode) {
1325 BUG_ON(inode->i_state == I_CLEAR); 1405 BUG_ON(inode->i_state & I_CLEAR);
1326 1406
1327 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1407 if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
1328 iput_final(inode); 1408 iput_final(inode);
@@ -1568,6 +1648,8 @@ void __init inode_init(void)
1568 SLAB_MEM_SPREAD), 1648 SLAB_MEM_SPREAD),
1569 init_once); 1649 init_once);
1570 register_shrinker(&icache_shrinker); 1650 register_shrinker(&icache_shrinker);
1651 percpu_counter_init(&nr_inodes, 0);
1652 percpu_counter_init(&nr_inodes_unused, 0);
1571 1653
1572 /* Hash may have been set up in inode_init_early */ 1654 /* Hash may have been set up in inode_init_early */
1573 if (!hashdist) 1655 if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index 6b706bc60a6..e43b9a4dbf4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/lglock.h>
13
12struct super_block; 14struct super_block;
13struct linux_binprm; 15struct linux_binprm;
14struct path; 16struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
70 72
71extern void __init mnt_init(void); 73extern void __init mnt_init(void);
72 74
73extern spinlock_t vfsmount_lock; 75DECLARE_BRLOCK(vfsmount_lock);
76
74 77
75/* 78/*
76 * fs_struct.c 79 * fs_struct.c
@@ -80,6 +83,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
80/* 83/*
81 * file_table.c 84 * file_table.c
82 */ 85 */
86extern void file_sb_list_add(struct file *f, struct super_block *sb);
87extern void file_sb_list_del(struct file *f);
83extern void mark_files_ro(struct super_block *); 88extern void mark_files_ro(struct super_block *);
84extern struct file *get_empty_filp(void); 89extern struct file *get_empty_filp(void);
85 90
@@ -96,3 +101,10 @@ extern void put_super(struct super_block *sb);
96struct nameidata; 101struct nameidata;
97extern struct file *nameidata_to_filp(struct nameidata *); 102extern struct file *nameidata_to_filp(struct nameidata *);
98extern void release_open_intent(struct nameidata *); 103extern void release_open_intent(struct nameidata *);
104
105/*
106 * inode.c
107 */
108extern int get_nr_dirty_inodes(void);
109extern void evict_inodes(struct super_block *);
110extern int invalidate_inodes(struct super_block *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 2d140a71386..e92fdbb3bc3 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -29,7 +29,6 @@
29 * @arg: command-specific argument for ioctl 29 * @arg: command-specific argument for ioctl
30 * 30 *
31 * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise 31 * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise
32 * invokes filesystem specific ->ioctl method. If neither method exists,
33 * returns -ENOTTY. 32 * returns -ENOTTY.
34 * 33 *
35 * Returns 0 on success, -errno on error. 34 * Returns 0 on success, -errno on error.
@@ -39,21 +38,12 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
39{ 38{
40 int error = -ENOTTY; 39 int error = -ENOTTY;
41 40
42 if (!filp->f_op) 41 if (!filp->f_op || !filp->f_op->unlocked_ioctl)
43 goto out; 42 goto out;
44 43
45 if (filp->f_op->unlocked_ioctl) { 44 error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
46 error = filp->f_op->unlocked_ioctl(filp, cmd, arg); 45 if (error == -ENOIOCTLCMD)
47 if (error == -ENOIOCTLCMD) 46 error = -EINVAL;
48 error = -EINVAL;
49 goto out;
50 } else if (filp->f_op->ioctl) {
51 lock_kernel();
52 error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
53 filp, cmd, arg);
54 unlock_kernel();
55 }
56
57 out: 47 out:
58 return error; 48 return error;
59} 49}
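
Note: with the BKL-protected ->ioctl path gone, vfs_ioctl() only dispatches to ->unlocked_ioctl and maps the kernel-internal ENOIOCTLCMD to -EINVAL before it reaches user space. A stand-alone model of that dispatch (515 is the kernel's internal ENOIOCTLCMD value; MY_ENOIOCTLCMD is named for this sketch only):

    #include <errno.h>

    #define MY_ENOIOCTLCMD 515      /* kernel-internal, never seen by userland */

    struct fops {
        long (*unlocked_ioctl)(void *filp, unsigned int cmd, unsigned long arg);
    };

    long dispatch_ioctl(struct fops *f, void *filp, unsigned int cmd,
                        unsigned long arg)
    {
        long error;

        if (!f || !f->unlocked_ioctl)
            return -ENOTTY;                     /* no handler at all */

        error = f->unlocked_ioctl(filp, cmd, arg);
        if (error == -MY_ENOIOCTLCMD)
            error = -EINVAL;                    /* unknown command for this fd */
        return error;
    }
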
@@ -540,6 +530,41 @@ static int ioctl_fsthaw(struct file *filp)
540 return thaw_super(sb); 530 return thaw_super(sb);
541} 531}
542 532
533static int ioctl_fstrim(struct file *filp, void __user *argp)
534{
535 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
536 struct fstrim_range range;
537 int ret = 0;
538
539 if (!capable(CAP_SYS_ADMIN))
540 return -EPERM;
541
542 /* If filesystem doesn't support trim feature, return. */
543 if (sb->s_op->trim_fs == NULL)
544 return -EOPNOTSUPP;
545
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 if (argp == NULL) {
551 range.start = 0;
552 range.len = ULLONG_MAX;
553 range.minlen = 0;
554 } else if (copy_from_user(&range, argp, sizeof(range)))
555 return -EFAULT;
556
557 ret = sb->s_op->trim_fs(sb, &range);
558 if (ret < 0)
559 return ret;
560
561 if ((argp != NULL) &&
562 (copy_to_user(argp, &range, sizeof(range))))
563 return -EFAULT;
564
565 return 0;
566}
567
543/* 568/*
544 * When you add any new common ioctls to the switches above and below 569 * When you add any new common ioctls to the switches above and below
545 * please update compat_sys_ioctl() too. 570 * please update compat_sys_ioctl() too.
@@ -590,6 +615,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
590 error = ioctl_fsthaw(filp); 615 error = ioctl_fsthaw(filp);
591 break; 616 break;
592 617
618 case FITRIM:
619 error = ioctl_fstrim(filp, argp);
620 break;
621
593 case FS_IOC_FIEMAP: 622 case FS_IOC_FIEMAP:
594 return ioctl_fiemap(filp, arg); 623 return ioctl_fiemap(filp, arg);
595 624
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index e0aca9a0ac6..0542b6eedf8 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
10 * 10 *
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h>
14#include <linux/gfp.h> 13#include <linux/gfp.h>
15#include "isofs.h" 14#include "isofs.h"
16 15
@@ -255,18 +254,19 @@ static int isofs_readdir(struct file *filp,
255 char *tmpname; 254 char *tmpname;
256 struct iso_directory_record *tmpde; 255 struct iso_directory_record *tmpde;
257 struct inode *inode = filp->f_path.dentry->d_inode; 256 struct inode *inode = filp->f_path.dentry->d_inode;
257 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
258 258
259 tmpname = (char *)__get_free_page(GFP_KERNEL); 259 tmpname = (char *)__get_free_page(GFP_KERNEL);
260 if (tmpname == NULL) 260 if (tmpname == NULL)
261 return -ENOMEM; 261 return -ENOMEM;
262 262
263 lock_kernel(); 263 mutex_lock(&sbi->s_mutex);
264 tmpde = (struct iso_directory_record *) (tmpname+1024); 264 tmpde = (struct iso_directory_record *) (tmpname+1024);
265 265
266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); 266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
267 267
268 free_page((unsigned long) tmpname); 268 free_page((unsigned long) tmpname);
269 unlock_kernel(); 269 mutex_unlock(&sbi->s_mutex);
270 return result; 270 return result;
271} 271}
272 272
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6b4dcd4f294..bfdeb82a53b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -17,7 +17,6 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/nls.h> 18#include <linux/nls.h>
19#include <linux/ctype.h> 19#include <linux/ctype.h>
20#include <linux/smp_lock.h>
21#include <linux/statfs.h> 20#include <linux/statfs.h>
22#include <linux/cdrom.h> 21#include <linux/cdrom.h>
23#include <linux/parser.h> 22#include <linux/parser.h>
@@ -44,11 +43,7 @@ static void isofs_put_super(struct super_block *sb)
44 struct isofs_sb_info *sbi = ISOFS_SB(sb); 43 struct isofs_sb_info *sbi = ISOFS_SB(sb);
45 44
46#ifdef CONFIG_JOLIET 45#ifdef CONFIG_JOLIET
47 lock_kernel();
48
49 unload_nls(sbi->s_nls_iocharset); 46 unload_nls(sbi->s_nls_iocharset);
50
51 unlock_kernel();
52#endif 47#endif
53 48
54 kfree(sbi); 49 kfree(sbi);
@@ -549,6 +544,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
549} 544}
550 545
551/* 546/*
547 * Check if root directory is empty (has less than 3 files).
548 *
549 * Used to detect broken CDs where ISO root directory is empty but Joliet root
550 * directory is OK. If such CD has Rock Ridge extensions, they will be disabled
551 * (and Joliet used instead) or else no files would be visible.
552 */
553static bool rootdir_empty(struct super_block *sb, unsigned long block)
554{
555 int offset = 0, files = 0, de_len;
556 struct iso_directory_record *de;
557 struct buffer_head *bh;
558
559 bh = sb_bread(sb, block);
560 if (!bh)
561 return true;
562 while (files < 3) {
563 de = (struct iso_directory_record *) (bh->b_data + offset);
564 de_len = *(unsigned char *) de;
565 if (de_len == 0)
566 break;
567 files++;
568 offset += de_len;
569 }
570 brelse(bh);
571 return files < 3;
572}
573
574/*
552 * Initialize the superblock and read the root inode. 575 * Initialize the superblock and read the root inode.
553 * 576 *
554 * Note: a check_disk_change() has been done immediately prior 577 * Note: a check_disk_change() has been done immediately prior
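
Note: rootdir_empty() above walks raw ISO 9660 directory records, which are length-prefixed: byte 0 of each record is its total length, and a zero length ends the records in a block. A stand-alone walker over an in-memory block (the explicit bound on offset is added here as a belt-and-braces check; the kernel helper relies on the terminating zero):

    #include <stddef.h>

    /* Count records in one logical block; buf[offset] is each record's length. */
    static int count_records(const unsigned char *buf, size_t blocksize)
    {
        size_t offset = 0;
        int files = 0;

        while (offset < blocksize) {
            unsigned int de_len = buf[offset];
            if (de_len == 0)        /* end of records in this block */
                break;
            files++;
            offset += de_len;
        }
        return files;
    }
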
@@ -722,7 +745,12 @@ root_found:
722 } 745 }
723 746
724 s->s_magic = ISOFS_SUPER_MAGIC; 747 s->s_magic = ISOFS_SUPER_MAGIC;
725 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */ 748
749 /*
750 * With multi-extent files, file size is only limited by the maximum
751 * size of a file system, which is 8 TB.
752 */
753 s->s_maxbytes = 0x80000000000LL;
726 754
727 /* 755 /*
728 * The CDROM is read-only, has no nodes (devices) on it, and since 756 * The CDROM is read-only, has no nodes (devices) on it, and since
@@ -818,6 +846,7 @@ root_found:
818 sbi->s_utf8 = opt.utf8; 846 sbi->s_utf8 = opt.utf8;
819 sbi->s_nocompress = opt.nocompress; 847 sbi->s_nocompress = opt.nocompress;
820 sbi->s_overriderockperm = opt.overriderockperm; 848 sbi->s_overriderockperm = opt.overriderockperm;
849 mutex_init(&sbi->s_mutex);
821 /* 850 /*
822 * It would be incredibly stupid to allow people to mark every file 851 * It would be incredibly stupid to allow people to mark every file
823 * on the disk as suid, so we merely allow them to set the default 852 * on the disk as suid, so we merely allow them to set the default
@@ -842,6 +871,18 @@ root_found:
842 goto out_no_root; 871 goto out_no_root;
843 872
844 /* 873 /*
874 * Fix for broken CDs with Rock Ridge and empty ISO root directory but
875 * correct Joliet root directory.
876 */
877 if (sbi->s_rock == 1 && joliet_level &&
878 rootdir_empty(s, sbi->s_firstdatazone)) {
879 printk(KERN_NOTICE
880 "ISOFS: primary root directory is empty. "
881 "Disabling Rock Ridge and switching to Joliet.");
882 sbi->s_rock = 0;
883 }
884
885 /*
845 * If this disk has both Rock Ridge and Joliet on it, then we 886 * If this disk has both Rock Ridge and Joliet on it, then we
846 * want to use Rock Ridge by default. This can be overridden 887 * want to use Rock Ridge by default. This can be overridden
847 * by using the norock mount option. There is still one other 888 * by using the norock mount option. There is still one other
@@ -961,27 +1002,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
961 * or getblk() if they are not. Returns the number of blocks inserted 1002 * or getblk() if they are not. Returns the number of blocks inserted
962 * (-ve == error.) 1003 * (-ve == error.)
963 */ 1004 */
964int isofs_get_blocks(struct inode *inode, sector_t iblock_s, 1005int isofs_get_blocks(struct inode *inode, sector_t iblock,
965 struct buffer_head **bh, unsigned long nblocks) 1006 struct buffer_head **bh, unsigned long nblocks)
966{ 1007{
967 unsigned long b_off; 1008 unsigned long b_off = iblock;
968 unsigned offset, sect_size; 1009 unsigned offset, sect_size;
969 unsigned int firstext; 1010 unsigned int firstext;
970 unsigned long nextblk, nextoff; 1011 unsigned long nextblk, nextoff;
971 long iblock = (long)iblock_s;
972 int section, rv, error; 1012 int section, rv, error;
973 struct iso_inode_info *ei = ISOFS_I(inode); 1013 struct iso_inode_info *ei = ISOFS_I(inode);
974 1014
975 lock_kernel();
976
977 error = -EIO; 1015 error = -EIO;
978 rv = 0; 1016 rv = 0;
979 if (iblock < 0 || iblock != iblock_s) { 1017 if (iblock != b_off) {
980 printk(KERN_DEBUG "%s: block number too large\n", __func__); 1018 printk(KERN_DEBUG "%s: block number too large\n", __func__);
981 goto abort; 1019 goto abort;
982 } 1020 }
983 1021
984 b_off = iblock;
985 1022
986 offset = 0; 1023 offset = 0;
987 firstext = ei->i_first_extent; 1024 firstext = ei->i_first_extent;
@@ -999,8 +1036,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
999 * I/O errors. 1036 * I/O errors.
1000 */ 1037 */
1001 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { 1038 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
1002 printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n", 1039 printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
1003 __func__, iblock, (unsigned long) inode->i_size); 1040 __func__, b_off,
1041 (unsigned long long)inode->i_size);
1004 goto abort; 1042 goto abort;
1005 } 1043 }
1006 1044
@@ -1026,9 +1064,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1026 if (++section > 100) { 1064 if (++section > 100) {
1027 printk(KERN_DEBUG "%s: More than 100 file sections ?!?" 1065 printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
1028 " aborting...\n", __func__); 1066 " aborting...\n", __func__);
1029 printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u " 1067 printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
1030 "nextblk=%lu nextoff=%lu\n", __func__, 1068 "nextblk=%lu nextoff=%lu\n", __func__,
1031 iblock, firstext, (unsigned) sect_size, 1069 b_off, firstext, (unsigned) sect_size,
1032 nextblk, nextoff); 1070 nextblk, nextoff);
1033 goto abort; 1071 goto abort;
1034 } 1072 }
@@ -1049,7 +1087,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1049 1087
1050 error = 0; 1088 error = 0;
1051abort: 1089abort:
1052 unlock_kernel();
1053 return rv != 0 ? rv : error; 1090 return rv != 0 ? rv : error;
1054} 1091}
1055 1092
@@ -1470,17 +1507,16 @@ struct inode *isofs_iget(struct super_block *sb,
1470 return inode; 1507 return inode;
1471} 1508}
1472 1509
1473static int isofs_get_sb(struct file_system_type *fs_type, 1510static struct dentry *isofs_mount(struct file_system_type *fs_type,
1474 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1511 int flags, const char *dev_name, void *data)
1475{ 1512{
1476 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, 1513 return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
1477 mnt);
1478} 1514}
1479 1515
1480static struct file_system_type iso9660_fs_type = { 1516static struct file_system_type iso9660_fs_type = {
1481 .owner = THIS_MODULE, 1517 .owner = THIS_MODULE,
1482 .name = "iso9660", 1518 .name = "iso9660",
1483 .get_sb = isofs_get_sb, 1519 .mount = isofs_mount,
1484 .kill_sb = kill_block_super, 1520 .kill_sb = kill_block_super,
1485 .fs_flags = FS_REQUIRES_DEV, 1521 .fs_flags = FS_REQUIRES_DEV,
1486}; 1522};
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 7d33de84f52..2882dc089f8 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -55,6 +55,7 @@ struct isofs_sb_info {
55 gid_t s_gid; 55 gid_t s_gid;
56 uid_t s_uid; 56 uid_t s_uid;
57 struct nls_table *s_nls_iocharset; /* Native language support table */ 57 struct nls_table *s_nls_iocharset; /* Native language support table */
58 struct mutex s_mutex; /* replaces BKL, please remove if possible */
58}; 59};
59 60
60#define ISOFS_INVALID_MODE ((mode_t) -1) 61#define ISOFS_INVALID_MODE ((mode_t) -1)
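
Note: the isofs conversion in these hunks replaces the Big Kernel Lock with a mutex embedded in the per-superblock info, initialized once at fill_super time and taken around each formerly BKL-protected section, so different mounts no longer serialize against each other. The shape of that conversion as a pthread analogue:

    #include <pthread.h>

    struct sb_info {
        pthread_mutex_t s_mutex;        /* one lock per mounted instance */
        /* ... per-mount state ... */
    };

    static void sb_info_init(struct sb_info *sbi)
    {
        pthread_mutex_init(&sbi->s_mutex, NULL);    /* mutex_init() */
    }

    static void do_lookup(struct sb_info *sbi)
    {
        pthread_mutex_lock(&sbi->s_mutex);      /* was: lock_kernel() */
        /* ... directory search, inode setup ... */
        pthread_mutex_unlock(&sbi->s_mutex);    /* was: unlock_kernel() */
    }
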
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index ab438beb867..0d23abfd428 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
6 * (C) 1991 Linus Torvalds - minix filesystem 6 * (C) 1991 Linus Torvalds - minix filesystem
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/gfp.h> 9#include <linux/gfp.h>
11#include "isofs.h" 10#include "isofs.h"
12 11
@@ -168,6 +167,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
168 int found; 167 int found;
169 unsigned long uninitialized_var(block); 168 unsigned long uninitialized_var(block);
170 unsigned long uninitialized_var(offset); 169 unsigned long uninitialized_var(offset);
170 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
171 struct inode *inode; 171 struct inode *inode;
172 struct page *page; 172 struct page *page;
173 173
@@ -177,7 +177,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
177 if (!page) 177 if (!page)
178 return ERR_PTR(-ENOMEM); 178 return ERR_PTR(-ENOMEM);
179 179
180 lock_kernel(); 180 mutex_lock(&sbi->s_mutex);
181 found = isofs_find_entry(dir, dentry, 181 found = isofs_find_entry(dir, dentry,
182 &block, &offset, 182 &block, &offset,
183 page_address(page), 183 page_address(page),
@@ -188,10 +188,10 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
188 if (found) { 188 if (found) {
189 inode = isofs_iget(dir->i_sb, block, offset); 189 inode = isofs_iget(dir->i_sb, block, offset);
190 if (IS_ERR(inode)) { 190 if (IS_ERR(inode)) {
191 unlock_kernel(); 191 mutex_unlock(&sbi->s_mutex);
192 return ERR_CAST(inode); 192 return ERR_CAST(inode);
193 } 193 }
194 } 194 }
195 unlock_kernel(); 195 mutex_unlock(&sbi->s_mutex);
196 return d_splice_alias(inode, dentry); 196 return d_splice_alias(inode, dentry);
197} 197}
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 96a685c550f..f9cd04db6ea 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -8,7 +8,6 @@
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/smp_lock.h>
12 11
13#include "isofs.h" 12#include "isofs.h"
14#include "rock.h" 13#include "rock.h"
@@ -661,6 +660,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
661{ 660{
662 struct inode *inode = page->mapping->host; 661 struct inode *inode = page->mapping->host;
663 struct iso_inode_info *ei = ISOFS_I(inode); 662 struct iso_inode_info *ei = ISOFS_I(inode);
663 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
664 char *link = kmap(page); 664 char *link = kmap(page);
665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
666 struct buffer_head *bh; 666 struct buffer_head *bh;
@@ -673,12 +673,12 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
673 struct rock_state rs; 673 struct rock_state rs;
674 int ret; 674 int ret;
675 675
676 if (!ISOFS_SB(inode->i_sb)->s_rock) 676 if (!sbi->s_rock)
677 goto error; 677 goto error;
678 678
679 init_rock_state(&rs, inode); 679 init_rock_state(&rs, inode);
680 block = ei->i_iget5_block; 680 block = ei->i_iget5_block;
681 lock_kernel(); 681 mutex_lock(&sbi->s_mutex);
682 bh = sb_bread(inode->i_sb, block); 682 bh = sb_bread(inode->i_sb, block);
683 if (!bh) 683 if (!bh)
684 goto out_noread; 684 goto out_noread;
@@ -748,7 +748,7 @@ repeat:
748 goto fail; 748 goto fail;
749 brelse(bh); 749 brelse(bh);
750 *rpnt = '\0'; 750 *rpnt = '\0';
751 unlock_kernel(); 751 mutex_unlock(&sbi->s_mutex);
752 SetPageUptodate(page); 752 SetPageUptodate(page);
753 kunmap(page); 753 kunmap(page);
754 unlock_page(page); 754 unlock_page(page);
@@ -765,7 +765,7 @@ out_bad_span:
765 printk("symlink spans iso9660 blocks\n"); 765 printk("symlink spans iso9660 blocks\n");
766fail: 766fail:
767 brelse(bh); 767 brelse(bh);
768 unlock_kernel(); 768 mutex_unlock(&sbi->s_mutex);
769error: 769error:
770 SetPageError(page); 770 SetPageError(page);
771 kunmap(page); 771 kunmap(page);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index b0435dd0654..e4b87bc1fa5 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -221,7 +221,7 @@ restart:
221 goto restart; 221 goto restart;
222 } 222 }
223 if (buffer_locked(bh)) { 223 if (buffer_locked(bh)) {
224 atomic_inc(&bh->b_count); 224 get_bh(bh);
225 spin_unlock(&journal->j_list_lock); 225 spin_unlock(&journal->j_list_lock);
226 jbd_unlock_bh_state(bh); 226 jbd_unlock_bh_state(bh);
227 wait_on_buffer(bh); 227 wait_on_buffer(bh);
@@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
254{ 254{
255 int i; 255 int i;
256 256
257 ll_rw_block(SWRITE, *batch_count, bhs); 257 for (i = 0; i < *batch_count; i++)
258 write_dirty_buffer(bhs[i], WRITE);
259
258 for (i = 0; i < *batch_count; i++) { 260 for (i = 0; i < *batch_count; i++) {
259 struct buffer_head *bh = bhs[i]; 261 struct buffer_head *bh = bhs[i];
260 clear_buffer_jwrite(bh); 262 clear_buffer_jwrite(bh);
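
Note: write_dirty_buffer(bh, WRITE) replaces the removed ll_rw_block(SWRITE, ...) here: the writer claims the buffer's dirty bit before submitting, so a buffer that somebody else already cleaned is skipped rather than written twice. A loose user-space analogue of just the claim-then-write step (the real helper also takes the buffer lock and sets a completion handler):

    #include <stdatomic.h>

    struct buf { atomic_int dirty; /* ... data, lock, ... */ };

    extern void submit_write(struct buf *b);    /* stand-in for submit_bh() */

    void write_if_dirty(struct buf *b)
    {
        /* analogue of test_clear_buffer_dirty(): atomically claim the bit */
        int was_dirty = atomic_exchange(&b->dirty, 0);

        if (was_dirty)
            submit_write(b);
        /* else: someone else wrote it out, or it was never dirty */
    }
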
@@ -281,7 +283,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
281 int ret = 0; 283 int ret = 0;
282 284
283 if (buffer_locked(bh)) { 285 if (buffer_locked(bh)) {
284 atomic_inc(&bh->b_count); 286 get_bh(bh);
285 spin_unlock(&journal->j_list_lock); 287 spin_unlock(&journal->j_list_lock);
286 jbd_unlock_bh_state(bh); 288 jbd_unlock_bh_state(bh);
287 wait_on_buffer(bh); 289 wait_on_buffer(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 28a9ddaa0c4..34a4861c14b 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal,
119 struct buffer_head *bh; 119 struct buffer_head *bh;
120 journal_header_t *header; 120 journal_header_t *header;
121 int ret; 121 int ret;
122 int barrier_done = 0;
123 122
124 if (is_journal_aborted(journal)) 123 if (is_journal_aborted(journal))
125 return 0; 124 return 0;
@@ -137,34 +136,12 @@ static int journal_write_commit_record(journal_t *journal,
137 136
138 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "write commit block");
139 set_buffer_dirty(bh); 138 set_buffer_dirty(bh);
140 if (journal->j_flags & JFS_BARRIER) {
141 set_buffer_ordered(bh);
142 barrier_done = 1;
143 }
144 ret = sync_dirty_buffer(bh);
145 if (barrier_done)
146 clear_buffer_ordered(bh);
147 /* is it possible for another commit to fail at roughly
148 * the same time as this one? If so, we don't want to
149 * trust the barrier flag in the super, but instead want
150 * to remember if we sent a barrier request
151 */
152 if (ret == -EOPNOTSUPP && barrier_done) {
153 char b[BDEVNAME_SIZE];
154
155 printk(KERN_WARNING
156 "JBD: barrier-based sync failed on %s - "
157 "disabling barriers\n",
158 bdevname(journal->j_dev, b));
159 spin_lock(&journal->j_state_lock);
160 journal->j_flags &= ~JFS_BARRIER;
161 spin_unlock(&journal->j_state_lock);
162 139
163 /* And try again, without the barrier */ 140 if (journal->j_flags & JFS_BARRIER)
164 set_buffer_uptodate(bh); 141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
165 set_buffer_dirty(bh); 142 else
166 ret = sync_dirty_buffer(bh); 143 ret = sync_dirty_buffer(bh);
167 } 144
168 put_bh(bh); /* One for getblk() */ 145 put_bh(bh); /* One for getblk() */
169 journal_put_journal_head(descriptor); 146 journal_put_journal_head(descriptor);
170 147
@@ -317,7 +294,7 @@ void journal_commit_transaction(journal_t *journal)
317 int first_tag = 0; 294 int first_tag = 0;
318 int tag_flag; 295 int tag_flag;
319 int i; 296 int i;
320 int write_op = WRITE; 297 int write_op = WRITE_SYNC;
321 298
322 /* 299 /*
323 * First job: lock down the current transaction and wait for 300 * First job: lock down the current transaction and wait for
@@ -610,13 +587,13 @@ void journal_commit_transaction(journal_t *journal)
610 /* Bump b_count to prevent truncate from stumbling over 587 /* Bump b_count to prevent truncate from stumbling over
611 the shadowed buffer! @@@ This can go if we ever get 588 the shadowed buffer! @@@ This can go if we ever get
612 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 589 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
613 atomic_inc(&jh2bh(jh)->b_count); 590 get_bh(jh2bh(jh));
614 591
615 /* Make a temporary IO buffer with which to write it out 592 /* Make a temporary IO buffer with which to write it out
616 (this will requeue both the metadata buffer and the 593 (this will requeue both the metadata buffer and the
617 temporary IO buffer). new_bh goes on BJ_IO*/ 594 temporary IO buffer). new_bh goes on BJ_IO*/
618 595
619 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 596 set_buffer_jwrite(jh2bh(jh));
620 /* 597 /*
621 * akpm: journal_write_metadata_buffer() sets 598 * akpm: journal_write_metadata_buffer() sets
622 * new_bh->b_transaction to commit_transaction. 599 * new_bh->b_transaction to commit_transaction.
@@ -626,7 +603,7 @@ void journal_commit_transaction(journal_t *journal)
626 JBUFFER_TRACE(jh, "ph3: write metadata"); 603 JBUFFER_TRACE(jh, "ph3: write metadata");
627 flags = journal_write_metadata_buffer(commit_transaction, 604 flags = journal_write_metadata_buffer(commit_transaction,
628 jh, &new_jh, blocknr); 605 jh, &new_jh, blocknr);
629 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 606 set_buffer_jwrite(jh2bh(new_jh));
630 wbuf[bufs++] = jh2bh(new_jh); 607 wbuf[bufs++] = jh2bh(new_jh);
631 608
632 /* Record the new block's tag in the current descriptor 609 /* Record the new block's tag in the current descriptor
@@ -736,7 +713,7 @@ wait_for_iobuf:
736 shadowed buffer */ 713 shadowed buffer */
737 jh = commit_transaction->t_shadow_list->b_tprev; 714 jh = commit_transaction->t_shadow_list->b_tprev;
738 bh = jh2bh(jh); 715 bh = jh2bh(jh);
739 clear_bit(BH_JWrite, &bh->b_state); 716 clear_buffer_jwrite(bh);
740 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 717 J_ASSERT_BH(bh, buffer_jbddirty(bh));
741 718
742 /* The metadata is now released for reuse, but we need 719 /* The metadata is now released for reuse, but we need
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 93d1e47647b..da1b5e4ffce 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -36,6 +36,7 @@
36#include <linux/poison.h> 36#include <linux/poison.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/ratelimit.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/page.h> 42#include <asm/page.h>
@@ -84,6 +85,7 @@ EXPORT_SYMBOL(journal_force_commit);
84 85
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 86static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno); 87static void __journal_abort_soft (journal_t *journal, int errno);
88static const char *journal_dev_name(journal_t *journal, char *buffer);
87 89
88/* 90/*
89 * Helper function used to manage commit timeouts 91 * Helper function used to manage commit timeouts
@@ -439,7 +441,7 @@ int __log_start_commit(journal_t *journal, tid_t target)
439 */ 441 */
440 if (!tid_geq(journal->j_commit_request, target)) { 442 if (!tid_geq(journal->j_commit_request, target)) {
441 /* 443 /*
442 * We want a new commit: OK, mark the request and wakup the 444 * We want a new commit: OK, mark the request and wakeup the
443 * commit thread. We do _not_ do the commit ourselves. 445 * commit thread. We do _not_ do the commit ourselves.
444 */ 446 */
445 447
@@ -950,6 +952,8 @@ int journal_create(journal_t *journal)
950 if (err) 952 if (err)
951 return err; 953 return err;
952 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 954 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
955 if (unlikely(!bh))
956 return -ENOMEM;
953 lock_buffer(bh); 957 lock_buffer(bh);
954 memset (bh->b_data, 0, journal->j_blocksize); 958 memset (bh->b_data, 0, journal->j_blocksize);
955 BUFFER_TRACE(bh, "marking dirty"); 959 BUFFER_TRACE(bh, "marking dirty");
@@ -1010,6 +1014,23 @@ void journal_update_superblock(journal_t *journal, int wait)
1010 goto out; 1014 goto out;
1011 } 1015 }
1012 1016
1017 if (buffer_write_io_error(bh)) {
1018 char b[BDEVNAME_SIZE];
1019 /*
1020 * Oh, dear. A previous attempt to write the journal
1021 * superblock failed. This could happen because the
1022 * USB device was yanked out. Or it could happen to
1023 * be a transient write error and maybe the block will
1024 * be remapped. Nothing we can do but to retry the
1025 * write and hope for the best.
1026 */
1027 printk(KERN_ERR "JBD: previous I/O error detected "
1028 "for journal superblock update for %s.\n",
1029 journal_dev_name(journal, b));
1030 clear_buffer_write_io_error(bh);
1031 set_buffer_uptodate(bh);
1032 }
1033
1013 spin_lock(&journal->j_state_lock); 1034 spin_lock(&journal->j_state_lock);
1014 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", 1035 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
1015 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1036 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1021,10 +1042,18 @@ void journal_update_superblock(journal_t *journal, int wait)
1021 1042
1022 BUFFER_TRACE(bh, "marking dirty"); 1043 BUFFER_TRACE(bh, "marking dirty");
1023 mark_buffer_dirty(bh); 1044 mark_buffer_dirty(bh);
1024 if (wait) 1045 if (wait) {
1025 sync_dirty_buffer(bh); 1046 sync_dirty_buffer(bh);
1026 else 1047 if (buffer_write_io_error(bh)) {
1027 ll_rw_block(SWRITE, 1, &bh); 1048 char b[BDEVNAME_SIZE];
1049 printk(KERN_ERR "JBD: I/O error detected "
1050 "when updating journal superblock for %s.\n",
1051 journal_dev_name(journal, b));
1052 clear_buffer_write_io_error(bh);
1053 set_buffer_uptodate(bh);
1054 }
1055 } else
1056 write_dirty_buffer(bh, WRITE);
1028 1057
1029out: 1058out:
1030 /* If we have just flushed the log (by marking s_start==0), then 1059 /* If we have just flushed the log (by marking s_start==0), then
@@ -1281,13 +1310,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat,
1281int journal_check_available_features (journal_t *journal, unsigned long compat, 1310int journal_check_available_features (journal_t *journal, unsigned long compat,
1282 unsigned long ro, unsigned long incompat) 1311 unsigned long ro, unsigned long incompat)
1283{ 1312{
1284 journal_superblock_t *sb;
1285
1286 if (!compat && !ro && !incompat) 1313 if (!compat && !ro && !incompat)
1287 return 1; 1314 return 1;
1288 1315
1289 sb = journal->j_superblock;
1290
1291 /* We can support any known requested features iff the 1316 /* We can support any known requested features iff the
1292 * superblock is in version 2. Otherwise we fail to support any 1317 * superblock is in version 2. Otherwise we fail to support any
1293 * extended sb features. */ 1318 * extended sb features. */
@@ -1481,7 +1506,6 @@ int journal_flush(journal_t *journal)
1481 1506
1482int journal_wipe(journal_t *journal, int write) 1507int journal_wipe(journal_t *journal, int write)
1483{ 1508{
1484 journal_superblock_t *sb;
1485 int err = 0; 1509 int err = 0;
1486 1510
1487 J_ASSERT (!(journal->j_flags & JFS_LOADED)); 1511 J_ASSERT (!(journal->j_flags & JFS_LOADED));
@@ -1490,8 +1514,6 @@ int journal_wipe(journal_t *journal, int write)
1490 if (err) 1514 if (err)
1491 return err; 1515 return err;
1492 1516
1493 sb = journal->j_superblock;
1494
1495 if (!journal->j_tail) 1517 if (!journal->j_tail)
1496 goto no_recovery; 1518 goto no_recovery;
1497 1519
@@ -1726,7 +1748,6 @@ static void journal_destroy_journal_head_cache(void)
1726static struct journal_head *journal_alloc_journal_head(void) 1748static struct journal_head *journal_alloc_journal_head(void)
1727{ 1749{
1728 struct journal_head *ret; 1750 struct journal_head *ret;
1729 static unsigned long last_warning;
1730 1751
1731#ifdef CONFIG_JBD_DEBUG 1752#ifdef CONFIG_JBD_DEBUG
1732 atomic_inc(&nr_journal_heads); 1753 atomic_inc(&nr_journal_heads);
@@ -1734,11 +1755,9 @@ static struct journal_head *journal_alloc_journal_head(void)
1734 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1755 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1735 if (ret == NULL) { 1756 if (ret == NULL) {
1736 jbd_debug(1, "out of memory for journal_head\n"); 1757 jbd_debug(1, "out of memory for journal_head\n");
1737 if (time_after(jiffies, last_warning + 5*HZ)) { 1758 printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1738 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", 1759 __func__);
1739 __func__); 1760
1740 last_warning = jiffies;
1741 }
1742 while (ret == NULL) { 1761 while (ret == NULL) {
1743 yield(); 1762 yield();
1744 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1763 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 54c9bc9e1b1..5b43e96788e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -283,12 +283,9 @@ int journal_recover(journal_t *journal)
283int journal_skip_recovery(journal_t *journal) 283int journal_skip_recovery(journal_t *journal)
284{ 284{
285 int err; 285 int err;
286 journal_superblock_t * sb;
287
288 struct recovery_info info; 286 struct recovery_info info;
289 287
290 memset (&info, 0, sizeof(info)); 288 memset (&info, 0, sizeof(info));
291 sb = journal->j_superblock;
292 289
293 err = do_one_pass(journal, &info, PASS_SCAN); 290 err = do_one_pass(journal, &info, PASS_SCAN);
294 291
@@ -297,11 +294,12 @@ int journal_skip_recovery(journal_t *journal)
297 ++journal->j_transaction_sequence; 294 ++journal->j_transaction_sequence;
298 } else { 295 } else {
299#ifdef CONFIG_JBD_DEBUG 296#ifdef CONFIG_JBD_DEBUG
300 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 297 int dropped = info.end_transaction -
301#endif 298 be32_to_cpu(journal->j_superblock->s_sequence);
302 jbd_debug(1, 299 jbd_debug(1,
303 "JBD: ignoring %d transaction%s from the journal.\n", 300 "JBD: ignoring %d transaction%s from the journal.\n",
304 dropped, (dropped == 1) ? "" : "s"); 301 dropped, (dropped == 1) ? "" : "s");
302#endif
305 journal->j_transaction_sequence = ++info.end_transaction; 303 journal->j_transaction_sequence = ++info.end_transaction;
306 } 304 }
307 305
@@ -321,11 +319,6 @@ static int do_one_pass(journal_t *journal,
321 unsigned int sequence; 319 unsigned int sequence;
322 int blocktype; 320 int blocktype;
323 321
324 /* Precompute the maximum metadata descriptors in a descriptor block */
325 int MAX_BLOCKS_PER_DESC;
326 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
327 / sizeof(journal_block_tag_t));
328
329 /* 322 /*
330 * First thing is to establish what we expect to find in the log 323 * First thing is to establish what we expect to find in the log
331 * (in terms of transaction IDs), and where (in terms of log 324 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad717328343..d29018307e2 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
617 set_buffer_jwrite(bh); 617 set_buffer_jwrite(bh);
618 BUFFER_TRACE(bh, "write"); 618 BUFFER_TRACE(bh, "write");
619 set_buffer_dirty(bh); 619 set_buffer_dirty(bh);
620 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); 620 write_dirty_buffer(bh, write_op);
621} 621}
622#endif 622#endif
623 623
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5ae71e75a49..846a3f31411 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -293,9 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
293 jbd_free_handle(handle); 293 jbd_free_handle(handle);
294 current->journal_info = NULL; 294 current->journal_info = NULL;
295 handle = ERR_PTR(err); 295 handle = ERR_PTR(err);
296 goto out;
297 } 296 }
298out:
299 return handle; 297 return handle;
300} 298}
301 299
@@ -528,7 +526,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
528 transaction = handle->h_transaction; 526 transaction = handle->h_transaction;
529 journal = transaction->t_journal; 527 journal = transaction->t_journal;
530 528
531 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 529 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
532 530
533 JBUFFER_TRACE(jh, "entry"); 531 JBUFFER_TRACE(jh, "entry");
534repeat: 532repeat:
@@ -713,7 +711,7 @@ done:
713 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 711 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
714 "Possible IO failure.\n"); 712 "Possible IO failure.\n");
715 page = jh2bh(jh)->b_page; 713 page = jh2bh(jh)->b_page;
716 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 714 offset = offset_in_page(jh2bh(jh)->b_data);
717 source = kmap_atomic(page, KM_USER0); 715 source = kmap_atomic(page, KM_USER0);
718 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 716 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
719 kunmap_atomic(source, KM_USER0); 717 kunmap_atomic(source, KM_USER0);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 076d1cc44f9..6a79fd0a1a3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
118void __jbd2_log_wait_for_space(journal_t *journal) 118void __jbd2_log_wait_for_space(journal_t *journal)
119{ 119{
120 int nblocks, space_left; 120 int nblocks, space_left;
121 assert_spin_locked(&journal->j_state_lock); 121 /* assert_spin_locked(&journal->j_state_lock); */
122 122
123 nblocks = jbd_space_needed(journal); 123 nblocks = jbd_space_needed(journal);
124 while (__jbd2_log_space_left(journal) < nblocks) { 124 while (__jbd2_log_space_left(journal) < nblocks) {
125 if (journal->j_flags & JBD2_ABORT) 125 if (journal->j_flags & JBD2_ABORT)
126 return; 126 return;
127 spin_unlock(&journal->j_state_lock); 127 write_unlock(&journal->j_state_lock);
128 mutex_lock(&journal->j_checkpoint_mutex); 128 mutex_lock(&journal->j_checkpoint_mutex);
129 129
130 /* 130 /*
@@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
138 * filesystem, so abort the journal and leave a stack 138 * filesystem, so abort the journal and leave a stack
139 * trace for forensic evidence. 139 * trace for forensic evidence.
140 */ 140 */
141 spin_lock(&journal->j_state_lock); 141 write_lock(&journal->j_state_lock);
142 spin_lock(&journal->j_list_lock); 142 spin_lock(&journal->j_list_lock);
143 nblocks = jbd_space_needed(journal); 143 nblocks = jbd_space_needed(journal);
144 space_left = __jbd2_log_space_left(journal); 144 space_left = __jbd2_log_space_left(journal);
@@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
149 if (journal->j_committing_transaction) 149 if (journal->j_committing_transaction)
150 tid = journal->j_committing_transaction->t_tid; 150 tid = journal->j_committing_transaction->t_tid;
151 spin_unlock(&journal->j_list_lock); 151 spin_unlock(&journal->j_list_lock);
152 spin_unlock(&journal->j_state_lock); 152 write_unlock(&journal->j_state_lock);
153 if (chkpt) { 153 if (chkpt) {
154 jbd2_log_do_checkpoint(journal); 154 jbd2_log_do_checkpoint(journal);
155 } else if (jbd2_cleanup_journal_tail(journal) == 0) { 155 } else if (jbd2_cleanup_journal_tail(journal) == 0) {
@@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
167 WARN_ON(1); 167 WARN_ON(1);
168 jbd2_journal_abort(journal, 0); 168 jbd2_journal_abort(journal, 0);
169 } 169 }
170 spin_lock(&journal->j_state_lock); 170 write_lock(&journal->j_state_lock);
171 } else { 171 } else {
172 spin_unlock(&journal->j_list_lock); 172 spin_unlock(&journal->j_list_lock);
173 } 173 }
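
These hunks belong to a series-wide conversion of journal->j_state_lock from a spinlock_t to an rwlock_t: paths that only inspect journal state now take the lock shared, while mutating paths keep full exclusion. A sketch of the pattern with illustrative names, not a verbatim excerpt from the patch:

#include <linux/spinlock.h>	/* rwlock_t, DEFINE_RWLOCK() */

static DEFINE_RWLOCK(state_lock);	/* was: spinlock_t + spin_lock_init() */

static int peek_flags(const unsigned long *flags)
{
	int aborted;

	read_lock(&state_lock);		/* readers may run concurrently */
	aborted = (*flags & 1) != 0;
	read_unlock(&state_lock);
	return aborted;
}

static void set_flag(unsigned long *flags)
{
	write_lock(&state_lock);	/* writers remain fully exclusive */
	*flags |= 1;
	write_unlock(&state_lock);
}

Note that the commented-out assert_spin_locked() above follows from the same change: there was no equivalent assertion for rwlocks at the time.
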
@@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count)
255{ 255{
256 int i; 256 int i;
257 257
258 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); 258 for (i = 0; i < *batch_count; i++)
259 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE);
260
259 for (i = 0; i < *batch_count; i++) { 261 for (i = 0; i < *batch_count; i++) {
260 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 262 struct buffer_head *bh = journal->j_chkpt_bhs[i];
261 clear_buffer_jwrite(bh); 263 clear_buffer_jwrite(bh);
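
ll_rw_block()'s SWRITE modes (lock the buffer, write it only if still dirty) were removed from the block layer, so the checkpoint batch is now pushed out one buffer at a time with write_dirty_buffer(), which performs the same lock/test-and-clear-dirty/submit sequence. A sketch, assuming the caller holds one reference per buffer; the jbd2-private BH_JWrite bookkeeping from the real __flush_batch() is omitted:

#include <linux/fs.h>
#include <linux/buffer_head.h>

static void flush_batch(struct buffer_head **bhs, int count)
{
	int i;

	for (i = 0; i < count; i++)
		write_dirty_buffer(bhs[i], WRITE);	/* lock, clear dirty, submit_bh() */

	for (i = 0; i < count; i++)
		brelse(bhs[i]);		/* drop the reference the batch held */
}
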
@@ -297,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
297 transaction->t_chp_stats.cs_forced_to_close++; 299 transaction->t_chp_stats.cs_forced_to_close++;
298 spin_unlock(&journal->j_list_lock); 300 spin_unlock(&journal->j_list_lock);
299 jbd_unlock_bh_state(bh); 301 jbd_unlock_bh_state(bh);
302 if (unlikely(journal->j_flags & JBD2_UNMOUNT))
303 /*
304 * The journal thread is dead; so starting and
305 * waiting for a commit to finish will cause
306 * us to wait for a _very_ long time.
307 */
308 printk(KERN_ERR "JBD2: %s: "
309 "Waiting for Godot: block %llu\n",
310 journal->j_devname,
311 (unsigned long long) bh->b_blocknr);
300 jbd2_log_start_commit(journal, tid); 312 jbd2_log_start_commit(journal, tid);
301 jbd2_log_wait_commit(journal, tid); 313 jbd2_log_wait_commit(journal, tid);
302 ret = 1; 314 ret = 1;
@@ -474,7 +486,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
474 * next transaction ID we will write, and where it will 486 * next transaction ID we will write, and where it will
475 * start. */ 487 * start. */
476 488
477 spin_lock(&journal->j_state_lock); 489 write_lock(&journal->j_state_lock);
478 spin_lock(&journal->j_list_lock); 490 spin_lock(&journal->j_list_lock);
479 transaction = journal->j_checkpoint_transactions; 491 transaction = journal->j_checkpoint_transactions;
480 if (transaction) { 492 if (transaction) {
@@ -496,7 +508,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
496 /* If the oldest pinned transaction is at the tail of the log 508 /* If the oldest pinned transaction is at the tail of the log
497 already then there's not much we can do right now. */ 509 already then there's not much we can do right now. */
498 if (journal->j_tail_sequence == first_tid) { 510 if (journal->j_tail_sequence == first_tid) {
499 spin_unlock(&journal->j_state_lock); 511 write_unlock(&journal->j_state_lock);
500 return 1; 512 return 1;
501 } 513 }
502 514
@@ -516,7 +528,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
516 journal->j_free += freed; 528 journal->j_free += freed;
517 journal->j_tail_sequence = first_tid; 529 journal->j_tail_sequence = first_tid;
518 journal->j_tail = blocknr; 530 journal->j_tail = blocknr;
519 spin_unlock(&journal->j_state_lock); 531 write_unlock(&journal->j_state_lock);
520 532
521 /* 533 /*
522 * If there is an external journal, we need to make sure that 534 * If there is an external journal, we need to make sure that
@@ -530,8 +542,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 542 */
531 if ((journal->j_fs_dev != journal->j_dev) && 543 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 544 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 545 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
534 BLKDEV_IFL_WAIT);
535 if (!(journal->j_flags & JBD2_ABORT)) 546 if (!(journal->j_flags & JBD2_ABORT))
536 jbd2_journal_update_superblock(journal, 1); 547 jbd2_journal_update_superblock(journal, 1);
537 return 0; 548 return 0;
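
Both flush call sites in this file pick up the blkdev_issue_flush() API change that removed the flags argument: BLKDEV_IFL_WAIT is gone because the function now always waits for the flush to complete before returning. A sketch of the call before and after, with a hypothetical wrapper name:

#include <linux/blkdev.h>

static int flush_external_journal_dev(struct block_device *fs_dev)
{
	/* old: blkdev_issue_flush(fs_dev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); */
	return blkdev_issue_flush(fs_dev, GFP_KERNEL, NULL);	/* error_sector unused */
}
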
@@ -775,7 +786,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
775 J_ASSERT(transaction->t_log_list == NULL); 786 J_ASSERT(transaction->t_log_list == NULL);
776 J_ASSERT(transaction->t_checkpoint_list == NULL); 787 J_ASSERT(transaction->t_checkpoint_list == NULL);
777 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 788 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
778 J_ASSERT(transaction->t_updates == 0); 789 J_ASSERT(atomic_read(&transaction->t_updates) == 0);
779 J_ASSERT(journal->j_committing_transaction != transaction); 790 J_ASSERT(journal->j_committing_transaction != transaction);
780 J_ASSERT(journal->j_running_transaction != transaction); 791 J_ASSERT(journal->j_running_transaction != transaction);
781 792
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 75716d3d2be..f3ad1598b20 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -26,7 +26,9 @@
26#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
27#include <linux/bio.h> 27#include <linux/bio.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/bitops.h>
29#include <trace/events/jbd2.h> 30#include <trace/events/jbd2.h>
31#include <asm/system.h>
30 32
31/* 33/*
32 * Default IO end handler for temporary BJ_IO buffer_heads. 34 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -101,7 +103,6 @@ static int journal_submit_commit_record(journal_t *journal,
101 struct commit_header *tmp; 103 struct commit_header *tmp;
102 struct buffer_head *bh; 104 struct buffer_head *bh;
103 int ret; 105 int ret;
104 int barrier_done = 0;
105 struct timespec now = current_kernel_time(); 106 struct timespec now = current_kernel_time();
106 107
107 if (is_journal_aborted(journal)) 108 if (is_journal_aborted(journal))
@@ -135,33 +136,11 @@ static int journal_submit_commit_record(journal_t *journal,
135 136
136 if (journal->j_flags & JBD2_BARRIER && 137 if (journal->j_flags & JBD2_BARRIER &&
137 !JBD2_HAS_INCOMPAT_FEATURE(journal, 138 !JBD2_HAS_INCOMPAT_FEATURE(journal,
138 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
139 set_buffer_ordered(bh); 140 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
140 barrier_done = 1; 141 else
141 }
142 ret = submit_bh(WRITE_SYNC_PLUG, bh);
143 if (barrier_done)
144 clear_buffer_ordered(bh);
145
146 /* is it possible for another commit to fail at roughly
147 * the same time as this one? If so, we don't want to
148 * trust the barrier flag in the super, but instead want
149 * to remember if we sent a barrier request
150 */
151 if (ret == -EOPNOTSUPP && barrier_done) {
152 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n", journal->j_devname);
155 spin_lock(&journal->j_state_lock);
156 journal->j_flags &= ~JBD2_BARRIER;
157 spin_unlock(&journal->j_state_lock);
158
159 /* And try again, without the barrier */
160 lock_buffer(bh);
161 set_buffer_uptodate(bh);
162 clear_buffer_dirty(bh);
163 ret = submit_bh(WRITE_SYNC_PLUG, bh); 142 ret = submit_bh(WRITE_SYNC_PLUG, bh);
164 } 143
165 *cbh = bh; 144 *cbh = bh;
166 return ret; 145 return ret;
167} 146}
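
With barriers replaced by FLUSH/FUA in the block layer, the ordered-buffer dance and the -EOPNOTSUPP "disable barriers and retry" fallback both disappear: a WRITE_FLUSH_FUA request means "flush the device cache, then write this block through to stable media", and the block layer degrades the FUA part itself on hardware that lacks it. A sketch of the simplified submission under an illustrative wrapper name:

#include <linux/fs.h>		/* WRITE_SYNC_PLUG, WRITE_FLUSH_FUA */
#include <linux/buffer_head.h>

static int submit_commit_record(struct buffer_head *bh, int use_barrier)
{
	if (use_barrier)	/* JBD2_BARRIER set and async commit not in use */
		return submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
	return submit_bh(WRITE_SYNC_PLUG, bh);
}

The same reasoning removes the retry block in journal_wait_on_commit_record() below: there is no longer a barrier failure mode to fall back from.
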
@@ -175,29 +154,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
175{ 154{
176 int ret = 0; 155 int ret = 0;
177 156
178retry:
179 clear_buffer_dirty(bh); 157 clear_buffer_dirty(bh);
180 wait_on_buffer(bh); 158 wait_on_buffer(bh);
181 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
182 printk(KERN_WARNING
183 "JBD2: wait_on_commit_record: sync failed on %s - "
184 "disabling barriers\n", journal->j_devname);
185 spin_lock(&journal->j_state_lock);
186 journal->j_flags &= ~JBD2_BARRIER;
187 spin_unlock(&journal->j_state_lock);
188
189 lock_buffer(bh);
190 clear_buffer_dirty(bh);
191 set_buffer_uptodate(bh);
192 bh->b_end_io = journal_end_buffer_io_sync;
193
194 ret = submit_bh(WRITE_SYNC_PLUG, bh);
195 if (ret) {
196 unlock_buffer(bh);
197 return ret;
198 }
199 goto retry;
200 }
201 159
202 if (unlikely(!buffer_uptodate(bh))) 160 if (unlikely(!buffer_uptodate(bh)))
203 ret = -EIO; 161 ret = -EIO;
@@ -245,7 +203,7 @@ static int journal_submit_data_buffers(journal_t *journal,
245 spin_lock(&journal->j_list_lock); 203 spin_lock(&journal->j_list_lock);
246 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 204 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
247 mapping = jinode->i_vfs_inode->i_mapping; 205 mapping = jinode->i_vfs_inode->i_mapping;
248 jinode->i_flags |= JI_COMMIT_RUNNING; 206 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
249 spin_unlock(&journal->j_list_lock); 207 spin_unlock(&journal->j_list_lock);
250 /* 208 /*
251 * submit the inode data buffers. We use writepage 209 * submit the inode data buffers. We use writepage
@@ -260,7 +218,8 @@ static int journal_submit_data_buffers(journal_t *journal,
260 spin_lock(&journal->j_list_lock); 218 spin_lock(&journal->j_list_lock);
261 J_ASSERT(jinode->i_transaction == commit_transaction); 219 J_ASSERT(jinode->i_transaction == commit_transaction);
262 commit_transaction->t_flushed_data_blocks = 1; 220 commit_transaction->t_flushed_data_blocks = 1;
263 jinode->i_flags &= ~JI_COMMIT_RUNNING; 221 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
222 smp_mb__after_clear_bit();
264 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 223 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
265 } 224 }
266 spin_unlock(&journal->j_list_lock); 225 spin_unlock(&journal->j_list_lock);
@@ -281,7 +240,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
281 /* For locking, see the comment in journal_submit_data_buffers() */ 240 /* For locking, see the comment in journal_submit_data_buffers() */
282 spin_lock(&journal->j_list_lock); 241 spin_lock(&journal->j_list_lock);
283 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 242 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
284 jinode->i_flags |= JI_COMMIT_RUNNING; 243 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
285 spin_unlock(&journal->j_list_lock); 244 spin_unlock(&journal->j_list_lock);
286 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); 245 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
287 if (err) { 246 if (err) {
@@ -297,7 +256,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
297 ret = err; 256 ret = err;
298 } 257 }
299 spin_lock(&journal->j_list_lock); 258 spin_lock(&journal->j_list_lock);
300 jinode->i_flags &= ~JI_COMMIT_RUNNING; 259 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
260 smp_mb__after_clear_bit();
301 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 261 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
302 } 262 }
303 263
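
JI_COMMIT_RUNNING handling moves from open-coded flag arithmetic under j_list_lock to atomic bitops plus the wait_on_bit machinery. The smp_mb__after_clear_bit() is what makes the handoff safe: it orders the clear before the wake_up_bit() waitqueue scan, so a waiter cannot recheck a stale bit and sleep forever. A sketch of the committing side of the protocol (the bit number stands in for __JI_COMMIT_RUNNING):

#include <linux/bitops.h>
#include <linux/wait.h>

#define COMMIT_RUNNING	0	/* stands in for __JI_COMMIT_RUNNING */

static void writeback_under_flag(unsigned long *i_flags)
{
	set_bit(COMMIT_RUNNING, i_flags);	/* set while holding j_list_lock */
	/* ... submit or wait on the inode's data buffers ... */
	clear_bit(COMMIT_RUNNING, i_flags);
	smp_mb__after_clear_bit();		/* clear must precede the wakeup scan */
	wake_up_bit(i_flags, COMMIT_RUNNING);
}

The matching waiter, built with DEFINE_WAIT_BIT()/bit_waitqueue(), appears in jbd2_journal_release_jbd_inode() further down this diff.
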
@@ -369,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
369 int tag_bytes = journal_tag_bytes(journal); 329 int tag_bytes = journal_tag_bytes(journal);
370 struct buffer_head *cbh = NULL; /* For transactional checksums */ 330 struct buffer_head *cbh = NULL; /* For transactional checksums */
371 __u32 crc32_sum = ~0; 331 __u32 crc32_sum = ~0;
372 int write_op = WRITE; 332 int write_op = WRITE_SYNC;
373 333
374 /* 334 /*
375 * First job: lock down the current transaction and wait for 335 * First job: lock down the current transaction and wait for
@@ -400,7 +360,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
400 jbd_debug(1, "JBD: starting commit of transaction %d\n", 360 jbd_debug(1, "JBD: starting commit of transaction %d\n",
401 commit_transaction->t_tid); 361 commit_transaction->t_tid);
402 362
403 spin_lock(&journal->j_state_lock); 363 write_lock(&journal->j_state_lock);
404 commit_transaction->t_state = T_LOCKED; 364 commit_transaction->t_state = T_LOCKED;
405 365
406 /* 366 /*
@@ -417,23 +377,23 @@ void jbd2_journal_commit_transaction(journal_t *journal)
417 stats.run.rs_locked); 377 stats.run.rs_locked);
418 378
419 spin_lock(&commit_transaction->t_handle_lock); 379 spin_lock(&commit_transaction->t_handle_lock);
420 while (commit_transaction->t_updates) { 380 while (atomic_read(&commit_transaction->t_updates)) {
421 DEFINE_WAIT(wait); 381 DEFINE_WAIT(wait);
422 382
423 prepare_to_wait(&journal->j_wait_updates, &wait, 383 prepare_to_wait(&journal->j_wait_updates, &wait,
424 TASK_UNINTERRUPTIBLE); 384 TASK_UNINTERRUPTIBLE);
425 if (commit_transaction->t_updates) { 385 if (atomic_read(&commit_transaction->t_updates)) {
426 spin_unlock(&commit_transaction->t_handle_lock); 386 spin_unlock(&commit_transaction->t_handle_lock);
427 spin_unlock(&journal->j_state_lock); 387 write_unlock(&journal->j_state_lock);
428 schedule(); 388 schedule();
429 spin_lock(&journal->j_state_lock); 389 write_lock(&journal->j_state_lock);
430 spin_lock(&commit_transaction->t_handle_lock); 390 spin_lock(&commit_transaction->t_handle_lock);
431 } 391 }
432 finish_wait(&journal->j_wait_updates, &wait); 392 finish_wait(&journal->j_wait_updates, &wait);
433 } 393 }
434 spin_unlock(&commit_transaction->t_handle_lock); 394 spin_unlock(&commit_transaction->t_handle_lock);
435 395
436 J_ASSERT (commit_transaction->t_outstanding_credits <= 396 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
437 journal->j_max_transaction_buffers); 397 journal->j_max_transaction_buffers);
438 398
439 /* 399 /*
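
Because t_updates is now an atomic_t updated without t_handle_lock, the commit code drains outstanding handles with the classic prepare-to-wait pattern: recheck the counter after queuing on j_wait_updates, so a wakeup arriving between the test and schedule() cannot be lost. A sketch of just that pattern (the real loop above additionally brackets the recheck with t_handle_lock and j_state_lock):

#include <asm/atomic.h>
#include <linux/sched.h>
#include <linux/wait.h>

static void drain_updates(wait_queue_head_t *wq, atomic_t *t_updates)
{
	while (atomic_read(t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(t_updates))	/* recheck after queuing */
			schedule();		/* woken by the last handle stopping */
		finish_wait(wq, &wait);
	}
}
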
@@ -497,7 +457,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
497 start_time = ktime_get(); 457 start_time = ktime_get();
498 commit_transaction->t_log_start = journal->j_head; 458 commit_transaction->t_log_start = journal->j_head;
499 wake_up(&journal->j_wait_transaction_locked); 459 wake_up(&journal->j_wait_transaction_locked);
500 spin_unlock(&journal->j_state_lock); 460 write_unlock(&journal->j_state_lock);
501 461
502 jbd_debug (3, "JBD: commit phase 2\n"); 462 jbd_debug (3, "JBD: commit phase 2\n");
503 463
@@ -519,19 +479,20 @@ void jbd2_journal_commit_transaction(journal_t *journal)
519 * transaction! Now comes the tricky part: we need to write out 479 * transaction! Now comes the tricky part: we need to write out
520 * metadata. Loop over the transaction's entire buffer list: 480 * metadata. Loop over the transaction's entire buffer list:
521 */ 481 */
522 spin_lock(&journal->j_state_lock); 482 write_lock(&journal->j_state_lock);
523 commit_transaction->t_state = T_COMMIT; 483 commit_transaction->t_state = T_COMMIT;
524 spin_unlock(&journal->j_state_lock); 484 write_unlock(&journal->j_state_lock);
525 485
526 trace_jbd2_commit_logging(journal, commit_transaction); 486 trace_jbd2_commit_logging(journal, commit_transaction);
527 stats.run.rs_logging = jiffies; 487 stats.run.rs_logging = jiffies;
528 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, 488 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
529 stats.run.rs_logging); 489 stats.run.rs_logging);
530 stats.run.rs_blocks = commit_transaction->t_outstanding_credits; 490 stats.run.rs_blocks =
491 atomic_read(&commit_transaction->t_outstanding_credits);
531 stats.run.rs_blocks_logged = 0; 492 stats.run.rs_blocks_logged = 0;
532 493
533 J_ASSERT(commit_transaction->t_nr_buffers <= 494 J_ASSERT(commit_transaction->t_nr_buffers <=
534 commit_transaction->t_outstanding_credits); 495 atomic_read(&commit_transaction->t_outstanding_credits));
535 496
536 err = 0; 497 err = 0;
537 descriptor = NULL; 498 descriptor = NULL;
@@ -616,7 +577,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 * the free space in the log, but this counter is changed 577 * the free space in the log, but this counter is changed
617 * by jbd2_journal_next_log_block() also. 578 * by jbd2_journal_next_log_block() also.
618 */ 579 */
619 commit_transaction->t_outstanding_credits--; 580 atomic_dec(&commit_transaction->t_outstanding_credits);
620 581
621 /* Bump b_count to prevent truncate from stumbling over 582 /* Bump b_count to prevent truncate from stumbling over
622 the shadowed buffer! @@@ This can go if we ever get 583 the shadowed buffer! @@@ This can go if we ever get
@@ -709,6 +670,16 @@ start_journal_io:
709 } 670 }
710 } 671 }
711 672
673 err = journal_finish_inode_data_buffers(journal, commit_transaction);
674 if (err) {
675 printk(KERN_WARNING
676 "JBD2: Detected IO errors while flushing file data "
677 "on %s\n", journal->j_devname);
678 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
679 jbd2_journal_abort(journal, err);
680 err = 0;
681 }
682
712 /* 683 /*
713 * If the journal is not located on the file system device, 684 * If the journal is not located on the file system device,
714 * then we must flush the file system device before we issue 685 * then we must flush the file system device before we issue
@@ -717,8 +688,7 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 688 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 689 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 690 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 691 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
721 BLKDEV_IFL_WAIT);
722 692
723 /* Done it all: now write the commit record asynchronously. */ 693 /* Done it all: now write the commit record asynchronously. */
724 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 694 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,19 +697,6 @@ start_journal_io:
727 &cbh, crc32_sum); 697 &cbh, crc32_sum);
728 if (err) 698 if (err)
729 __jbd2_journal_abort_hard(journal); 699 __jbd2_journal_abort_hard(journal);
730 if (journal->j_flags & JBD2_BARRIER)
731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
733 }
734
735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
736 if (err) {
737 printk(KERN_WARNING
738 "JBD2: Detected IO errors while flushing file data "
739 "on %s\n", journal->j_devname);
740 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
741 jbd2_journal_abort(journal, err);
742 err = 0;
743 } 700 }
744 701
745 /* Lo and behold: we have just managed to send a transaction to 702 /* Lo and behold: we have just managed to send a transaction to
@@ -853,6 +810,11 @@ wait_for_iobuf:
853 } 810 }
854 if (!err && !is_journal_aborted(journal)) 811 if (!err && !is_journal_aborted(journal))
855 err = journal_wait_on_commit_record(journal, cbh); 812 err = journal_wait_on_commit_record(journal, cbh);
813 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
815 journal->j_flags & JBD2_BARRIER) {
816 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
817 }
856 818
857 if (err) 819 if (err)
858 jbd2_journal_abort(journal, err); 820 jbd2_journal_abort(journal, err);
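
This hunk is the flip side of dropping the old barrier path: with async commit, the commit record is written without implicit ordering, so once journal_wait_on_commit_record() returns, an explicit cache flush on the journal device makes the record durable before checkpointing may rely on it. A sketch under the diff's flag names, wrapped in a hypothetical helper:

#include <linux/blkdev.h>
#include <linux/jbd2.h>

static void make_commit_record_durable(journal_t *journal, int async_commit)
{
	if (async_commit && (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
}
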
@@ -977,7 +939,7 @@ restart_loop:
977 * __jbd2_journal_drop_transaction(). Otherwise we could race with 939 * __jbd2_journal_drop_transaction(). Otherwise we could race with
978 * other checkpointing code processing the transaction... 940 * other checkpointing code processing the transaction...
979 */ 941 */
980 spin_lock(&journal->j_state_lock); 942 write_lock(&journal->j_state_lock);
981 spin_lock(&journal->j_list_lock); 943 spin_lock(&journal->j_list_lock);
982 /* 944 /*
983 * Now recheck if some buffers did not get attached to the transaction 945 * Now recheck if some buffers did not get attached to the transaction
@@ -985,7 +947,7 @@ restart_loop:
985 */ 947 */
986 if (commit_transaction->t_forget) { 948 if (commit_transaction->t_forget) {
987 spin_unlock(&journal->j_list_lock); 949 spin_unlock(&journal->j_list_lock);
988 spin_unlock(&journal->j_state_lock); 950 write_unlock(&journal->j_state_lock);
989 goto restart_loop; 951 goto restart_loop;
990 } 952 }
991 953
@@ -1003,7 +965,8 @@ restart_loop:
1003 * File the transaction statistics 965 * File the transaction statistics
1004 */ 966 */
1005 stats.ts_tid = commit_transaction->t_tid; 967 stats.ts_tid = commit_transaction->t_tid;
1006 stats.run.rs_handle_count = commit_transaction->t_handle_count; 968 stats.run.rs_handle_count =
969 atomic_read(&commit_transaction->t_handle_count);
1007 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 970 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1008 commit_transaction->t_tid, &stats.run); 971 commit_transaction->t_tid, &stats.run);
1009 972
@@ -1037,7 +1000,7 @@ restart_loop:
1037 journal->j_average_commit_time*3) / 4; 1000 journal->j_average_commit_time*3) / 4;
1038 else 1001 else
1039 journal->j_average_commit_time = commit_time; 1002 journal->j_average_commit_time = commit_time;
1040 spin_unlock(&journal->j_state_lock); 1003 write_unlock(&journal->j_state_lock);
1041 1004
1042 if (commit_transaction->t_checkpoint_list == NULL && 1005 if (commit_transaction->t_checkpoint_list == NULL &&
1043 commit_transaction->t_checkpoint_io_list == NULL) { 1006 commit_transaction->t_checkpoint_io_list == NULL) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 036880895bf..c590d155c09 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -41,15 +41,16 @@
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h> 42#include <linux/log2.h>
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h>
45#include <linux/bitops.h>
44 46
45#define CREATE_TRACE_POINTS 47#define CREATE_TRACE_POINTS
46#include <trace/events/jbd2.h> 48#include <trace/events/jbd2.h>
47 49
48#include <asm/uaccess.h> 50#include <asm/uaccess.h>
49#include <asm/page.h> 51#include <asm/page.h>
52#include <asm/system.h>
50 53
51EXPORT_SYMBOL(jbd2_journal_start);
52EXPORT_SYMBOL(jbd2_journal_restart);
53EXPORT_SYMBOL(jbd2_journal_extend); 54EXPORT_SYMBOL(jbd2_journal_extend);
54EXPORT_SYMBOL(jbd2_journal_stop); 55EXPORT_SYMBOL(jbd2_journal_stop);
55EXPORT_SYMBOL(jbd2_journal_lock_updates); 56EXPORT_SYMBOL(jbd2_journal_lock_updates);
@@ -143,7 +144,7 @@ static int kjournald2(void *arg)
143 /* 144 /*
144 * And now, wait forever for commit wakeup events. 145 * And now, wait forever for commit wakeup events.
145 */ 146 */
146 spin_lock(&journal->j_state_lock); 147 write_lock(&journal->j_state_lock);
147 148
148loop: 149loop:
149 if (journal->j_flags & JBD2_UNMOUNT) 150 if (journal->j_flags & JBD2_UNMOUNT)
@@ -154,10 +155,10 @@ loop:
154 155
155 if (journal->j_commit_sequence != journal->j_commit_request) { 156 if (journal->j_commit_sequence != journal->j_commit_request) {
156 jbd_debug(1, "OK, requests differ\n"); 157 jbd_debug(1, "OK, requests differ\n");
157 spin_unlock(&journal->j_state_lock); 158 write_unlock(&journal->j_state_lock);
158 del_timer_sync(&journal->j_commit_timer); 159 del_timer_sync(&journal->j_commit_timer);
159 jbd2_journal_commit_transaction(journal); 160 jbd2_journal_commit_transaction(journal);
160 spin_lock(&journal->j_state_lock); 161 write_lock(&journal->j_state_lock);
161 goto loop; 162 goto loop;
162 } 163 }
163 164
@@ -169,9 +170,9 @@ loop:
169 * be already stopped. 170 * be already stopped.
170 */ 171 */
171 jbd_debug(1, "Now suspending kjournald2\n"); 172 jbd_debug(1, "Now suspending kjournald2\n");
172 spin_unlock(&journal->j_state_lock); 173 write_unlock(&journal->j_state_lock);
173 refrigerator(); 174 refrigerator();
174 spin_lock(&journal->j_state_lock); 175 write_lock(&journal->j_state_lock);
175 } else { 176 } else {
176 /* 177 /*
177 * We assume on resume that commits are already there, 178 * We assume on resume that commits are already there,
@@ -191,9 +192,9 @@ loop:
191 if (journal->j_flags & JBD2_UNMOUNT) 192 if (journal->j_flags & JBD2_UNMOUNT)
192 should_sleep = 0; 193 should_sleep = 0;
193 if (should_sleep) { 194 if (should_sleep) {
194 spin_unlock(&journal->j_state_lock); 195 write_unlock(&journal->j_state_lock);
195 schedule(); 196 schedule();
196 spin_lock(&journal->j_state_lock); 197 write_lock(&journal->j_state_lock);
197 } 198 }
198 finish_wait(&journal->j_wait_commit, &wait); 199 finish_wait(&journal->j_wait_commit, &wait);
199 } 200 }
@@ -211,7 +212,7 @@ loop:
211 goto loop; 212 goto loop;
212 213
213end_loop: 214end_loop:
214 spin_unlock(&journal->j_state_lock); 215 write_unlock(&journal->j_state_lock);
215 del_timer_sync(&journal->j_commit_timer); 216 del_timer_sync(&journal->j_commit_timer);
216 journal->j_task = NULL; 217 journal->j_task = NULL;
217 wake_up(&journal->j_wait_done_commit); 218 wake_up(&journal->j_wait_done_commit);
@@ -234,16 +235,16 @@ static int jbd2_journal_start_thread(journal_t *journal)
234 235
235static void journal_kill_thread(journal_t *journal) 236static void journal_kill_thread(journal_t *journal)
236{ 237{
237 spin_lock(&journal->j_state_lock); 238 write_lock(&journal->j_state_lock);
238 journal->j_flags |= JBD2_UNMOUNT; 239 journal->j_flags |= JBD2_UNMOUNT;
239 240
240 while (journal->j_task) { 241 while (journal->j_task) {
241 wake_up(&journal->j_wait_commit); 242 wake_up(&journal->j_wait_commit);
242 spin_unlock(&journal->j_state_lock); 243 write_unlock(&journal->j_state_lock);
243 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 244 wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
244 spin_lock(&journal->j_state_lock); 245 write_lock(&journal->j_state_lock);
245 } 246 }
246 spin_unlock(&journal->j_state_lock); 247 write_unlock(&journal->j_state_lock);
247} 248}
248 249
249/* 250/*
@@ -310,7 +311,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
310 */ 311 */
311 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 312 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
312 313
313 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 314retry_alloc:
315 new_bh = alloc_buffer_head(GFP_NOFS);
316 if (!new_bh) {
317 /*
318 * Failure is not an option, but __GFP_NOFAIL is going
319 * away; so we retry ourselves here.
320 */
321 congestion_wait(BLK_RW_ASYNC, HZ/50);
322 goto retry_alloc;
323 }
324
314 /* keep subsequent assertions sane */ 325 /* keep subsequent assertions sane */
315 new_bh->b_state = 0; 326 new_bh->b_state = 0;
316 init_buffer(new_bh, NULL, NULL); 327 init_buffer(new_bh, NULL, NULL);
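
__GFP_NOFAIL was being phased out, so this "must not fail" allocation becomes an explicit retry loop: back off briefly on block-device congestion, then try again. The same idiom reappears in start_this_handle() later in this diff for the transaction allocation. A sketch with a hypothetical wrapper name:

#include <linux/buffer_head.h>
#include <linux/backing-dev.h>	/* congestion_wait() */

static struct buffer_head *alloc_bh_nofail(void)
{
	struct buffer_head *bh;

	while (!(bh = alloc_buffer_head(GFP_NOFS)))
		congestion_wait(BLK_RW_ASYNC, HZ/50);	/* ~20ms backoff, then retry */
	return bh;
}
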
@@ -442,7 +453,7 @@ int __jbd2_log_space_left(journal_t *journal)
442{ 453{
443 int left = journal->j_free; 454 int left = journal->j_free;
444 455
445 assert_spin_locked(&journal->j_state_lock); 456 /* assert_spin_locked(&journal->j_state_lock); */
446 457
447 /* 458 /*
448 * Be pessimistic here about the number of those free blocks which 459 * Be pessimistic here about the number of those free blocks which
@@ -469,7 +480,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
469 */ 480 */
470 if (!tid_geq(journal->j_commit_request, target)) { 481 if (!tid_geq(journal->j_commit_request, target)) {
471 /* 482 /*
472 * We want a new commit: OK, mark the request and wakup the 483 * We want a new commit: OK, mark the request and wakeup the
473 * commit thread. We do _not_ do the commit ourselves. 484 * commit thread. We do _not_ do the commit ourselves.
474 */ 485 */
475 486
@@ -487,9 +498,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
487{ 498{
488 int ret; 499 int ret;
489 500
490 spin_lock(&journal->j_state_lock); 501 write_lock(&journal->j_state_lock);
491 ret = __jbd2_log_start_commit(journal, tid); 502 ret = __jbd2_log_start_commit(journal, tid);
492 spin_unlock(&journal->j_state_lock); 503 write_unlock(&journal->j_state_lock);
493 return ret; 504 return ret;
494} 505}
495 506
@@ -508,7 +519,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
508 transaction_t *transaction = NULL; 519 transaction_t *transaction = NULL;
509 tid_t tid; 520 tid_t tid;
510 521
511 spin_lock(&journal->j_state_lock); 522 read_lock(&journal->j_state_lock);
512 if (journal->j_running_transaction && !current->journal_info) { 523 if (journal->j_running_transaction && !current->journal_info) {
513 transaction = journal->j_running_transaction; 524 transaction = journal->j_running_transaction;
514 __jbd2_log_start_commit(journal, transaction->t_tid); 525 __jbd2_log_start_commit(journal, transaction->t_tid);
@@ -516,12 +527,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
516 transaction = journal->j_committing_transaction; 527 transaction = journal->j_committing_transaction;
517 528
518 if (!transaction) { 529 if (!transaction) {
519 spin_unlock(&journal->j_state_lock); 530 read_unlock(&journal->j_state_lock);
520 return 0; /* Nothing to retry */ 531 return 0; /* Nothing to retry */
521 } 532 }
522 533
523 tid = transaction->t_tid; 534 tid = transaction->t_tid;
524 spin_unlock(&journal->j_state_lock); 535 read_unlock(&journal->j_state_lock);
525 jbd2_log_wait_commit(journal, tid); 536 jbd2_log_wait_commit(journal, tid);
526 return 1; 537 return 1;
527} 538}
@@ -535,7 +546,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
535{ 546{
536 int ret = 0; 547 int ret = 0;
537 548
538 spin_lock(&journal->j_state_lock); 549 write_lock(&journal->j_state_lock);
539 if (journal->j_running_transaction) { 550 if (journal->j_running_transaction) {
540 tid_t tid = journal->j_running_transaction->t_tid; 551 tid_t tid = journal->j_running_transaction->t_tid;
541 552
@@ -554,7 +565,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
554 *ptid = journal->j_committing_transaction->t_tid; 565 *ptid = journal->j_committing_transaction->t_tid;
555 ret = 1; 566 ret = 1;
556 } 567 }
557 spin_unlock(&journal->j_state_lock); 568 write_unlock(&journal->j_state_lock);
558 return ret; 569 return ret;
559} 570}
560 571
@@ -566,26 +577,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
566{ 577{
567 int err = 0; 578 int err = 0;
568 579
580 read_lock(&journal->j_state_lock);
569#ifdef CONFIG_JBD2_DEBUG 581#ifdef CONFIG_JBD2_DEBUG
570 spin_lock(&journal->j_state_lock);
571 if (!tid_geq(journal->j_commit_request, tid)) { 582 if (!tid_geq(journal->j_commit_request, tid)) {
572 printk(KERN_EMERG 583 printk(KERN_EMERG
573 "%s: error: j_commit_request=%d, tid=%d\n", 584 "%s: error: j_commit_request=%d, tid=%d\n",
574 __func__, journal->j_commit_request, tid); 585 __func__, journal->j_commit_request, tid);
575 } 586 }
576 spin_unlock(&journal->j_state_lock);
577#endif 587#endif
578 spin_lock(&journal->j_state_lock);
579 while (tid_gt(tid, journal->j_commit_sequence)) { 588 while (tid_gt(tid, journal->j_commit_sequence)) {
580 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", 589 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
581 tid, journal->j_commit_sequence); 590 tid, journal->j_commit_sequence);
582 wake_up(&journal->j_wait_commit); 591 wake_up(&journal->j_wait_commit);
583 spin_unlock(&journal->j_state_lock); 592 read_unlock(&journal->j_state_lock);
584 wait_event(journal->j_wait_done_commit, 593 wait_event(journal->j_wait_done_commit,
585 !tid_gt(tid, journal->j_commit_sequence)); 594 !tid_gt(tid, journal->j_commit_sequence));
586 spin_lock(&journal->j_state_lock); 595 read_lock(&journal->j_state_lock);
587 } 596 }
588 spin_unlock(&journal->j_state_lock); 597 read_unlock(&journal->j_state_lock);
589 598
590 if (unlikely(is_journal_aborted(journal))) { 599 if (unlikely(is_journal_aborted(journal))) {
591 printk(KERN_EMERG "journal commit I/O error\n"); 600 printk(KERN_EMERG "journal commit I/O error\n");
@@ -602,7 +611,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
602{ 611{
603 unsigned long blocknr; 612 unsigned long blocknr;
604 613
605 spin_lock(&journal->j_state_lock); 614 write_lock(&journal->j_state_lock);
606 J_ASSERT(journal->j_free > 1); 615 J_ASSERT(journal->j_free > 1);
607 616
608 blocknr = journal->j_head; 617 blocknr = journal->j_head;
@@ -610,7 +619,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
610 journal->j_free--; 619 journal->j_free--;
611 if (journal->j_head == journal->j_last) 620 if (journal->j_head == journal->j_last)
612 journal->j_head = journal->j_first; 621 journal->j_head = journal->j_first;
613 spin_unlock(&journal->j_state_lock); 622 write_unlock(&journal->j_state_lock);
614 return jbd2_journal_bmap(journal, blocknr, retp); 623 return jbd2_journal_bmap(journal, blocknr, retp);
615} 624}
616 625
@@ -830,7 +839,7 @@ static journal_t * journal_init_common (void)
830 mutex_init(&journal->j_checkpoint_mutex); 839 mutex_init(&journal->j_checkpoint_mutex);
831 spin_lock_init(&journal->j_revoke_lock); 840 spin_lock_init(&journal->j_revoke_lock);
832 spin_lock_init(&journal->j_list_lock); 841 spin_lock_init(&journal->j_list_lock);
833 spin_lock_init(&journal->j_state_lock); 842 rwlock_init(&journal->j_state_lock);
834 843
835 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 844 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
836 journal->j_min_batch_time = 0; 845 journal->j_min_batch_time = 0;
@@ -1096,14 +1105,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1096 set_buffer_uptodate(bh); 1105 set_buffer_uptodate(bh);
1097 } 1106 }
1098 1107
1099 spin_lock(&journal->j_state_lock); 1108 read_lock(&journal->j_state_lock);
1100 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1109 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
1101 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1110 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
1102 1111
1103 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1112 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1104 sb->s_start = cpu_to_be32(journal->j_tail); 1113 sb->s_start = cpu_to_be32(journal->j_tail);
1105 sb->s_errno = cpu_to_be32(journal->j_errno); 1114 sb->s_errno = cpu_to_be32(journal->j_errno);
1106 spin_unlock(&journal->j_state_lock); 1115 read_unlock(&journal->j_state_lock);
1107 1116
1108 BUFFER_TRACE(bh, "marking dirty"); 1117 BUFFER_TRACE(bh, "marking dirty");
1109 mark_buffer_dirty(bh); 1118 mark_buffer_dirty(bh);
@@ -1117,19 +1126,19 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1117 set_buffer_uptodate(bh); 1126 set_buffer_uptodate(bh);
1118 } 1127 }
1119 } else 1128 } else
1120 ll_rw_block(SWRITE, 1, &bh); 1129 write_dirty_buffer(bh, WRITE);
1121 1130
1122out: 1131out:
1123 /* If we have just flushed the log (by marking s_start==0), then 1132 /* If we have just flushed the log (by marking s_start==0), then
1124 * any future commit will have to be careful to update the 1133 * any future commit will have to be careful to update the
1125 * superblock again to re-record the true start of the log. */ 1134 * superblock again to re-record the true start of the log. */
1126 1135
1127 spin_lock(&journal->j_state_lock); 1136 write_lock(&journal->j_state_lock);
1128 if (sb->s_start) 1137 if (sb->s_start)
1129 journal->j_flags &= ~JBD2_FLUSHED; 1138 journal->j_flags &= ~JBD2_FLUSHED;
1130 else 1139 else
1131 journal->j_flags |= JBD2_FLUSHED; 1140 journal->j_flags |= JBD2_FLUSHED;
1132 spin_unlock(&journal->j_state_lock); 1141 write_unlock(&journal->j_state_lock);
1133} 1142}
1134 1143
1135/* 1144/*
@@ -1364,6 +1373,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1364 1373
1365 if (!compat && !ro && !incompat) 1374 if (!compat && !ro && !incompat)
1366 return 1; 1375 return 1;
1376 /* Load journal superblock if it is not loaded yet. */
1377 if (journal->j_format_version == 0 &&
1378 journal_get_superblock(journal) != 0)
1379 return 0;
1367 if (journal->j_format_version == 1) 1380 if (journal->j_format_version == 1)
1368 return 0; 1381 return 0;
1369 1382
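
Feature checks can now run before the journal is loaded, so an unread superblock (j_format_version still 0) is fetched on demand. A sketch of the resulting guard; journal_get_superblock() is internal to fs/jbd2/journal.c, so this only illustrates the in-file logic:

static int features_usable(journal_t *journal)
{
	if (journal->j_format_version == 0 &&
	    journal_get_superblock(journal) != 0)
		return 0;	/* superblock unreadable: report no features */
	return journal->j_format_version >= 2;	/* v1 carries no feature flags */
}
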
@@ -1391,13 +1404,9 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1391int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, 1404int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
1392 unsigned long ro, unsigned long incompat) 1405 unsigned long ro, unsigned long incompat)
1393{ 1406{
1394 journal_superblock_t *sb;
1395
1396 if (!compat && !ro && !incompat) 1407 if (!compat && !ro && !incompat)
1397 return 1; 1408 return 1;
1398 1409
1399 sb = journal->j_superblock;
1400
1401 /* We can support any known requested features iff the 1410 /* We can support any known requested features iff the
1402 * superblock is in version 2. Otherwise we fail to support any 1411 * superblock is in version 2. Otherwise we fail to support any
1403 * extended sb features. */ 1412 * extended sb features. */
@@ -1545,7 +1554,7 @@ int jbd2_journal_flush(journal_t *journal)
1545 transaction_t *transaction = NULL; 1554 transaction_t *transaction = NULL;
1546 unsigned long old_tail; 1555 unsigned long old_tail;
1547 1556
1548 spin_lock(&journal->j_state_lock); 1557 write_lock(&journal->j_state_lock);
1549 1558
1550 /* Force everything buffered to the log... */ 1559 /* Force everything buffered to the log... */
1551 if (journal->j_running_transaction) { 1560 if (journal->j_running_transaction) {
@@ -1558,10 +1567,10 @@ int jbd2_journal_flush(journal_t *journal)
1558 if (transaction) { 1567 if (transaction) {
1559 tid_t tid = transaction->t_tid; 1568 tid_t tid = transaction->t_tid;
1560 1569
1561 spin_unlock(&journal->j_state_lock); 1570 write_unlock(&journal->j_state_lock);
1562 jbd2_log_wait_commit(journal, tid); 1571 jbd2_log_wait_commit(journal, tid);
1563 } else { 1572 } else {
1564 spin_unlock(&journal->j_state_lock); 1573 write_unlock(&journal->j_state_lock);
1565 } 1574 }
1566 1575
1567 /* ...and flush everything in the log out to disk. */ 1576 /* ...and flush everything in the log out to disk. */
@@ -1585,12 +1594,12 @@ int jbd2_journal_flush(journal_t *journal)
1585 * the magic code for a fully-recovered superblock. Any future 1594 * the magic code for a fully-recovered superblock. Any future
1586 * commits of data to the journal will restore the current 1595 * commits of data to the journal will restore the current
1587 * s_start value. */ 1596 * s_start value. */
1588 spin_lock(&journal->j_state_lock); 1597 write_lock(&journal->j_state_lock);
1589 old_tail = journal->j_tail; 1598 old_tail = journal->j_tail;
1590 journal->j_tail = 0; 1599 journal->j_tail = 0;
1591 spin_unlock(&journal->j_state_lock); 1600 write_unlock(&journal->j_state_lock);
1592 jbd2_journal_update_superblock(journal, 1); 1601 jbd2_journal_update_superblock(journal, 1);
1593 spin_lock(&journal->j_state_lock); 1602 write_lock(&journal->j_state_lock);
1594 journal->j_tail = old_tail; 1603 journal->j_tail = old_tail;
1595 1604
1596 J_ASSERT(!journal->j_running_transaction); 1605 J_ASSERT(!journal->j_running_transaction);
@@ -1598,7 +1607,7 @@ int jbd2_journal_flush(journal_t *journal)
1598 J_ASSERT(!journal->j_checkpoint_transactions); 1607 J_ASSERT(!journal->j_checkpoint_transactions);
1599 J_ASSERT(journal->j_head == journal->j_tail); 1608 J_ASSERT(journal->j_head == journal->j_tail);
1600 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1609 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1601 spin_unlock(&journal->j_state_lock); 1610 write_unlock(&journal->j_state_lock);
1602 return 0; 1611 return 0;
1603} 1612}
1604 1613
@@ -1617,7 +1626,6 @@ int jbd2_journal_flush(journal_t *journal)
1617 1626
1618int jbd2_journal_wipe(journal_t *journal, int write) 1627int jbd2_journal_wipe(journal_t *journal, int write)
1619{ 1628{
1620 journal_superblock_t *sb;
1621 int err = 0; 1629 int err = 0;
1622 1630
1623 J_ASSERT (!(journal->j_flags & JBD2_LOADED)); 1631 J_ASSERT (!(journal->j_flags & JBD2_LOADED));
@@ -1626,8 +1634,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1626 if (err) 1634 if (err)
1627 return err; 1635 return err;
1628 1636
1629 sb = journal->j_superblock;
1630
1631 if (!journal->j_tail) 1637 if (!journal->j_tail)
1632 goto no_recovery; 1638 goto no_recovery;
1633 1639
@@ -1665,12 +1671,12 @@ void __jbd2_journal_abort_hard(journal_t *journal)
1665 printk(KERN_ERR "Aborting journal on device %s.\n", 1671 printk(KERN_ERR "Aborting journal on device %s.\n",
1666 journal->j_devname); 1672 journal->j_devname);
1667 1673
1668 spin_lock(&journal->j_state_lock); 1674 write_lock(&journal->j_state_lock);
1669 journal->j_flags |= JBD2_ABORT; 1675 journal->j_flags |= JBD2_ABORT;
1670 transaction = journal->j_running_transaction; 1676 transaction = journal->j_running_transaction;
1671 if (transaction) 1677 if (transaction)
1672 __jbd2_log_start_commit(journal, transaction->t_tid); 1678 __jbd2_log_start_commit(journal, transaction->t_tid);
1673 spin_unlock(&journal->j_state_lock); 1679 write_unlock(&journal->j_state_lock);
1674} 1680}
1675 1681
1676/* Soft abort: record the abort error status in the journal superblock, 1682/* Soft abort: record the abort error status in the journal superblock,
@@ -1755,12 +1761,12 @@ int jbd2_journal_errno(journal_t *journal)
1755{ 1761{
1756 int err; 1762 int err;
1757 1763
1758 spin_lock(&journal->j_state_lock); 1764 read_lock(&journal->j_state_lock);
1759 if (journal->j_flags & JBD2_ABORT) 1765 if (journal->j_flags & JBD2_ABORT)
1760 err = -EROFS; 1766 err = -EROFS;
1761 else 1767 else
1762 err = journal->j_errno; 1768 err = journal->j_errno;
1763 spin_unlock(&journal->j_state_lock); 1769 read_unlock(&journal->j_state_lock);
1764 return err; 1770 return err;
1765} 1771}
1766 1772
@@ -1775,12 +1781,12 @@ int jbd2_journal_clear_err(journal_t *journal)
1775{ 1781{
1776 int err = 0; 1782 int err = 0;
1777 1783
1778 spin_lock(&journal->j_state_lock); 1784 write_lock(&journal->j_state_lock);
1779 if (journal->j_flags & JBD2_ABORT) 1785 if (journal->j_flags & JBD2_ABORT)
1780 err = -EROFS; 1786 err = -EROFS;
1781 else 1787 else
1782 journal->j_errno = 0; 1788 journal->j_errno = 0;
1783 spin_unlock(&journal->j_state_lock); 1789 write_unlock(&journal->j_state_lock);
1784 return err; 1790 return err;
1785} 1791}
1786 1792
@@ -1793,10 +1799,10 @@ int jbd2_journal_clear_err(journal_t *journal)
1793 */ 1799 */
1794void jbd2_journal_ack_err(journal_t *journal) 1800void jbd2_journal_ack_err(journal_t *journal)
1795{ 1801{
1796 spin_lock(&journal->j_state_lock); 1802 write_lock(&journal->j_state_lock);
1797 if (journal->j_errno) 1803 if (journal->j_errno)
1798 journal->j_flags |= JBD2_ACK_ERR; 1804 journal->j_flags |= JBD2_ACK_ERR;
1799 spin_unlock(&journal->j_state_lock); 1805 write_unlock(&journal->j_state_lock);
1800} 1806}
1801 1807
1802int jbd2_journal_blocks_per_page(struct inode *inode) 1808int jbd2_journal_blocks_per_page(struct inode *inode)
@@ -1832,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal)
1832 */ 1838 */
1833#define JBD2_MAX_SLABS 8 1839#define JBD2_MAX_SLABS 8
1834static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; 1840static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1835static DECLARE_MUTEX(jbd2_slab_create_sem);
1836 1841
1837static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { 1842static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1838 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", 1843 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
@@ -1853,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void)
1853 1858
1854static int jbd2_journal_create_slab(size_t size) 1859static int jbd2_journal_create_slab(size_t size)
1855{ 1860{
1861 static DEFINE_MUTEX(jbd2_slab_create_mutex);
1856 int i = order_base_2(size) - 10; 1862 int i = order_base_2(size) - 10;
1857 size_t slab_size; 1863 size_t slab_size;
1858 1864
@@ -1864,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size)
1864 1870
1865 if (unlikely(i < 0)) 1871 if (unlikely(i < 0))
1866 i = 0; 1872 i = 0;
1867 down(&jbd2_slab_create_sem); 1873 mutex_lock(&jbd2_slab_create_mutex);
1868 if (jbd2_slab[i]) { 1874 if (jbd2_slab[i]) {
1869 up(&jbd2_slab_create_sem); 1875 mutex_unlock(&jbd2_slab_create_mutex);
1870 return 0; /* Already created */ 1876 return 0; /* Already created */
1871 } 1877 }
1872 1878
1873 slab_size = 1 << (i+10); 1879 slab_size = 1 << (i+10);
1874 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, 1880 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1875 slab_size, 0, NULL); 1881 slab_size, 0, NULL);
1876 up(&jbd2_slab_create_sem); 1882 mutex_unlock(&jbd2_slab_create_mutex);
1877 if (!jbd2_slab[i]) { 1883 if (!jbd2_slab[i]) {
1878 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); 1884 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1879 return -ENOMEM; 1885 return -ENOMEM;
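
DECLARE_MUTEX() was the legacy semaphore API (a counting semaphore initialized to 1, taken with down()/up()); it is replaced here by a real mutex and narrowed to function scope, since only this path uses it. A sketch of the conversion:

#include <linux/mutex.h>

static int create_slab_once(void)
{
	static DEFINE_MUTEX(create_mutex);	/* was: static DECLARE_MUTEX(sem) */

	mutex_lock(&create_mutex);		/* was: down(&sem) */
	/* ... test-and-create the kmem cache exactly once ... */
	mutex_unlock(&create_mutex);		/* was: up(&sem) */
	return 0;
}
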
@@ -2201,14 +2207,12 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
2201void jbd2_journal_release_jbd_inode(journal_t *journal, 2207void jbd2_journal_release_jbd_inode(journal_t *journal,
2202 struct jbd2_inode *jinode) 2208 struct jbd2_inode *jinode)
2203{ 2209{
2204 int writeout = 0;
2205
2206 if (!journal) 2210 if (!journal)
2207 return; 2211 return;
2208restart: 2212restart:
2209 spin_lock(&journal->j_list_lock); 2213 spin_lock(&journal->j_list_lock);
2210 /* Is commit writing out inode - we have to wait */ 2214 /* Is commit writing out inode - we have to wait */
2211 if (jinode->i_flags & JI_COMMIT_RUNNING) { 2215 if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
2212 wait_queue_head_t *wq; 2216 wait_queue_head_t *wq;
2213 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); 2217 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2214 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); 2218 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -2219,9 +2223,6 @@ restart:
2219 goto restart; 2223 goto restart;
2220 } 2224 }
2221 2225
2222 /* Do we need to wait for data writeback? */
2223 if (journal->j_committing_transaction == jinode->i_transaction)
2224 writeout = 1;
2225 if (jinode->i_transaction) { 2226 if (jinode->i_transaction) {
2226 list_del(&jinode->i_list); 2227 list_del(&jinode->i_list);
2227 jinode->i_transaction = NULL; 2228 jinode->i_transaction = NULL;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 049281b7cb8..2bc4d5f116f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -285,12 +285,10 @@ int jbd2_journal_recover(journal_t *journal)
285int jbd2_journal_skip_recovery(journal_t *journal) 285int jbd2_journal_skip_recovery(journal_t *journal)
286{ 286{
287 int err; 287 int err;
288 journal_superblock_t * sb;
289 288
290 struct recovery_info info; 289 struct recovery_info info;
291 290
292 memset (&info, 0, sizeof(info)); 291 memset (&info, 0, sizeof(info));
293 sb = journal->j_superblock;
294 292
295 err = do_one_pass(journal, &info, PASS_SCAN); 293 err = do_one_pass(journal, &info, PASS_SCAN);
296 294
@@ -299,7 +297,8 @@ int jbd2_journal_skip_recovery(journal_t *journal)
299 ++journal->j_transaction_sequence; 297 ++journal->j_transaction_sequence;
300 } else { 298 } else {
301#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
302 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence);
303#endif 302#endif
304 jbd_debug(1, 303 jbd_debug(1,
305 "JBD: ignoring %d transaction%s from the journal.\n", 304 "JBD: ignoring %d transaction%s from the journal.\n",
@@ -365,11 +364,6 @@ static int do_one_pass(journal_t *journal,
365 int tag_bytes = journal_tag_bytes(journal); 364 int tag_bytes = journal_tag_bytes(journal);
366 __u32 crc32_sum = ~0; /* Transactional Checksums */ 365 __u32 crc32_sum = ~0; /* Transactional Checksums */
367 366
368 /* Precompute the maximum metadata descriptors in a descriptor block */
369 int MAX_BLOCKS_PER_DESC;
370 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
371 / tag_bytes);
372
373 /* 367 /*
374 * First thing is to establish what we expect to find in the log 368 * First thing is to establish what we expect to find in the log
375 * (in terms of transaction IDs), and where (in terms of log 369 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a360b06af2e..9ad321fd63f 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
625 set_buffer_jwrite(bh); 625 set_buffer_jwrite(bh);
626 BUFFER_TRACE(bh, "write"); 626 BUFFER_TRACE(bh, "write");
627 set_buffer_dirty(bh); 627 set_buffer_dirty(bh);
628 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); 628 write_dirty_buffer(bh, write_op);
629} 629}
630#endif 630#endif
631 631
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index b8e0806681b..6bf0a242613 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -26,6 +26,8 @@
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/backing-dev.h>
30#include <linux/module.h>
29 31
30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 32static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
31 33
@@ -53,6 +55,9 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
53 transaction->t_tid = journal->j_transaction_sequence++; 55 transaction->t_tid = journal->j_transaction_sequence++;
54 transaction->t_expires = jiffies + journal->j_commit_interval; 56 transaction->t_expires = jiffies + journal->j_commit_interval;
55 spin_lock_init(&transaction->t_handle_lock); 57 spin_lock_init(&transaction->t_handle_lock);
58 atomic_set(&transaction->t_updates, 0);
59 atomic_set(&transaction->t_outstanding_credits, 0);
60 atomic_set(&transaction->t_handle_count, 0);
56 INIT_LIST_HEAD(&transaction->t_inode_list); 61 INIT_LIST_HEAD(&transaction->t_inode_list);
57 INIT_LIST_HEAD(&transaction->t_private_list); 62 INIT_LIST_HEAD(&transaction->t_private_list);
58 63
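
The three per-transaction counters become atomic_t so handles can join and leave a transaction without taking t_handle_lock, which is why they now need explicit atomic_set() initialization here. A sketch of the resulting t_updates lifecycle; helper names are illustrative, with the real code living in start_this_handle() and jbd2_journal_stop():

#include <asm/atomic.h>
#include <linux/wait.h>

static void handle_join(atomic_t *t_updates)
{
	atomic_inc(t_updates);			/* done under read_lock(&j_state_lock) */
}

static void handle_leave(atomic_t *t_updates, wait_queue_head_t *j_wait_updates)
{
	if (atomic_dec_and_test(t_updates))	/* last handle out... */
		wake_up(j_wait_updates);	/* ...wakes a draining commit */
}
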
@@ -77,71 +82,107 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
77 */ 82 */
78 83
79/* 84/*
 85 * Update transaction's maximum wait time, if debugging is enabled.
 85 * Update transaction's maximum wait time, if debugging is enabled.
86 *
87 * In order for t_max_wait to be reliable, it must be protected by a
88 * lock. But doing so will mean that start_this_handle() can not be
89 * run in parallel on SMP systems, which limits our scalability. So
90 * unless debugging is enabled, we no longer update t_max_wait, which
91 * means that maximum wait time reported by the jbd2_run_stats
92 * tracepoint will always be zero.
93 */
94static inline void update_t_max_wait(transaction_t *transaction)
95{
96#ifdef CONFIG_JBD2_DEBUG
97 unsigned long ts = jiffies;
98
99 if (jbd2_journal_enable_debug &&
100 time_after(transaction->t_start, ts)) {
101 ts = jbd2_time_diff(ts, transaction->t_start);
102 spin_lock(&transaction->t_handle_lock);
103 if (ts > transaction->t_max_wait)
104 transaction->t_max_wait = ts;
105 spin_unlock(&transaction->t_handle_lock);
106 }
107#endif
108}
109
110/*
80 * start_this_handle: Given a handle, deal with any locking or stalling 111 * start_this_handle: Given a handle, deal with any locking or stalling
81 * needed to make sure that there is enough journal space for the handle 112 * needed to make sure that there is enough journal space for the handle
82 * to begin. Attach the handle to a transaction and set up the 113 * to begin. Attach the handle to a transaction and set up the
83 * transaction's buffer credits. 114 * transaction's buffer credits.
84 */ 115 */
85 116
86static int start_this_handle(journal_t *journal, handle_t *handle) 117static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask)
87{ 119{
88 transaction_t *transaction; 120 transaction_t *transaction;
89 int needed; 121 int needed;
90 int nblocks = handle->h_buffer_credits; 122 int nblocks = handle->h_buffer_credits;
91 transaction_t *new_transaction = NULL; 123 transaction_t *new_transaction = NULL;
92 int ret = 0;
93 unsigned long ts = jiffies;
94 124
95 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
96 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
97 current->comm, nblocks, 127 current->comm, nblocks,
98 journal->j_max_transaction_buffers); 128 journal->j_max_transaction_buffers);
99 ret = -ENOSPC; 129 return -ENOSPC;
100 goto out;
101 } 130 }
102 131
103alloc_transaction: 132alloc_transaction:
104 if (!journal->j_running_transaction) { 133 if (!journal->j_running_transaction) {
105 new_transaction = kzalloc(sizeof(*new_transaction), 134 new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
106 GFP_NOFS|__GFP_NOFAIL);
107 if (!new_transaction) { 135 if (!new_transaction) {
108 ret = -ENOMEM; 136 /*
109 goto out; 137 * If __GFP_FS is not present, then we may be
138 * being called from inside the fs writeback
139 * layer, so we MUST NOT fail. Since
140 * __GFP_NOFAIL is going away, we will arrange
141 * to retry the allocation ourselves.
142 */
143 if ((gfp_mask & __GFP_FS) == 0) {
144 congestion_wait(BLK_RW_ASYNC, HZ/50);
145 goto alloc_transaction;
146 }
147 return -ENOMEM;
110 } 148 }
111 } 149 }
112 150
113 jbd_debug(3, "New handle %p going live.\n", handle); 151 jbd_debug(3, "New handle %p going live.\n", handle);
114 152
115repeat:
116
117 /* 153 /*
118 * We need to hold j_state_lock until t_updates has been incremented, 154 * We need to hold j_state_lock until t_updates has been incremented,
119 * for proper journal barrier handling 155 * for proper journal barrier handling
120 */ 156 */
121 spin_lock(&journal->j_state_lock); 157repeat:
122repeat_locked: 158 read_lock(&journal->j_state_lock);
159 BUG_ON(journal->j_flags & JBD2_UNMOUNT);
123 if (is_journal_aborted(journal) || 160 if (is_journal_aborted(journal) ||
124 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 161 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
125 spin_unlock(&journal->j_state_lock); 162 read_unlock(&journal->j_state_lock);
126 ret = -EROFS; 163 kfree(new_transaction);
127 goto out; 164 return -EROFS;
128 } 165 }
129 166
130 /* Wait on the journal's transaction barrier if necessary */ 167 /* Wait on the journal's transaction barrier if necessary */
131 if (journal->j_barrier_count) { 168 if (journal->j_barrier_count) {
132 spin_unlock(&journal->j_state_lock); 169 read_unlock(&journal->j_state_lock);
133 wait_event(journal->j_wait_transaction_locked, 170 wait_event(journal->j_wait_transaction_locked,
134 journal->j_barrier_count == 0); 171 journal->j_barrier_count == 0);
135 goto repeat; 172 goto repeat;
136 } 173 }
137 174
138 if (!journal->j_running_transaction) { 175 if (!journal->j_running_transaction) {
139 if (!new_transaction) { 176 read_unlock(&journal->j_state_lock);
140 spin_unlock(&journal->j_state_lock); 177 if (!new_transaction)
141 goto alloc_transaction; 178 goto alloc_transaction;
179 write_lock(&journal->j_state_lock);
180 if (!journal->j_running_transaction) {
181 jbd2_get_transaction(journal, new_transaction);
182 new_transaction = NULL;
142 } 183 }
143 jbd2_get_transaction(journal, new_transaction); 184 write_unlock(&journal->j_state_lock);
144 new_transaction = NULL; 185 goto repeat;
145 } 186 }
146 187
147 transaction = journal->j_running_transaction; 188 transaction = journal->j_running_transaction;
@@ -155,7 +196,7 @@ repeat_locked:
155 196
156 prepare_to_wait(&journal->j_wait_transaction_locked, 197 prepare_to_wait(&journal->j_wait_transaction_locked,
157 &wait, TASK_UNINTERRUPTIBLE); 198 &wait, TASK_UNINTERRUPTIBLE);
158 spin_unlock(&journal->j_state_lock); 199 read_unlock(&journal->j_state_lock);
159 schedule(); 200 schedule();
160 finish_wait(&journal->j_wait_transaction_locked, &wait); 201 finish_wait(&journal->j_wait_transaction_locked, &wait);
161 goto repeat; 202 goto repeat;
@@ -166,8 +207,8 @@ repeat_locked:
166 * buffers requested by this operation, we need to stall pending a log 207 * buffers requested by this operation, we need to stall pending a log
167 * checkpoint to free some more log space. 208 * checkpoint to free some more log space.
168 */ 209 */
169 spin_lock(&transaction->t_handle_lock); 210 needed = atomic_add_return(nblocks,
170 needed = transaction->t_outstanding_credits + nblocks; 211 &transaction->t_outstanding_credits);
171 212
172 if (needed > journal->j_max_transaction_buffers) { 213 if (needed > journal->j_max_transaction_buffers) {
173 /* 214 /*
@@ -178,11 +219,11 @@ repeat_locked:
178 DEFINE_WAIT(wait); 219 DEFINE_WAIT(wait);
179 220
180 jbd_debug(2, "Handle %p starting new commit...\n", handle); 221 jbd_debug(2, "Handle %p starting new commit...\n", handle);
181 spin_unlock(&transaction->t_handle_lock); 222 atomic_sub(nblocks, &transaction->t_outstanding_credits);
182 prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
183 TASK_UNINTERRUPTIBLE); 224 TASK_UNINTERRUPTIBLE);
184 __jbd2_log_start_commit(journal, transaction->t_tid); 225 __jbd2_log_start_commit(journal, transaction->t_tid);
185 spin_unlock(&journal->j_state_lock); 226 read_unlock(&journal->j_state_lock);
186 schedule(); 227 schedule();
187 finish_wait(&journal->j_wait_transaction_locked, &wait); 228 finish_wait(&journal->j_wait_transaction_locked, &wait);
188 goto repeat; 229 goto repeat;
@@ -215,35 +256,31 @@ repeat_locked:
215 */ 256 */
216 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { 257 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
217 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); 258 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
218 spin_unlock(&transaction->t_handle_lock); 259 atomic_sub(nblocks, &transaction->t_outstanding_credits);
219 __jbd2_log_wait_for_space(journal); 260 read_unlock(&journal->j_state_lock);
220 goto repeat_locked; 261 write_lock(&journal->j_state_lock);
262 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
263 __jbd2_log_wait_for_space(journal);
264 write_unlock(&journal->j_state_lock);
265 goto repeat;
221 } 266 }
222 267
223 /* OK, account for the buffers that this operation expects to 268 /* OK, account for the buffers that this operation expects to
224 * use and add the handle to the running transaction. */ 269 * use and add the handle to the running transaction.
225 270 */
226 if (time_after(transaction->t_start, ts)) { 271 update_t_max_wait(transaction);
227 ts = jbd2_time_diff(ts, transaction->t_start);
228 if (ts > transaction->t_max_wait)
229 transaction->t_max_wait = ts;
230 }
231
232 handle->h_transaction = transaction; 272 handle->h_transaction = transaction;
233 transaction->t_outstanding_credits += nblocks; 273 atomic_inc(&transaction->t_updates);
234 transaction->t_updates++; 274 atomic_inc(&transaction->t_handle_count);
235 transaction->t_handle_count++;
236 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", 275 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
237 handle, nblocks, transaction->t_outstanding_credits, 276 handle, nblocks,
277 atomic_read(&transaction->t_outstanding_credits),
238 __jbd2_log_space_left(journal)); 278 __jbd2_log_space_left(journal));
239 spin_unlock(&transaction->t_handle_lock); 279 read_unlock(&journal->j_state_lock);
240 spin_unlock(&journal->j_state_lock);
241 280
242 lock_map_acquire(&handle->h_lockdep_map); 281 lock_map_acquire(&handle->h_lockdep_map);
243out: 282 kfree(new_transaction);
244 if (unlikely(new_transaction)) /* It's usually NULL */ 283 return 0;
245 kfree(new_transaction);
246 return ret;
247} 284}
248 285
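The hunk above replaces the t_handle_lock-protected credit bookkeeping with an optimistic atomic reservation: atomic_add_return() books the credits up front, and the over-commit and checkpoint paths back them out with atomic_sub() before retrying. A minimal userspace analogue of that reserve-then-roll-back pattern (counter name and limit are illustrative, not jbd2's):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int outstanding_credits;
    static const int max_transaction_buffers = 8;

    /* Optimistically reserve nblocks; undo the add if it overshoots. */
    static bool reserve_credits(int nblocks)
    {
        /* fetch_add returns the old value, so add nblocks back to get
         * the post-add total, matching atomic_add_return(). */
        int needed = atomic_fetch_add(&outstanding_credits, nblocks) + nblocks;

        if (needed > max_transaction_buffers) {
            /* Roll back and let the caller retry after a commit. */
            atomic_fetch_sub(&outstanding_credits, nblocks);
            return false;
        }
        return true;
    }

    int main(void)
    {
        printf("first:  %s\n", reserve_credits(6) ? "ok" : "retry");
        printf("second: %s\n", reserve_credits(6) ? "ok" : "retry");
        return 0;
    }

The same idea appears twice above: both the "transaction too full" and the "log too full" branches subtract the speculative credits before sleeping and retrying from "repeat".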
249static struct lock_class_key jbd2_handle_key; 286static struct lock_class_key jbd2_handle_key;
@@ -278,7 +315,7 @@ static handle_t *new_handle(int nblocks)
278 * 315 *
279 * Return a pointer to a newly allocated handle, or NULL on failure 316 * Return a pointer to a newly allocated handle, or NULL on failure
280 */ 317 */
281handle_t *jbd2_journal_start(journal_t *journal, int nblocks) 318handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
282{ 319{
283 handle_t *handle = journal_current_handle(); 320 handle_t *handle = journal_current_handle();
284 int err; 321 int err;
@@ -298,7 +335,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
298 335
299 current->journal_info = handle; 336 current->journal_info = handle;
300 337
301 err = start_this_handle(journal, handle); 338 err = start_this_handle(journal, handle, gfp_mask);
302 if (err < 0) { 339 if (err < 0) {
303 jbd2_free_handle(handle); 340 jbd2_free_handle(handle);
304 current->journal_info = NULL; 341 current->journal_info = NULL;
@@ -308,6 +345,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
308out: 345out:
309 return handle; 346 return handle;
310} 347}
348EXPORT_SYMBOL(jbd2__journal_start);
349
350
351handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
352{
353 return jbd2__journal_start(journal, nblocks, GFP_NOFS);
354}
355EXPORT_SYMBOL(jbd2_journal_start);
356
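With this split, jbd2_journal_start() stays a GFP_NOFS wrapper while callers pick their own allocation mask for the transaction kmalloc. An illustrative caller, assuming the ERR_PTR() convention this function uses on failure (sketch, not compile-tested):

    handle_t *handle = jbd2__journal_start(journal, nblocks, GFP_KERNEL);
    if (IS_ERR(handle))
        return PTR_ERR(handle);
    /* ... journalled modifications ... */
    err = jbd2_journal_stop(handle);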
311 357
312/** 358/**
313 * int jbd2_journal_extend() - extend buffer credits. 359 * int jbd2_journal_extend() - extend buffer credits.
@@ -342,7 +388,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
342 388
343 result = 1; 389 result = 1;
344 390
345 spin_lock(&journal->j_state_lock); 391 read_lock(&journal->j_state_lock);
346 392
347 /* Don't extend a locked-down transaction! */ 393 /* Don't extend a locked-down transaction! */
348 if (handle->h_transaction->t_state != T_RUNNING) { 394 if (handle->h_transaction->t_state != T_RUNNING) {
@@ -352,7 +398,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
352 } 398 }
353 399
354 spin_lock(&transaction->t_handle_lock); 400 spin_lock(&transaction->t_handle_lock);
355 wanted = transaction->t_outstanding_credits + nblocks; 401 wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
356 402
357 if (wanted > journal->j_max_transaction_buffers) { 403 if (wanted > journal->j_max_transaction_buffers) {
358 jbd_debug(3, "denied handle %p %d blocks: " 404 jbd_debug(3, "denied handle %p %d blocks: "
@@ -367,14 +413,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
367 } 413 }
368 414
369 handle->h_buffer_credits += nblocks; 415 handle->h_buffer_credits += nblocks;
370 transaction->t_outstanding_credits += nblocks; 416 atomic_add(nblocks, &transaction->t_outstanding_credits);
371 result = 0; 417 result = 0;
372 418
373 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); 419 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
374unlock: 420unlock:
375 spin_unlock(&transaction->t_handle_lock); 421 spin_unlock(&transaction->t_handle_lock);
376error_out: 422error_out:
377 spin_unlock(&journal->j_state_lock); 423 read_unlock(&journal->j_state_lock);
378out: 424out:
379 return result; 425 return result;
380} 426}
@@ -394,8 +440,7 @@ out:
394 * transaction capable of guaranteeing the requested number of 440 * transaction capable of guaranteeing the requested number of
395 * credits. 441 * credits.
396 */ 442 */
397 443int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
398int jbd2_journal_restart(handle_t *handle, int nblocks)
399{ 444{
400 transaction_t *transaction = handle->h_transaction; 445 transaction_t *transaction = handle->h_transaction;
401 journal_t *journal = transaction->t_journal; 446 journal_t *journal = transaction->t_journal;
@@ -410,29 +455,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
410 * First unlink the handle from its current transaction, and start the 455 * First unlink the handle from its current transaction, and start the
411 * commit on that. 456 * commit on that.
412 */ 457 */
413 J_ASSERT(transaction->t_updates > 0); 458 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
414 J_ASSERT(journal_current_handle() == handle); 459 J_ASSERT(journal_current_handle() == handle);
415 460
416 spin_lock(&journal->j_state_lock); 461 read_lock(&journal->j_state_lock);
417 spin_lock(&transaction->t_handle_lock); 462 spin_lock(&transaction->t_handle_lock);
418 transaction->t_outstanding_credits -= handle->h_buffer_credits; 463 atomic_sub(handle->h_buffer_credits,
419 transaction->t_updates--; 464 &transaction->t_outstanding_credits);
420 465 if (atomic_dec_and_test(&transaction->t_updates))
421 if (!transaction->t_updates)
422 wake_up(&journal->j_wait_updates); 466 wake_up(&journal->j_wait_updates);
423 spin_unlock(&transaction->t_handle_lock); 467 spin_unlock(&transaction->t_handle_lock);
424 468
425 jbd_debug(2, "restarting handle %p\n", handle); 469 jbd_debug(2, "restarting handle %p\n", handle);
426 __jbd2_log_start_commit(journal, transaction->t_tid); 470 __jbd2_log_start_commit(journal, transaction->t_tid);
427 spin_unlock(&journal->j_state_lock); 471 read_unlock(&journal->j_state_lock);
428 472
429 lock_map_release(&handle->h_lockdep_map); 473 lock_map_release(&handle->h_lockdep_map);
430 handle->h_buffer_credits = nblocks; 474 handle->h_buffer_credits = nblocks;
431 ret = start_this_handle(journal, handle); 475 ret = start_this_handle(journal, handle, gfp_mask);
432 return ret; 476 return ret;
433} 477}
478EXPORT_SYMBOL(jbd2__journal_restart);
434 479
435 480
481int jbd2_journal_restart(handle_t *handle, int nblocks)
482{
483 return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
484}
485EXPORT_SYMBOL(jbd2_journal_restart);
486
436/** 487/**
437 * void jbd2_journal_lock_updates () - establish a transaction barrier. 488 * void jbd2_journal_lock_updates () - establish a transaction barrier.
438 * @journal: Journal to establish a barrier on. 489 * @journal: Journal to establish a barrier on.
@@ -447,7 +498,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
447{ 498{
448 DEFINE_WAIT(wait); 499 DEFINE_WAIT(wait);
449 500
450 spin_lock(&journal->j_state_lock); 501 write_lock(&journal->j_state_lock);
451 ++journal->j_barrier_count; 502 ++journal->j_barrier_count;
452 503
453 /* Wait until there are no running updates */ 504 /* Wait until there are no running updates */
@@ -458,19 +509,19 @@ void jbd2_journal_lock_updates(journal_t *journal)
458 break; 509 break;
459 510
460 spin_lock(&transaction->t_handle_lock); 511 spin_lock(&transaction->t_handle_lock);
461 if (!transaction->t_updates) { 512 if (!atomic_read(&transaction->t_updates)) {
462 spin_unlock(&transaction->t_handle_lock); 513 spin_unlock(&transaction->t_handle_lock);
463 break; 514 break;
464 } 515 }
465 prepare_to_wait(&journal->j_wait_updates, &wait, 516 prepare_to_wait(&journal->j_wait_updates, &wait,
466 TASK_UNINTERRUPTIBLE); 517 TASK_UNINTERRUPTIBLE);
467 spin_unlock(&transaction->t_handle_lock); 518 spin_unlock(&transaction->t_handle_lock);
468 spin_unlock(&journal->j_state_lock); 519 write_unlock(&journal->j_state_lock);
469 schedule(); 520 schedule();
470 finish_wait(&journal->j_wait_updates, &wait); 521 finish_wait(&journal->j_wait_updates, &wait);
471 spin_lock(&journal->j_state_lock); 522 write_lock(&journal->j_state_lock);
472 } 523 }
473 spin_unlock(&journal->j_state_lock); 524 write_unlock(&journal->j_state_lock);
474 525
475 /* 526 /*
476 * We have now established a barrier against other normal updates, but 527 * We have now established a barrier against other normal updates, but
@@ -494,9 +545,9 @@ void jbd2_journal_unlock_updates (journal_t *journal)
494 J_ASSERT(journal->j_barrier_count != 0); 545 J_ASSERT(journal->j_barrier_count != 0);
495 546
496 mutex_unlock(&journal->j_barrier); 547 mutex_unlock(&journal->j_barrier);
497 spin_lock(&journal->j_state_lock); 548 write_lock(&journal->j_state_lock);
498 --journal->j_barrier_count; 549 --journal->j_barrier_count;
499 spin_unlock(&journal->j_state_lock); 550 write_unlock(&journal->j_state_lock);
500 wake_up(&journal->j_wait_transaction_locked); 551 wake_up(&journal->j_wait_transaction_locked);
501} 552}
502 553
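Taken together, lock_updates()/unlock_updates() give a simple quiesce API: raise j_barrier_count so no new handle can start, wait for t_updates to drain, and hold j_barrier for the duration. Schematic use (illustrative, e.g. around a journal flush):

    jbd2_journal_lock_updates(journal);   /* blocks until running handles finish */
    /* ... filesystem-wide operation with no updates in flight ... */
    jbd2_journal_unlock_updates(journal); /* drop the barrier, wake waiters */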
@@ -1238,7 +1289,8 @@ int jbd2_journal_stop(handle_t *handle)
1238{ 1289{
1239 transaction_t *transaction = handle->h_transaction; 1290 transaction_t *transaction = handle->h_transaction;
1240 journal_t *journal = transaction->t_journal; 1291 journal_t *journal = transaction->t_journal;
1241 int err; 1292 int err, wait_for_commit = 0;
1293 tid_t tid;
1242 pid_t pid; 1294 pid_t pid;
1243 1295
1244 J_ASSERT(journal_current_handle() == handle); 1296 J_ASSERT(journal_current_handle() == handle);
@@ -1246,7 +1298,7 @@ int jbd2_journal_stop(handle_t *handle)
1246 if (is_handle_aborted(handle)) 1298 if (is_handle_aborted(handle))
1247 err = -EIO; 1299 err = -EIO;
1248 else { 1300 else {
1249 J_ASSERT(transaction->t_updates > 0); 1301 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
1250 err = 0; 1302 err = 0;
1251 } 1303 }
1252 1304
@@ -1291,9 +1343,9 @@ int jbd2_journal_stop(handle_t *handle)
1291 1343
1292 journal->j_last_sync_writer = pid; 1344 journal->j_last_sync_writer = pid;
1293 1345
1294 spin_lock(&journal->j_state_lock); 1346 read_lock(&journal->j_state_lock);
1295 commit_time = journal->j_average_commit_time; 1347 commit_time = journal->j_average_commit_time;
1296 spin_unlock(&journal->j_state_lock); 1348 read_unlock(&journal->j_state_lock);
1297 1349
1298 trans_time = ktime_to_ns(ktime_sub(ktime_get(), 1350 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1299 transaction->t_start_time)); 1351 transaction->t_start_time));
@@ -1314,14 +1366,8 @@ int jbd2_journal_stop(handle_t *handle)
1314 if (handle->h_sync) 1366 if (handle->h_sync)
1315 transaction->t_synchronous_commit = 1; 1367 transaction->t_synchronous_commit = 1;
1316 current->journal_info = NULL; 1368 current->journal_info = NULL;
1317 spin_lock(&transaction->t_handle_lock); 1369 atomic_sub(handle->h_buffer_credits,
1318 transaction->t_outstanding_credits -= handle->h_buffer_credits; 1370 &transaction->t_outstanding_credits);
1319 transaction->t_updates--;
1320 if (!transaction->t_updates) {
1321 wake_up(&journal->j_wait_updates);
1322 if (journal->j_barrier_count)
1323 wake_up(&journal->j_wait_transaction_locked);
1324 }
1325 1371
1326 /* 1372 /*
1327 * If the handle is marked SYNC, we need to set another commit 1373 * If the handle is marked SYNC, we need to set another commit
@@ -1330,15 +1376,13 @@ int jbd2_journal_stop(handle_t *handle)
1330 * transaction is too old now. 1376 * transaction is too old now.
1331 */ 1377 */
1332 if (handle->h_sync || 1378 if (handle->h_sync ||
1333 transaction->t_outstanding_credits > 1379 (atomic_read(&transaction->t_outstanding_credits) >
1334 journal->j_max_transaction_buffers || 1380 journal->j_max_transaction_buffers) ||
1335 time_after_eq(jiffies, transaction->t_expires)) { 1381 time_after_eq(jiffies, transaction->t_expires)) {
1336 /* Do this even for aborted journals: an abort still 1382 /* Do this even for aborted journals: an abort still
1337 * completes the commit thread; it just doesn't write 1383 * completes the commit thread; it just doesn't write
1338 * anything to disk. */ 1384 * anything to disk. */
1339 tid_t tid = transaction->t_tid;
1340 1385
1341 spin_unlock(&transaction->t_handle_lock);
1342 jbd_debug(2, "transaction too old, requesting commit for " 1386 jbd_debug(2, "transaction too old, requesting commit for "
1343 "handle %p\n", handle); 1387 "handle %p\n", handle);
1344 /* This is non-blocking */ 1388 /* This is non-blocking */
@@ -1349,11 +1393,25 @@ int jbd2_journal_stop(handle_t *handle)
1349 * to wait for the commit to complete. 1393 * to wait for the commit to complete.
1350 */ 1394 */
1351 if (handle->h_sync && !(current->flags & PF_MEMALLOC)) 1395 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1352 err = jbd2_log_wait_commit(journal, tid); 1396 wait_for_commit = 1;
1353 } else {
1354 spin_unlock(&transaction->t_handle_lock);
1355 } 1397 }
1356 1398
1399 /*
1400 * Once we drop t_updates, if it goes to zero the transaction
1401 * could start committing on us and eventually disappear. So
1402 * once we do this, we must not dereference the transaction
1403 * pointer again.
1404 */
1405 tid = transaction->t_tid;
1406 if (atomic_dec_and_test(&transaction->t_updates)) {
1407 wake_up(&journal->j_wait_updates);
1408 if (journal->j_barrier_count)
1409 wake_up(&journal->j_wait_transaction_locked);
1410 }
1411
1412 if (wait_for_commit)
1413 err = jbd2_log_wait_commit(journal, tid);
1414
1357 lock_map_release(&handle->h_lockdep_map); 1415 lock_map_release(&handle->h_lockdep_map);
1358 1416
1359 jbd2_free_handle(handle); 1417 jbd2_free_handle(handle);
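The reordering above is the whole point of the new tid variable: everything needed from the transaction (its tid) is captured before the final atomic_dec_and_test(), because the moment t_updates hits zero the committer may free the transaction. A self-contained sketch of that capture-before-release discipline (struct and names invented for illustration):

    #include <stdatomic.h>
    #include <stdio.h>

    struct txn {
        int tid;
        atomic_int updates;
    };

    static int drop_handle(struct txn *t)
    {
        int tid = t->tid;   /* capture before dropping the reference */

        /* fetch_sub returns the old count; 1 means we were the last. */
        if (atomic_fetch_sub(&t->updates, 1) == 1)
            printf("last updater done; tid %d may commit and vanish\n", tid);
        /* *t must not be touched past this point; use only tid. */
        return tid;
    }

    int main(void)
    {
        struct txn t = { .tid = 42 };
        atomic_init(&t.updates, 1);
        drop_handle(&t);
        return 0;
    }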
@@ -1719,7 +1777,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1719 goto zap_buffer_unlocked; 1777 goto zap_buffer_unlocked;
1720 1778
1721 /* OK, we have data buffer in journaled mode */ 1779 /* OK, we have data buffer in journaled mode */
1722 spin_lock(&journal->j_state_lock); 1780 write_lock(&journal->j_state_lock);
1723 jbd_lock_bh_state(bh); 1781 jbd_lock_bh_state(bh);
1724 spin_lock(&journal->j_list_lock); 1782 spin_lock(&journal->j_list_lock);
1725 1783
@@ -1772,7 +1830,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1772 jbd2_journal_put_journal_head(jh); 1830 jbd2_journal_put_journal_head(jh);
1773 spin_unlock(&journal->j_list_lock); 1831 spin_unlock(&journal->j_list_lock);
1774 jbd_unlock_bh_state(bh); 1832 jbd_unlock_bh_state(bh);
1775 spin_unlock(&journal->j_state_lock); 1833 write_unlock(&journal->j_state_lock);
1776 return ret; 1834 return ret;
1777 } else { 1835 } else {
1778 /* There is no currently-running transaction. So the 1836 /* There is no currently-running transaction. So the
@@ -1786,7 +1844,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1786 jbd2_journal_put_journal_head(jh); 1844 jbd2_journal_put_journal_head(jh);
1787 spin_unlock(&journal->j_list_lock); 1845 spin_unlock(&journal->j_list_lock);
1788 jbd_unlock_bh_state(bh); 1846 jbd_unlock_bh_state(bh);
1789 spin_unlock(&journal->j_state_lock); 1847 write_unlock(&journal->j_state_lock);
1790 return ret; 1848 return ret;
1791 } else { 1849 } else {
1792 /* The orphan record's transaction has 1850 /* The orphan record's transaction has
@@ -1810,7 +1868,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1810 jbd2_journal_put_journal_head(jh); 1868 jbd2_journal_put_journal_head(jh);
1811 spin_unlock(&journal->j_list_lock); 1869 spin_unlock(&journal->j_list_lock);
1812 jbd_unlock_bh_state(bh); 1870 jbd_unlock_bh_state(bh);
1813 spin_unlock(&journal->j_state_lock); 1871 write_unlock(&journal->j_state_lock);
1814 return 0; 1872 return 0;
1815 } else { 1873 } else {
1816 /* Good, the buffer belongs to the running transaction. 1874 /* Good, the buffer belongs to the running transaction.
@@ -1829,7 +1887,7 @@ zap_buffer:
1829zap_buffer_no_jh: 1887zap_buffer_no_jh:
1830 spin_unlock(&journal->j_list_lock); 1888 spin_unlock(&journal->j_list_lock);
1831 jbd_unlock_bh_state(bh); 1889 jbd_unlock_bh_state(bh);
1832 spin_unlock(&journal->j_state_lock); 1890 write_unlock(&journal->j_state_lock);
1833zap_buffer_unlocked: 1891zap_buffer_unlocked:
1834 clear_buffer_dirty(bh); 1892 clear_buffer_dirty(bh);
1835 J_ASSERT_BH(bh, !buffer_jbddirty(bh)); 1893 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2136,9 +2194,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2136 /* Locks are here just to force reading of recent values, it is 2194 /* Locks are here just to force reading of recent values, it is
2137 * enough that the transaction was not committing before we started 2195 * enough that the transaction was not committing before we started
2138 * a transaction adding the inode to orphan list */ 2196 * a transaction adding the inode to orphan list */
2139 spin_lock(&journal->j_state_lock); 2197 read_lock(&journal->j_state_lock);
2140 commit_trans = journal->j_committing_transaction; 2198 commit_trans = journal->j_committing_transaction;
2141 spin_unlock(&journal->j_state_lock); 2199 read_unlock(&journal->j_state_lock);
2142 spin_lock(&journal->j_list_lock); 2200 spin_lock(&journal->j_list_lock);
2143 inode_trans = jinode->i_transaction; 2201 inode_trans = jinode->i_transaction;
2144 spin_unlock(&journal->j_list_lock); 2202 spin_unlock(&journal->j_list_lock);
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 55f1dde2fa8..404111b016c 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index c5e1450d79f..85c6be2db02 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -22,7 +23,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
22static inline struct jffs2_inode_cache * 23static inline struct jffs2_inode_cache *
23first_inode_chain(int *i, struct jffs2_sb_info *c) 24first_inode_chain(int *i, struct jffs2_sb_info *c)
24{ 25{
25 for (; *i < INOCACHE_HASHSIZE; (*i)++) { 26 for (; *i < c->inocache_hashsize; (*i)++) {
26 if (c->inocache_list[*i]) 27 if (c->inocache_list[*i])
27 return c->inocache_list[*i]; 28 return c->inocache_list[*i];
28 } 29 }
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f0294410868..de4247021d2 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -2,11 +2,12 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Created by Arjan van de Ven <arjanv@redhat.com> 5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
6 *
7 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 6 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
8 * University of Szeged, Hungary 7 * University of Szeged, Hungary
9 * 8 *
9 * Created by Arjan van de Ven <arjan@infradead.org>
10 *
10 * For licensing information, see the file 'LICENCE' in this directory. 11 * For licensing information, see the file 'LICENCE' in this directory.
11 * 12 *
12 */ 13 */
@@ -102,7 +103,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
102 spin_unlock(&jffs2_compressor_list_lock); 103 spin_unlock(&jffs2_compressor_list_lock);
103 *datalen = orig_slen; 104 *datalen = orig_slen;
104 *cdatalen = orig_dlen; 105 *cdatalen = orig_dlen;
105 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen, NULL); 106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
106 spin_lock(&jffs2_compressor_list_lock); 107 spin_lock(&jffs2_compressor_list_lock);
107 this->usecount--; 108 this->usecount--;
108 if (!compr_ret) { 109 if (!compr_ret) {
@@ -151,7 +152,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
151 spin_unlock(&jffs2_compressor_list_lock); 152 spin_unlock(&jffs2_compressor_list_lock);
152 *datalen = orig_slen; 153 *datalen = orig_slen;
153 *cdatalen = orig_dlen; 154 *cdatalen = orig_dlen;
154 compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen, NULL); 155 compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen);
155 spin_lock(&jffs2_compressor_list_lock); 156 spin_lock(&jffs2_compressor_list_lock);
156 this->usecount--; 157 this->usecount--;
157 if (!compr_ret) { 158 if (!compr_ret) {
@@ -219,7 +220,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
219 if (comprtype == this->compr) { 220 if (comprtype == this->compr) {
220 this->usecount++; 221 this->usecount++;
221 spin_unlock(&jffs2_compressor_list_lock); 222 spin_unlock(&jffs2_compressor_list_lock);
222 ret = this->decompress(cdata_in, data_out, cdatalen, datalen, NULL); 223 ret = this->decompress(cdata_in, data_out, cdatalen, datalen);
223 spin_lock(&jffs2_compressor_list_lock); 224 spin_lock(&jffs2_compressor_list_lock);
224 if (ret) { 225 if (ret) {
225 printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret); 226 printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret);
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 7d1d72faa77..13bb7597ab3 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
5 * University of Szeged, Hungary 5 * University of Szeged, Hungary
6 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
6 * 7 *
7 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
8 * 9 *
@@ -48,9 +49,9 @@ struct jffs2_compressor {
48 char *name; 49 char *name;
49 char compr; /* JFFS2_COMPR_XXX */ 50 char compr; /* JFFS2_COMPR_XXX */
50 int (*compress)(unsigned char *data_in, unsigned char *cpage_out, 51 int (*compress)(unsigned char *data_in, unsigned char *cpage_out,
51 uint32_t *srclen, uint32_t *destlen, void *model); 52 uint32_t *srclen, uint32_t *destlen);
52 int (*decompress)(unsigned char *cdata_in, unsigned char *data_out, 53 int (*decompress)(unsigned char *cdata_in, unsigned char *data_out,
53 uint32_t cdatalen, uint32_t datalen, void *model); 54 uint32_t cdatalen, uint32_t datalen);
54 int usecount; 55 int usecount;
55 int disabled; /* if set the compressor won't compress */ 56 int disabled; /* if set the compressor won't compress */
56 unsigned char *compr_buf; /* used by size compr. mode */ 57 unsigned char *compr_buf; /* used by size compr. mode */
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index cd02acafde8..af186ee674d 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2007 Nokia Corporation. All rights reserved. 4 * Copyright © 2007 Nokia Corporation. All rights reserved.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by Richard Purdie <rpurdie@openedhand.com> 7 * Created by Richard Purdie <rpurdie@openedhand.com>
7 * 8 *
@@ -41,7 +42,7 @@ static int __init alloc_workspace(void)
41} 42}
42 43
43static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, 44static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
44 uint32_t *sourcelen, uint32_t *dstlen, void *model) 45 uint32_t *sourcelen, uint32_t *dstlen)
45{ 46{
46 size_t compress_size; 47 size_t compress_size;
47 int ret; 48 int ret;
@@ -66,7 +67,7 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
66} 67}
67 68
68static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out, 69static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
69 uint32_t srclen, uint32_t destlen, void *model) 70 uint32_t srclen, uint32_t destlen)
70{ 71{
71 size_t dl = destlen; 72 size_t dl = destlen;
72 int ret; 73 int ret;
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 546d1538d07..16a5047903a 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 7 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 8 *
@@ -30,8 +31,7 @@
30/* _compress returns the compressed size, -1 if bigger */ 31/* _compress returns the compressed size, -1 if bigger */
31static int jffs2_rtime_compress(unsigned char *data_in, 32static int jffs2_rtime_compress(unsigned char *data_in,
32 unsigned char *cpage_out, 33 unsigned char *cpage_out,
33 uint32_t *sourcelen, uint32_t *dstlen, 34 uint32_t *sourcelen, uint32_t *dstlen)
34 void *model)
35{ 35{
36 short positions[256]; 36 short positions[256];
37 int outpos = 0; 37 int outpos = 0;
@@ -72,8 +72,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
72 72
73static int jffs2_rtime_decompress(unsigned char *data_in, 73static int jffs2_rtime_decompress(unsigned char *data_in,
74 unsigned char *cpage_out, 74 unsigned char *cpage_out,
75 uint32_t srclen, uint32_t destlen, 75 uint32_t srclen, uint32_t destlen)
76 void *model)
77{ 76{
78 short positions[256]; 77 short positions[256];
79 int outpos = 0; 78 int outpos = 0;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 170d289ac78..9e7cec808c4 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 7 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 8 *
@@ -297,7 +298,7 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
297#if 0 298#if 0
298/* _compress returns the compressed size, -1 if bigger */ 299/* _compress returns the compressed size, -1 if bigger */
299int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, 300int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
300 uint32_t *sourcelen, uint32_t *dstlen, void *model) 301 uint32_t *sourcelen, uint32_t *dstlen)
301{ 302{
302 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, 303 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
303 cpage_out, sourcelen, dstlen); 304 cpage_out, sourcelen, dstlen);
@@ -305,8 +306,7 @@ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
305#endif 306#endif
306static int jffs2_dynrubin_compress(unsigned char *data_in, 307static int jffs2_dynrubin_compress(unsigned char *data_in,
307 unsigned char *cpage_out, 308 unsigned char *cpage_out,
308 uint32_t *sourcelen, uint32_t *dstlen, 309 uint32_t *sourcelen, uint32_t *dstlen)
309 void *model)
310{ 310{
311 int bits[8]; 311 int bits[8];
312 unsigned char histo[256]; 312 unsigned char histo[256];
@@ -386,8 +386,7 @@ static void rubin_do_decompress(int bit_divider, int *bits,
386 386
387static int jffs2_rubinmips_decompress(unsigned char *data_in, 387static int jffs2_rubinmips_decompress(unsigned char *data_in,
388 unsigned char *cpage_out, 388 unsigned char *cpage_out,
389 uint32_t sourcelen, uint32_t dstlen, 389 uint32_t sourcelen, uint32_t dstlen)
390 void *model)
391{ 390{
392 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, 391 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
393 cpage_out, sourcelen, dstlen); 392 cpage_out, sourcelen, dstlen);
@@ -396,8 +395,7 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
396 395
397static int jffs2_dynrubin_decompress(unsigned char *data_in, 396static int jffs2_dynrubin_decompress(unsigned char *data_in,
398 unsigned char *cpage_out, 397 unsigned char *cpage_out,
399 uint32_t sourcelen, uint32_t dstlen, 398 uint32_t sourcelen, uint32_t dstlen)
400 void *model)
401{ 399{
402 int bits[8]; 400 int bits[8];
403 int c; 401 int c;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index b46661a4275..fd05a0b9431 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -67,8 +68,7 @@ static void free_workspaces(void)
67 68
68static int jffs2_zlib_compress(unsigned char *data_in, 69static int jffs2_zlib_compress(unsigned char *data_in,
69 unsigned char *cpage_out, 70 unsigned char *cpage_out,
70 uint32_t *sourcelen, uint32_t *dstlen, 71 uint32_t *sourcelen, uint32_t *dstlen)
71 void *model)
72{ 72{
73 int ret; 73 int ret;
74 74
@@ -135,8 +135,7 @@ static int jffs2_zlib_compress(unsigned char *data_in,
135 135
136static int jffs2_zlib_decompress(unsigned char *data_in, 136static int jffs2_zlib_decompress(unsigned char *data_in,
137 unsigned char *cpage_out, 137 unsigned char *cpage_out,
138 uint32_t srclen, uint32_t destlen, 138 uint32_t srclen, uint32_t destlen)
139 void *model)
140{ 139{
141 int ret; 140 int ret;
142 int wbits = MAX_WBITS; 141 int wbits = MAX_WBITS;
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index ec353841392..e0b76c87a91 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index a113ecc3baf..c4f8eef5ca6 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 166062a6823..92978658ed1 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -232,9 +233,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
232 return 0; 233 return 0;
233 234
234 fail: 235 fail:
235 make_bad_inode(inode); 236 iget_failed(inode);
236 unlock_new_inode(inode);
237 iput(inode);
238 jffs2_free_raw_inode(ri); 237 jffs2_free_raw_inode(ri);
239 return ret; 238 return ret;
240} 239}
@@ -290,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
290 mutex_unlock(&f->sem); 289 mutex_unlock(&f->sem);
291 d_instantiate(dentry, old_dentry->d_inode); 290 d_instantiate(dentry, old_dentry->d_inode);
292 dir_i->i_mtime = dir_i->i_ctime = ITIME(now); 291 dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
293 atomic_inc(&old_dentry->d_inode->i_count); 292 ihold(old_dentry->d_inode);
294 } 293 }
295 return ret; 294 return ret;
296} 295}
@@ -368,7 +367,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
368 } 367 }
369 368
370 /* We use f->target field to store the target path. */ 369 /* We use f->target field to store the target path. */
371 f->target = kmalloc(targetlen + 1, GFP_KERNEL); 370 f->target = kmemdup(target, targetlen + 1, GFP_KERNEL);
372 if (!f->target) { 371 if (!f->target) {
373 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 372 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
374 mutex_unlock(&f->sem); 373 mutex_unlock(&f->sem);
@@ -377,7 +376,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
377 goto fail; 376 goto fail;
378 } 377 }
379 378
380 memcpy(f->target, target, targetlen + 1);
381 D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target)); 379 D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target));
382 380
383 /* No data here. Only a metadata node, which will be 381 /* No data here. Only a metadata node, which will be
@@ -454,9 +452,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
454 return 0; 452 return 0;
455 453
456 fail: 454 fail:
457 make_bad_inode(inode); 455 iget_failed(inode);
458 unlock_new_inode(inode);
459 iput(inode);
460 return ret; 456 return ret;
461} 457}
462 458
@@ -601,9 +597,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
601 return 0; 597 return 0;
602 598
603 fail: 599 fail:
604 make_bad_inode(inode); 600 iget_failed(inode);
605 unlock_new_inode(inode);
606 iput(inode);
607 return ret; 601 return ret;
608} 602}
609 603
@@ -778,9 +772,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
778 return 0; 772 return 0;
779 773
780 fail: 774 fail:
781 make_bad_inode(inode); 775 iget_failed(inode);
782 unlock_new_inode(inode);
783 iput(inode);
784 return ret; 776 return ret;
785} 777}
786 778
@@ -871,7 +863,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
871 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); 863 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
872 /* Might as well let the VFS know */ 864 /* Might as well let the VFS know */
873 d_instantiate(new_dentry, old_dentry->d_inode); 865 d_instantiate(new_dentry, old_dentry->d_inode);
874 atomic_inc(&old_dentry->d_inode->i_count); 866 ihold(old_dentry->d_inode);
875 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); 867 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
876 return ret; 868 return ret;
877 } 869 }
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 6286ad9b00f..e513f1913c1 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -150,7 +151,7 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
150 } 151 }
151 152
152 /* Be nice */ 153 /* Be nice */
153 yield(); 154 cond_resched();
154 mutex_lock(&c->erase_free_sem); 155 mutex_lock(&c->erase_free_sem);
155 spin_lock(&c->erase_completion_lock); 156 spin_lock(&c->erase_completion_lock);
156 } 157 }
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 81349702443..1c0a08d711a 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 459d39d1ea0..e896e67767e 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -20,7 +21,6 @@
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
21#include <linux/vfs.h> 22#include <linux/vfs.h>
22#include <linux/crc32.h> 23#include <linux/crc32.h>
23#include <linux/smp_lock.h>
24#include "nodelist.h" 24#include "nodelist.h"
25 25
26static int jffs2_flash_setup(struct jffs2_sb_info *c); 26static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
169 mutex_unlock(&f->sem); 169 mutex_unlock(&f->sem);
170 jffs2_complete_reservation(c); 170 jffs2_complete_reservation(c);
171 171
172 /* We have to do the simple_setsize() without f->sem held, since 172 /* We have to do the truncate_setsize() without f->sem held, since
173 some pages may be locked and waiting for it in readpage(). 173 some pages may be locked and waiting for it in readpage().
174 We are protected from a simultaneous write() extending i_size 174 We are protected from a simultaneous write() extending i_size
175 back past iattr->ia_size, because do_truncate() holds the 175 back past iattr->ia_size, because do_truncate() holds the
176 generic inode semaphore. */ 176 generic inode semaphore. */
177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { 177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
178 simple_setsize(inode, iattr->ia_size); 178 truncate_setsize(inode, iattr->ia_size);
179 inode->i_blocks = (inode->i_size + 511) >> 9; 179 inode->i_blocks = (inode->i_size + 511) >> 9;
180 } 180 }
181 181
@@ -225,7 +225,7 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
225} 225}
226 226
227 227
228void jffs2_clear_inode (struct inode *inode) 228void jffs2_evict_inode (struct inode *inode)
229{ 229{
230 /* We can forget about this inode for now - drop all 230 /* We can forget about this inode for now - drop all
231 * the nodelists associated with it, etc. 231 * the nodelists associated with it, etc.
@@ -233,7 +233,9 @@ void jffs2_clear_inode (struct inode *inode)
233 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 233 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
234 struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); 234 struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
235 235
236 D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode)); 236 D1(printk(KERN_DEBUG "jffs2_evict_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
237 truncate_inode_pages(&inode->i_data, 0);
238 end_writeback(inode);
237 jffs2_do_clear_inode(c, f); 239 jffs2_do_clear_inode(c, f);
238} 240}
239 241
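This is the stock ->clear_inode to ->evict_inode conversion of that kernel cycle: the filesystem now truncates its own page cache and ends writeback before dropping private state. The shape, with a placeholder name (sketch only):

    static void example_evict_inode(struct inode *inode)
    {
        truncate_inode_pages(&inode->i_data, 0); /* drop cached pages */
        end_writeback(inode);                    /* detach from writeback */
        /* ... then free fs-private state, as jffs2_do_clear_inode() does. */
    }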
@@ -388,7 +390,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
388 This also catches the case where it was stopped and this 390 This also catches the case where it was stopped and this
389 is just a remount to restart it. 391 is just a remount to restart it.
390 Flush the writebuffer, if necessary, else we lose it */ 392 Flush the writebuffer, if necessary, else we lose it */
391 lock_kernel();
392 if (!(sb->s_flags & MS_RDONLY)) { 393 if (!(sb->s_flags & MS_RDONLY)) {
393 jffs2_stop_garbage_collect_thread(c); 394 jffs2_stop_garbage_collect_thread(c);
394 mutex_lock(&c->alloc_sem); 395 mutex_lock(&c->alloc_sem);
@@ -400,8 +401,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
400 jffs2_start_garbage_collect_thread(c); 401 jffs2_start_garbage_collect_thread(c);
401 402
402 *flags |= MS_NOATIME; 403 *flags |= MS_NOATIME;
403
404 unlock_kernel();
405 return 0; 404 return 0;
406} 405}
407 406
@@ -475,6 +474,25 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
475 return inode; 474 return inode;
476} 475}
477 476
477static int calculate_inocache_hashsize(uint32_t flash_size)
478{
479 /*
480 * Pick an inocache hash size based on the size of the medium.
481 * Count how many megabytes we're dealing with, use a hash size of
482 * twice that, rounded down to a multiple of 64, and keep it within
483 * sensible bounds.
484 */
485
486 int size_mb = flash_size / 1024 / 1024;
487 int hashsize = (size_mb * 2) & ~0x3f;
488
489 if (hashsize < INOCACHE_HASHSIZE_MIN)
490 return INOCACHE_HASHSIZE_MIN;
491 if (hashsize > INOCACHE_HASHSIZE_MAX)
492 return INOCACHE_HASHSIZE_MAX;
493
494 return hashsize;
495}
478 496
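The function is small enough to check by hand; a userspace replica with worked values (the logic is verbatim from the hunk above, only the test harness is new):

    #include <stdint.h>
    #include <stdio.h>

    #define INOCACHE_HASHSIZE_MIN 128
    #define INOCACHE_HASHSIZE_MAX 1024

    static int calculate_inocache_hashsize(uint32_t flash_size)
    {
        int size_mb = flash_size / 1024 / 1024;
        int hashsize = (size_mb * 2) & ~0x3f;

        if (hashsize < INOCACHE_HASHSIZE_MIN)
            return INOCACHE_HASHSIZE_MIN;
        if (hashsize > INOCACHE_HASHSIZE_MAX)
            return INOCACHE_HASHSIZE_MAX;
        return hashsize;
    }

    int main(void)
    {
        /* 32 MiB -> 64, clamped up to 128; 128 MiB -> 256;
         * 1 GiB -> 2048, clamped down to 1024. */
        printf("%d %d %d\n",
               calculate_inocache_hashsize(32u << 20),
               calculate_inocache_hashsize(128u << 20),
               calculate_inocache_hashsize(1024u << 20));
        return 0;
    }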
479int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) 497int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
480{ 498{
@@ -521,7 +539,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
521 if (ret) 539 if (ret)
522 return ret; 540 return ret;
523 541
524 c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); 542 c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size);
543 c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
525 if (!c->inocache_list) { 544 if (!c->inocache_list) {
526 ret = -ENOMEM; 545 ret = -ENOMEM;
527 goto out_wbuf; 546 goto out_wbuf;
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index f5e96bd656e..31dce611337 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -218,13 +219,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
218 if (!list_empty(&c->erase_complete_list) || 219 if (!list_empty(&c->erase_complete_list) ||
219 !list_empty(&c->erase_pending_list)) { 220 !list_empty(&c->erase_pending_list)) {
220 spin_unlock(&c->erase_completion_lock); 221 spin_unlock(&c->erase_completion_lock);
222 mutex_unlock(&c->alloc_sem);
221 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); 223 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
222 if (jffs2_erase_pending_blocks(c, 1)) { 224 if (jffs2_erase_pending_blocks(c, 1))
223 mutex_unlock(&c->alloc_sem);
224 return 0; 225 return 0;
225 } 226
226 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); 227 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
227 spin_lock(&c->erase_completion_lock); 228 mutex_lock(&c->alloc_sem);
229 spin_lock(&c->erase_completion_lock);
228 } 230 }
229 231
230 /* First, work out which block we're garbage-collecting */ 232 /* First, work out which block we're garbage-collecting */
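The point of the reshuffle: jffs2_erase_pending_blocks() can block for a long time, so both alloc_sem and the spinlock are dropped before calling it and retaken only on the no-progress path, sleeping lock first. A runnable pthread model of that unlock-around-blocking-work shape (lock names mirror the kernel ones, the rest is invented):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t alloc_sem = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;

    static int erase_pending(void) { return 1; } /* stands in for slow work */

    static int gc_pass(void)
    {
        pthread_mutex_lock(&alloc_sem);
        pthread_mutex_lock(&completion_lock);

        /* Drop both locks, innermost first, before the slow call ... */
        pthread_mutex_unlock(&completion_lock);
        pthread_mutex_unlock(&alloc_sem);

        if (erase_pending())
            return 0;

        /* ... and retake them in acquisition order to continue. */
        pthread_mutex_lock(&alloc_sem);
        pthread_mutex_lock(&completion_lock);
        pthread_mutex_unlock(&completion_lock);
        pthread_mutex_unlock(&alloc_sem);
        return 1;
    }

    int main(void)
    {
        printf("gc_pass -> %d\n", gc_pass());
        return 0;
    }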
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index 9d41f43e47b..859a598af02 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index c6923da9826..2e4a86763c0 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 85ef6dbb1be..f864005de64 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -99,6 +100,7 @@ struct jffs2_sb_info {
99 wait_queue_head_t erase_wait; /* For waiting for erases to complete */ 100 wait_queue_head_t erase_wait; /* For waiting for erases to complete */
100 101
101 wait_queue_head_t inocache_wq; 102 wait_queue_head_t inocache_wq;
103 int inocache_hashsize;
102 struct jffs2_inode_cache **inocache_list; 104 struct jffs2_inode_cache **inocache_list;
103 spinlock_t inocache_lock; 105 spinlock_t inocache_lock;
104 106
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index af02bd13846..5e03233c236 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -420,7 +420,7 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t
420{ 420{
421 struct jffs2_inode_cache *ret; 421 struct jffs2_inode_cache *ret;
422 422
423 ret = c->inocache_list[ino % INOCACHE_HASHSIZE]; 423 ret = c->inocache_list[ino % c->inocache_hashsize];
424 while (ret && ret->ino < ino) { 424 while (ret && ret->ino < ino) {
425 ret = ret->next; 425 ret = ret->next;
426 } 426 }
@@ -441,7 +441,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new
441 441
442 dbg_inocache("add %p (ino #%u)\n", new, new->ino); 442 dbg_inocache("add %p (ino #%u)\n", new, new->ino);
443 443
444 prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE]; 444 prev = &c->inocache_list[new->ino % c->inocache_hashsize];
445 445
446 while ((*prev) && (*prev)->ino < new->ino) { 446 while ((*prev) && (*prev)->ino < new->ino) {
447 prev = &(*prev)->next; 447 prev = &(*prev)->next;
@@ -462,7 +462,7 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
462 dbg_inocache("del %p (ino #%u)\n", old, old->ino); 462 dbg_inocache("del %p (ino #%u)\n", old, old->ino);
463 spin_lock(&c->inocache_lock); 463 spin_lock(&c->inocache_lock);
464 464
465 prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE]; 465 prev = &c->inocache_list[old->ino % c->inocache_hashsize];
466 466
467 while ((*prev) && (*prev)->ino < old->ino) { 467 while ((*prev) && (*prev)->ino < old->ino) {
468 prev = &(*prev)->next; 468 prev = &(*prev)->next;
@@ -487,7 +487,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
487 int i; 487 int i;
488 struct jffs2_inode_cache *this, *next; 488 struct jffs2_inode_cache *this, *next;
489 489
490 for (i=0; i<INOCACHE_HASHSIZE; i++) { 490 for (i=0; i < c->inocache_hashsize; i++) {
491 this = c->inocache_list[i]; 491 this = c->inocache_list[i];
492 while (this) { 492 while (this) {
493 next = this->next; 493 next = this->next;
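All four hunks are the same mechanical change: the bucket count comes from the superblock instead of a compile-time constant. The chains themselves work as before; each bucket holds a list sorted by inode number so a lookup can stop early. A compact userspace model of that lookup (types and harness invented for illustration):

    #include <stdint.h>
    #include <stdio.h>
    #include <stddef.h>

    struct icache {
        uint32_t ino;
        struct icache *next;
    };

    static struct icache *ic_lookup(struct icache **list, int hashsize,
                                    uint32_t ino)
    {
        struct icache *p = list[ino % hashsize];

        /* Sorted chain: stop at the first entry with ino >= target. */
        while (p && p->ino < ino)
            p = p->next;
        return (p && p->ino == ino) ? p : NULL;
    }

    int main(void)
    {
        struct icache b = { .ino = 7, .next = NULL };
        struct icache a = { .ino = 3, .next = &b };
        struct icache *buckets[4] = { NULL };

        buckets[3 % 4] = &a;  /* ino 3 and ino 7 share bucket 3 here */
        printf("found ino %u\n", ic_lookup(buckets, 4, 7)->ino);
        return 0;
    }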
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index a881a42f19e..5a53d9bdb2b 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -24,7 +24,6 @@
24#ifdef __ECOS 24#ifdef __ECOS
25#include "os-ecos.h" 25#include "os-ecos.h"
26#else 26#else
27#include <linux/mtd/compatmac.h> /* For compatibility with older kernels */
28#include "os-linux.h" 27#include "os-linux.h"
29#endif 28#endif
30 29
@@ -200,7 +199,8 @@ struct jffs2_inode_cache {
200#define RAWNODE_CLASS_XATTR_DATUM 1 199#define RAWNODE_CLASS_XATTR_DATUM 1
201#define RAWNODE_CLASS_XATTR_REF 2 200#define RAWNODE_CLASS_XATTR_REF 2
202 201
203#define INOCACHE_HASHSIZE 128 202#define INOCACHE_HASHSIZE_MIN 128
203#define INOCACHE_HASHSIZE_MAX 1024
204 204
205#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size) 205#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
206 206
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 4791aacf308..00bae7cc2e4 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -171,7 +171,7 @@ extern const struct inode_operations jffs2_symlink_inode_operations;
171int jffs2_setattr (struct dentry *, struct iattr *); 171int jffs2_setattr (struct dentry *, struct iattr *);
172int jffs2_do_setattr (struct inode *, struct iattr *); 172int jffs2_do_setattr (struct inode *, struct iattr *);
173struct inode *jffs2_iget(struct super_block *, unsigned long); 173struct inode *jffs2_iget(struct super_block *, unsigned long);
174void jffs2_clear_inode (struct inode *); 174void jffs2_evict_inode (struct inode *);
175void jffs2_dirty_inode(struct inode *inode); 175void jffs2_dirty_inode(struct inode *inode);
176struct inode *jffs2_new_inode (struct inode *dir_i, int mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 46f870d1cc3..b632dddcb48 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -20,7 +20,7 @@
20#include "summary.h" 20#include "summary.h"
21#include "debug.h" 21#include "debug.h"
22 22
23#define DEFAULT_EMPTY_SCAN_SIZE 1024 23#define DEFAULT_EMPTY_SCAN_SIZE 256
24 24
25#define noisy_printk(noise, args...) do { \ 25#define noisy_printk(noise, args...) do { \
26 if (*(noise)) { \ 26 if (*(noise)) { \
@@ -435,7 +435,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
435 unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) { 435 unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
436 struct jffs2_unknown_node *node; 436 struct jffs2_unknown_node *node;
437 struct jffs2_unknown_node crcnode; 437 struct jffs2_unknown_node crcnode;
438 uint32_t ofs, prevofs; 438 uint32_t ofs, prevofs, max_ofs;
439 uint32_t hdr_crc, buf_ofs, buf_len; 439 uint32_t hdr_crc, buf_ofs, buf_len;
440 int err; 440 int err;
441 int noise = 0; 441 int noise = 0;
@@ -550,12 +550,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
550 550
551 /* We temporarily use 'ofs' as a pointer into the buffer/jeb */ 551 /* We temporarily use 'ofs' as a pointer into the buffer/jeb */
552 ofs = 0; 552 ofs = 0;
553 553 max_ofs = EMPTY_SCAN_SIZE(c->sector_size);
554 /* Scan only 4KiB of 0xFF before declaring it's empty */ 554 /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */
555 while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) 555 while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
556 ofs += 4; 556 ofs += 4;
557 557
558 if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) { 558 if (ofs == max_ofs) {
559#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 559#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
560 if (jffs2_cleanmarker_oob(c)) { 560 if (jffs2_cleanmarker_oob(c)) {
561 /* scan oob, take care of cleanmarker */ 561 /* scan oob, take care of cleanmarker */
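DEFAULT_EMPTY_SCAN_SIZE drops from 1024 to 256 bytes, and the loop bound is hoisted into max_ofs. Assuming EMPTY_SCAN_SIZE() clamps the scan window to the sector size (its definition is earlier in scan.c and not shown in this hunk), the core check behaves like this self-contained sketch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Assumed shape of the clamp -- labelled as such; see scan.c. */
    #define DEFAULT_EMPTY_SCAN_SIZE 256
    #define EMPTY_SCAN_SIZE(sz) \
        ((sz) < DEFAULT_EMPTY_SCAN_SIZE ? (sz) : DEFAULT_EMPTY_SCAN_SIZE)

    static bool looks_empty(const unsigned char *buf, uint32_t sector_size)
    {
        uint32_t ofs, max_ofs = EMPTY_SCAN_SIZE(sector_size);

        /* Declare the block empty if a max_ofs-byte prefix is all 0xFF. */
        for (ofs = 0; ofs + 4 <= max_ofs; ofs += 4)
            if (memcmp(&buf[ofs], "\xff\xff\xff\xff", 4) != 0)
                return false;
        return true;
    }

    int main(void)
    {
        unsigned char sector[4096];

        memset(sector, 0xFF, sizeof(sector));
        printf("%s\n", looks_empty(sector, sizeof(sector)) ? "empty" : "dirty");
        return 0;
    }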
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 511e2d609d1..c86041b866a 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/list.h> 16#include <linux/list.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
@@ -135,7 +134,7 @@ static const struct super_operations jffs2_super_operations =
135 .write_super = jffs2_write_super, 134 .write_super = jffs2_write_super,
136 .statfs = jffs2_statfs, 135 .statfs = jffs2_statfs,
137 .remount_fs = jffs2_remount_fs, 136 .remount_fs = jffs2_remount_fs,
138 .clear_inode = jffs2_clear_inode, 137 .evict_inode = jffs2_evict_inode,
139 .dirty_inode = jffs2_dirty_inode, 138 .dirty_inode = jffs2_dirty_inode,
140 .sync_fs = jffs2_sync_fs, 139 .sync_fs = jffs2_sync_fs,
141}; 140};
@@ -146,6 +145,7 @@ static const struct super_operations jffs2_super_operations =
146static int jffs2_fill_super(struct super_block *sb, void *data, int silent) 145static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
147{ 146{
148 struct jffs2_sb_info *c; 147 struct jffs2_sb_info *c;
148 int ret;
149 149
150 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():" 150 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
151 " New superblock for device %d (\"%s\")\n", 151 " New superblock for device %d (\"%s\")\n",
@@ -175,15 +175,15 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
175#ifdef CONFIG_JFFS2_FS_POSIX_ACL 175#ifdef CONFIG_JFFS2_FS_POSIX_ACL
176 sb->s_flags |= MS_POSIXACL; 176 sb->s_flags |= MS_POSIXACL;
177#endif 177#endif
178 return jffs2_do_fill_super(sb, data, silent); 178 ret = jffs2_do_fill_super(sb, data, silent);
179 return ret;
179} 180}
180 181
181static int jffs2_get_sb(struct file_system_type *fs_type, 182static struct dentry *jffs2_mount(struct file_system_type *fs_type,
182 int flags, const char *dev_name, 183 int flags, const char *dev_name,
183 void *data, struct vfsmount *mnt) 184 void *data)
184{ 185{
185 return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super, 186 return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
186 mnt);
187} 187}
188 188
189static void jffs2_put_super (struct super_block *sb) 189static void jffs2_put_super (struct super_block *sb)
@@ -192,8 +192,6 @@ static void jffs2_put_super (struct super_block *sb)
192 192
193 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); 193 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
194 194
195 lock_kernel();
196
197 if (sb->s_dirt) 195 if (sb->s_dirt)
198 jffs2_write_super(sb); 196 jffs2_write_super(sb);
199 197
@@ -215,8 +213,6 @@ static void jffs2_put_super (struct super_block *sb)
215 if (c->mtd->sync) 213 if (c->mtd->sync)
216 c->mtd->sync(c->mtd); 214 c->mtd->sync(c->mtd);
217 215
218 unlock_kernel();
219
220 D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); 216 D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
221} 217}
222 218
@@ -232,7 +228,7 @@ static void jffs2_kill_sb(struct super_block *sb)
232static struct file_system_type jffs2_fs_type = { 228static struct file_system_type jffs2_fs_type = {
233 .owner = THIS_MODULE, 229 .owner = THIS_MODULE,
234 .name = "jffs2", 230 .name = "jffs2",
235 .get_sb = jffs2_get_sb, 231 .mount = jffs2_mount,
236 .kill_sb = jffs2_kill_sb, 232 .kill_sb = jffs2_kill_sb,
237}; 233};
238 234
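
The jffs2/super.c hunks above are one instance of a tree-wide conversion in this series: the old ->get_sb() filled in a caller-supplied struct vfsmount, while the new ->mount() simply returns the root dentry (or an ERR_PTR). A minimal sketch of the same conversion for a block-device filesystem; "foofs" and foofs_fill_super() are hypothetical stand-ins, not symbols from this tree:

/*
 * Sketch of the ->get_sb to ->mount conversion.  mount_bdev() replaces
 * get_sb_bdev(); mtd-backed filesystems such as jffs2 use mount_mtd().
 */
static struct dentry *foofs_mount(struct file_system_type *fs_type,
				  int flags, const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, foofs_fill_super);
}

static struct file_system_type foofs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "foofs",
	.mount		= foofs_mount,		/* was: .get_sb = foofs_get_sb */
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
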
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index d258e261bdc..9b572ca40a4 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -588,7 +588,7 @@ static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
588 588
589void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) 589void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
590{ 590{
591 /* It's called from jffs2_clear_inode() on inode removing. 591 /* It's called from jffs2_evict_inode() on inode removing.
592 When an inode with XATTR is removed, those XATTRs must be removed. */ 592 When an inode with XATTR is removed, those XATTRs must be removed. */
593 struct jffs2_xattr_ref *ref, *_ref; 593 struct jffs2_xattr_ref *ref, *_ref;
594 594
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 127263cc865..c5ce6c1d1ff 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -17,6 +17,7 @@
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20#include <linux/mm.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/quotaops.h> 22#include <linux/quotaops.h>
22#include "jfs_incore.h" 23#include "jfs_incore.h"
@@ -107,11 +108,18 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
107 return rc; 108 return rc;
108 } 109 }
109 110
110 rc = inode_setattr(inode, iattr); 111 if ((iattr->ia_valid & ATTR_SIZE) &&
112 iattr->ia_size != i_size_read(inode)) {
113 rc = vmtruncate(inode, iattr->ia_size);
114 if (rc)
115 return rc;
116 }
111 117
112 if (!rc && (iattr->ia_valid & ATTR_MODE)) 118 setattr_copy(inode, iattr);
113 rc = jfs_acl_chmod(inode); 119 mark_inode_dirty(inode);
114 120
121 if (iattr->ia_valid & ATTR_MODE)
122 rc = jfs_acl_chmod(inode);
115 return rc; 123 return rc;
116} 124}
117 125
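
The jfs_setattr() hunk above shows the pattern that replaces the removed inode_setattr(): handle ATTR_SIZE explicitly with vmtruncate(), then copy the remaining attributes with setattr_copy() and dirty the inode. As a self-contained sketch under the same assumptions (hypothetical "foofs", no ACL or quota work):

static int foofs_setattr(struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = dentry->d_inode;
	int rc;

	rc = inode_change_ok(inode, iattr);
	if (rc)
		return rc;

	/* size changes first: page cache and on-disk blocks */
	if ((iattr->ia_valid & ATTR_SIZE) &&
	    iattr->ia_size != i_size_read(inode)) {
		rc = vmtruncate(inode, iattr->ia_size);
		if (rc)
			return rc;
	}

	/* then uid/gid/mode/times into the in-core inode */
	setattr_copy(inode, iattr);
	mark_inode_dirty(inode);
	return 0;
}
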
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index ed9ba6fe04f..9978803ceed 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -145,31 +145,32 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
145 return 0; 145 return 0;
146} 146}
147 147
148void jfs_delete_inode(struct inode *inode) 148void jfs_evict_inode(struct inode *inode)
149{ 149{
150 jfs_info("In jfs_delete_inode, inode = 0x%p", inode); 150 jfs_info("In jfs_evict_inode, inode = 0x%p", inode);
151 151
152 if (!is_bad_inode(inode)) 152 if (!inode->i_nlink && !is_bad_inode(inode)) {
153 dquot_initialize(inode); 153 dquot_initialize(inode);
154 154
155 if (!is_bad_inode(inode) && 155 if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
156 (JFS_IP(inode)->fileset == FILESYSTEM_I)) { 156 truncate_inode_pages(&inode->i_data, 0);
157 truncate_inode_pages(&inode->i_data, 0);
158 157
159 if (test_cflag(COMMIT_Freewmap, inode)) 158 if (test_cflag(COMMIT_Freewmap, inode))
160 jfs_free_zero_link(inode); 159 jfs_free_zero_link(inode);
161 160
162 diFree(inode); 161 diFree(inode);
163 162
164 /* 163 /*
165 * Free the inode from the quota allocation. 164 * Free the inode from the quota allocation.
166 */ 165 */
167 dquot_initialize(inode); 166 dquot_initialize(inode);
168 dquot_free_inode(inode); 167 dquot_free_inode(inode);
169 dquot_drop(inode); 168 }
169 } else {
170 truncate_inode_pages(&inode->i_data, 0);
170 } 171 }
171 172 end_writeback(inode);
172 clear_inode(inode); 173 dquot_drop(inode);
173} 174}
174 175
175void jfs_dirty_inode(struct inode *inode) 176void jfs_dirty_inode(struct inode *inode)
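
The hunk above merges the old ->delete_inode()/->clear_inode() pair into a single ->evict_inode(). Since delete_inode was only ever called for unlinked inodes while evict_inode sees every eviction, the i_nlink test moves into the method, and end_writeback() plus the former clear_inode work now run unconditionally at the end. The resulting shape, reduced to a sketch with the jfs-specific freeing elided:

static void foofs_evict_inode(struct inode *inode)
{
	if (!inode->i_nlink && !is_bad_inode(inode)) {
		/* last link gone: release on-disk state */
		truncate_inode_pages(&inode->i_data, 0);
		/* ... free blocks, inode number, quota charge ... */
	} else {
		/* ordinary eviction: drop cached pages only */
		truncate_inode_pages(&inode->i_data, 0);
	}
	end_writeback(inode);		/* replaces clear_inode() */
	dquot_drop(inode);		/* former ->clear_inode() work */
}
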
@@ -303,8 +304,17 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
303 loff_t pos, unsigned len, unsigned flags, 304 loff_t pos, unsigned len, unsigned flags,
304 struct page **pagep, void **fsdata) 305 struct page **pagep, void **fsdata)
305{ 306{
306 return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 307 int ret;
308
309 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
307 jfs_get_block); 310 jfs_get_block);
311 if (unlikely(ret)) {
312 loff_t isize = mapping->host->i_size;
313 if (pos + len > isize)
314 vmtruncate(mapping->host, isize);
315 }
316
317 return ret;
308} 318}
309 319
310static sector_t jfs_bmap(struct address_space *mapping, sector_t block) 320static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
@@ -317,9 +327,24 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
317{ 327{
318 struct file *file = iocb->ki_filp; 328 struct file *file = iocb->ki_filp;
319 struct inode *inode = file->f_mapping->host; 329 struct inode *inode = file->f_mapping->host;
330 ssize_t ret;
320 331
321 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 332 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
322 offset, nr_segs, jfs_get_block, NULL); 333 offset, nr_segs, jfs_get_block, NULL);
334
335 /*
336 * In case of error extending write may have instantiated a few
337 * blocks outside i_size. Trim these off again.
338 */
339 if (unlikely((rw & WRITE) && ret < 0)) {
340 loff_t isize = i_size_read(inode);
341 loff_t end = offset + iov_length(iov, nr_segs);
342
343 if (end > isize)
344 vmtruncate(inode, isize);
345 }
346
347 return ret;
323} 348}
324 349
325const struct address_space_operations jfs_aops = { 350const struct address_space_operations jfs_aops = {
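
The jfs_write_begin() and jfs_direct_IO() hunks above apply one rule twice: when a write-path helper fails partway through an extending write, blocks may already have been instantiated past i_size and must be trimmed back. The trim step as a sketch, with the surrounding write path elided:

static void foofs_write_failed(struct inode *inode, loff_t to)
{
	loff_t isize = i_size_read(inode);

	if (to > isize)
		vmtruncate(inode, isize);	/* drop blocks past EOF */
}
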
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb..3a09423b6c2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
497 * appear hashed, but do not put on any lists. hlist_del() 497 * appear hashed, but do not put on any lists. hlist_del()
498 * will work fine and require no locking. 498 * will work fine and require no locking.
499 */ 499 */
500 ip->i_hash.pprev = &ip->i_hash.next; 500 hlist_add_fake(&ip->i_hash);
501 501
502 return (ip); 502 return (ip);
503} 503}
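
hlist_add_fake() is a small <linux/list.h> helper introduced alongside these conversions; it packages exactly the trick the old line open-coded, so the intent is visible at the call site. Its definition, as added in this window:

static inline void hlist_add_fake(struct hlist_node *n)
{
	/* make the node look hashed without putting it on any list,
	 * so a later hlist_del() works without locking */
	n->pprev = &n->next;
}
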
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 11042b1f44b..155e91eff07 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -27,7 +27,7 @@ extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode *, struct writeback_control *); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_delete_inode(struct inode *); 30extern void jfs_evict_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
33extern void jfs_truncate_nolock(struct inode *, loff_t); 33extern void jfs_truncate_nolock(struct inode *, loff_t);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c51af2a1451..e1b8493b9aa 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1010,15 +1010,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
1010 * option 2 - shutdown file systems 1010 * option 2 - shutdown file systems
1011 * associated with log ? 1011 * associated with log ?
1012 * option 3 - extend log ? 1012 * option 3 - extend log ?
1013 */
1014 /*
1015 * option 4 - second chance 1013 * option 4 - second chance
1016 * 1014 *
1017 * mark log wrapped, and continue. 1015 * mark log wrapped, and continue.
1018 * when all active transactions are completed, 1016 * when all active transactions are completed,
1019 * mark log vaild for recovery. 1017 * mark log valid for recovery.
1020 * if crashed during invalid state, log state 1018 * if crashed during invalid state, log state
1021 * implies invald log, forcing fsck(). 1019 * implies invalid log, forcing fsck().
1022 */ 1020 */
1023 /* mark log state log wrap in log superblock */ 1021 /* mark log state log wrap in log superblock */
1024 /* log->state = LOGWRAP; */ 1022 /* log->state = LOGWRAP; */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 7b698f2ec45..9895595fd2f 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -97,7 +97,7 @@ int jfs_mount(struct super_block *sb)
97 97
98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); 98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
99 if (ipaimap == NULL) { 99 if (ipaimap == NULL) {
100 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 100 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
101 rc = -EIO; 101 rc = -EIO;
102 goto errout20; 102 goto errout20;
103 } 103 }
@@ -148,7 +148,7 @@ int jfs_mount(struct super_block *sb)
148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { 148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); 149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
150 if (!ipaimap2) { 150 if (!ipaimap2) {
151 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 151 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
152 rc = -EIO; 152 rc = -EIO;
153 goto errout35; 153 goto errout35;
154 } 154 }
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b44..9466957ec84 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1279 * lazy commit thread finishes processing 1279 * lazy commit thread finishes processing
1280 */ 1280 */
1281 if (tblk->xflag & COMMIT_DELETE) { 1281 if (tblk->xflag & COMMIT_DELETE) {
1282 atomic_inc(&tblk->u.ip->i_count); 1282 ihold(tblk->u.ip);
1283 /* 1283 /*
1284 * Avoid a rare deadlock 1284 * Avoid a rare deadlock
1285 * 1285 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675b..231ca4af9bc 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry,
839 ip->i_ctime = CURRENT_TIME; 839 ip->i_ctime = CURRENT_TIME;
840 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 840 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
841 mark_inode_dirty(dir); 841 mark_inode_dirty(dir);
842 atomic_inc(&ip->i_count); 842 ihold(ip);
843 843
844 iplist[0] = ip; 844 iplist[0] = ip;
845 iplist[1] = dir; 845 iplist[1] = dir;
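
Both bare i_count bumps above become ihold(). The helper (added to fs/inode.c in this window) is the same atomic increment plus a sanity check that the caller really did hold a reference already:

void ihold(struct inode *inode)
{
	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}
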
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b38f96bef82..0669fc1cc3b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -33,7 +33,6 @@
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <asm/uaccess.h> 34#include <asm/uaccess.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/smp_lock.h>
37 36
38#include "jfs_incore.h" 37#include "jfs_incore.h"
39#include "jfs_filsys.h" 38#include "jfs_filsys.h"
@@ -132,11 +131,6 @@ static void jfs_destroy_inode(struct inode *inode)
132 kmem_cache_free(jfs_inode_cachep, ji); 131 kmem_cache_free(jfs_inode_cachep, ji);
133} 132}
134 133
135static void jfs_clear_inode(struct inode *inode)
136{
137 dquot_drop(inode);
138}
139
140static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 134static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
141{ 135{
142 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); 136 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -181,8 +175,6 @@ static void jfs_put_super(struct super_block *sb)
181 175
182 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 176 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
183 177
184 lock_kernel();
185
186 rc = jfs_umount(sb); 178 rc = jfs_umount(sb);
187 if (rc) 179 if (rc)
188 jfs_err("jfs_umount failed with return code %d", rc); 180 jfs_err("jfs_umount failed with return code %d", rc);
@@ -193,8 +185,6 @@ static void jfs_put_super(struct super_block *sb)
193 iput(sbi->direct_inode); 185 iput(sbi->direct_inode);
194 186
195 kfree(sbi); 187 kfree(sbi);
196
197 unlock_kernel();
198} 188}
199 189
200enum { 190enum {
@@ -374,19 +364,16 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
374 if (!parse_options(data, sb, &newLVSize, &flag)) { 364 if (!parse_options(data, sb, &newLVSize, &flag)) {
375 return -EINVAL; 365 return -EINVAL;
376 } 366 }
377 lock_kernel(); 367
378 if (newLVSize) { 368 if (newLVSize) {
379 if (sb->s_flags & MS_RDONLY) { 369 if (sb->s_flags & MS_RDONLY) {
380 printk(KERN_ERR 370 printk(KERN_ERR
381 "JFS: resize requires volume to be mounted read-write\n"); 371 "JFS: resize requires volume to be mounted read-write\n");
382 unlock_kernel();
383 return -EROFS; 372 return -EROFS;
384 } 373 }
385 rc = jfs_extendfs(sb, newLVSize, 0); 374 rc = jfs_extendfs(sb, newLVSize, 0);
386 if (rc) { 375 if (rc)
387 unlock_kernel();
388 return rc; 376 return rc;
389 }
390 } 377 }
391 378
392 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 379 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -402,36 +389,30 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
402 /* mark the fs r/w for quota activity */ 389 /* mark the fs r/w for quota activity */
403 sb->s_flags &= ~MS_RDONLY; 390 sb->s_flags &= ~MS_RDONLY;
404 391
405 unlock_kernel();
406 dquot_resume(sb, -1); 392 dquot_resume(sb, -1);
407 return ret; 393 return ret;
408 } 394 }
409 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 395 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
410 rc = dquot_suspend(sb, -1); 396 rc = dquot_suspend(sb, -1);
411 if (rc < 0) { 397 if (rc < 0) {
412 unlock_kernel();
413 return rc; 398 return rc;
414 } 399 }
415 rc = jfs_umount_rw(sb); 400 rc = jfs_umount_rw(sb);
416 JFS_SBI(sb)->flag = flag; 401 JFS_SBI(sb)->flag = flag;
417 unlock_kernel();
418 return rc; 402 return rc;
419 } 403 }
420 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) 404 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
421 if (!(sb->s_flags & MS_RDONLY)) { 405 if (!(sb->s_flags & MS_RDONLY)) {
422 rc = jfs_umount_rw(sb); 406 rc = jfs_umount_rw(sb);
423 if (rc) { 407 if (rc)
424 unlock_kernel();
425 return rc; 408 return rc;
426 } 409
427 JFS_SBI(sb)->flag = flag; 410 JFS_SBI(sb)->flag = flag;
428 ret = jfs_mount_rw(sb, 1); 411 ret = jfs_mount_rw(sb, 1);
429 unlock_kernel();
430 return ret; 412 return ret;
431 } 413 }
432 JFS_SBI(sb)->flag = flag; 414 JFS_SBI(sb)->flag = flag;
433 415
434 unlock_kernel();
435 return 0; 416 return 0;
436} 417}
437 418
@@ -451,6 +432,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
451 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); 432 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
452 if (!sbi) 433 if (!sbi)
453 return -ENOMEM; 434 return -ENOMEM;
435
454 sb->s_fs_info = sbi; 436 sb->s_fs_info = sbi;
455 sbi->sb = sb; 437 sbi->sb = sb;
456 sbi->uid = sbi->gid = sbi->umask = -1; 438 sbi->uid = sbi->gid = sbi->umask = -1;
@@ -601,11 +583,10 @@ static int jfs_unfreeze(struct super_block *sb)
601 return 0; 583 return 0;
602} 584}
603 585
604static int jfs_get_sb(struct file_system_type *fs_type, 586static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
605 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 587 int flags, const char *dev_name, void *data)
606{ 588{
607 return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super, 589 return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
608 mnt);
609} 590}
610 591
611static int jfs_sync_fs(struct super_block *sb, int wait) 592static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -765,8 +746,7 @@ static const struct super_operations jfs_super_operations = {
765 .destroy_inode = jfs_destroy_inode, 746 .destroy_inode = jfs_destroy_inode,
766 .dirty_inode = jfs_dirty_inode, 747 .dirty_inode = jfs_dirty_inode,
767 .write_inode = jfs_write_inode, 748 .write_inode = jfs_write_inode,
768 .delete_inode = jfs_delete_inode, 749 .evict_inode = jfs_evict_inode,
769 .clear_inode = jfs_clear_inode,
770 .put_super = jfs_put_super, 750 .put_super = jfs_put_super,
771 .sync_fs = jfs_sync_fs, 751 .sync_fs = jfs_sync_fs,
772 .freeze_fs = jfs_freeze, 752 .freeze_fs = jfs_freeze,
@@ -789,7 +769,7 @@ static const struct export_operations jfs_export_operations = {
789static struct file_system_type jfs_fs_type = { 769static struct file_system_type jfs_fs_type = {
790 .owner = THIS_MODULE, 770 .owner = THIS_MODULE,
791 .name = "jfs", 771 .name = "jfs",
792 .get_sb = jfs_get_sb, 772 .mount = jfs_do_mount,
793 .kill_sb = kill_block_super, 773 .kill_sb = kill_block_super,
794 .fs_flags = FS_REQUIRES_DEV, 774 .fs_flags = FS_REQUIRES_DEV,
795}; 775};
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fa96bbb2634..2d7f165d0f1 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -86,46 +86,25 @@ struct ea_buffer {
86#define EA_MALLOC 0x0008 86#define EA_MALLOC 0x0008
87 87
88 88
89static int is_known_namespace(const char *name)
90{
91 if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
92 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
93 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
94 strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
95 return false;
96
97 return true;
98}
99
89/* 100/*
90 * These three routines are used to recognize on-disk extended attributes 101 * These three routines are used to recognize on-disk extended attributes
91 * that are in a recognized namespace. If the attribute is not recognized, 102 * that are in a recognized namespace. If the attribute is not recognized,
92 * "os2." is prepended to the name 103 * "os2." is prepended to the name
93 */ 104 */
94static inline int is_os2_xattr(struct jfs_ea *ea) 105static int is_os2_xattr(struct jfs_ea *ea)
95{ 106{
96 /* 107 return !is_known_namespace(ea->name);
97 * Check for "system."
98 */
99 if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) &&
100 !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
101 return false;
102 /*
103 * Check for "user."
104 */
105 if ((ea->namelen >= XATTR_USER_PREFIX_LEN) &&
106 !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
107 return false;
108 /*
109 * Check for "security."
110 */
111 if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) &&
112 !strncmp(ea->name, XATTR_SECURITY_PREFIX,
113 XATTR_SECURITY_PREFIX_LEN))
114 return false;
115 /*
116 * Check for "trusted."
117 */
118 if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) &&
119 !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
120 return false;
121 /*
122 * Add any other valid namespace prefixes here
123 */
124
125 /*
126 * We assume it's OS/2's flat namespace
127 */
128 return true;
129} 108}
130 109
131static inline int name_size(struct jfs_ea *ea) 110static inline int name_size(struct jfs_ea *ea)
@@ -764,13 +743,23 @@ static int can_set_xattr(struct inode *inode, const char *name,
764 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 743 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
765 return can_set_system_xattr(inode, name, value, value_len); 744 return can_set_system_xattr(inode, name, value, value_len);
766 745
746 if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
747 /*
748 * This makes sure that we aren't trying to set an
749 * attribute in a different namespace by prefixing it
750 * with "os2."
751 */
752 if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
753 return -EOPNOTSUPP;
754 return 0;
755 }
756
767 /* 757 /*
768 * Don't allow setting an attribute in an unknown namespace. 758 * Don't allow setting an attribute in an unknown namespace.
769 */ 759 */
770 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && 760 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
771 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && 761 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
772 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && 762 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
773 strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))
774 return -EOPNOTSUPP; 763 return -EOPNOTSUPP;
775 764
776 return 0; 765 return 0;
@@ -952,19 +941,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
952 int xattr_size; 941 int xattr_size;
953 ssize_t size; 942 ssize_t size;
954 int namelen = strlen(name); 943 int namelen = strlen(name);
955 char *os2name = NULL;
956 char *value; 944 char *value;
957 945
958 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
959 os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
960 GFP_KERNEL);
961 if (!os2name)
962 return -ENOMEM;
963 strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
964 name = os2name;
965 namelen -= XATTR_OS2_PREFIX_LEN;
966 }
967
968 down_read(&JFS_IP(inode)->xattr_sem); 946 down_read(&JFS_IP(inode)->xattr_sem);
969 947
970 xattr_size = ea_get(inode, &ea_buf, 0); 948 xattr_size = ea_get(inode, &ea_buf, 0);
@@ -1002,8 +980,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
1002 out: 980 out:
1003 up_read(&JFS_IP(inode)->xattr_sem); 981 up_read(&JFS_IP(inode)->xattr_sem);
1004 982
1005 kfree(os2name);
1006
1007 return size; 983 return size;
1008} 984}
1009 985
@@ -1012,6 +988,19 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
1012{ 988{
1013 int err; 989 int err;
1014 990
991 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
992 /*
993 * skip past "os2." prefix
994 */
995 name += XATTR_OS2_PREFIX_LEN;
996 /*
997 * Don't allow retrieving properly prefixed attributes
998 * by prepending them with "os2."
999 */
1000 if (is_known_namespace(name))
1001 return -EOPNOTSUPP;
1002 }
1003
1015 err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); 1004 err = __jfs_getxattr(dentry->d_inode, name, data, buf_size);
1016 1005
1017 return err; 1006 return err;
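
Seen from userspace, the rules above are symmetric: a legacy OS/2 attribute stored without a prefix is read back under "os2.", and the prefix cannot be used to reach a real namespace in either direction. A sketch; the mount point and attribute names are made up:

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[64];

	/* an on-disk OS/2 EA named FOO surfaces as "os2.FOO" */
	ssize_t n = getxattr("/mnt/jfs/file", "os2.FOO", buf, sizeof(buf));
	if (n >= 0)
		printf("os2.FOO = %.*s\n", (int)n, buf);

	/* EOPNOTSUPP: "user." is a known namespace, so it may not be
	 * smuggled in behind the "os2." prefix */
	if (setxattr("/mnt/jfs/file", "os2.user.x", "v", 1, 0) != 0)
		perror("setxattr os2.user.x");
	return 0;
}
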
diff --git a/fs/libfs.c b/fs/libfs.c
index dcaf972cbf1..a3accdf528a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -201,9 +201,8 @@ static const struct super_operations simple_super_operations = {
201 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that 201 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
202 * will never be mountable) 202 * will never be mountable)
203 */ 203 */
204int get_sb_pseudo(struct file_system_type *fs_type, char *name, 204struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
205 const struct super_operations *ops, unsigned long magic, 205 const struct super_operations *ops, unsigned long magic)
206 struct vfsmount *mnt)
207{ 206{
208 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 207 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
209 struct dentry *dentry; 208 struct dentry *dentry;
@@ -211,7 +210,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
211 struct qstr d_name = {.name = name, .len = strlen(name)}; 210 struct qstr d_name = {.name = name, .len = strlen(name)};
212 211
213 if (IS_ERR(s)) 212 if (IS_ERR(s))
214 return PTR_ERR(s); 213 return ERR_CAST(s);
215 214
216 s->s_flags = MS_NOUSER; 215 s->s_flags = MS_NOUSER;
217 s->s_maxbytes = MAX_LFS_FILESIZE; 216 s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -241,12 +240,11 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
241 d_instantiate(dentry, root); 240 d_instantiate(dentry, root);
242 s->s_root = dentry; 241 s->s_root = dentry;
243 s->s_flags |= MS_ACTIVE; 242 s->s_flags |= MS_ACTIVE;
244 simple_set_mnt(mnt, s); 243 return dget(s->s_root);
245 return 0;
246 244
247Enomem: 245Enomem:
248 deactivate_locked_super(s); 246 deactivate_locked_super(s);
249 return -ENOMEM; 247 return ERR_PTR(-ENOMEM);
250} 248}
251 249
252int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 250int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
@@ -255,7 +253,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
255 253
256 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 254 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
257 inc_nlink(inode); 255 inc_nlink(inode);
258 atomic_inc(&inode->i_count); 256 ihold(inode);
259 dget(dentry); 257 dget(dentry);
260 d_instantiate(dentry, inode); 258 d_instantiate(dentry, inode);
261 return 0; 259 return 0;
@@ -327,77 +325,35 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
327} 325}
328 326
329/** 327/**
330 * simple_setsize - handle core mm and vfs requirements for file size change 328 * simple_setattr - setattr for simple filesystem
331 * @inode: inode
332 * @newsize: new file size
333 *
334 * Returns 0 on success, -error on failure.
335 *
336 * simple_setsize must be called with inode_mutex held.
337 *
338 * simple_setsize will check that the requested new size is OK (see
339 * inode_newsize_ok), and then will perform the necessary i_size update
340 * and pagecache truncation (if necessary). It will be typically be called
341 * from the filesystem's setattr function when ATTR_SIZE is passed in.
342 *
343 * The inode itself must have correct permissions and attributes to allow
344 * i_size to be changed, this function then just checks that the new size
345 * requested is valid.
346 *
347 * In the case of simple in-memory filesystems with inodes stored solely
348 * in the inode cache, and file data in the pagecache, nothing more needs
349 * to be done to satisfy a truncate request. Filesystems with on-disk
350 * blocks for example will need to free them in the case of truncate, in
351 * that case it may be easier not to use simple_setsize (but each of its
352 * components will likely be required at some point to update pagecache
353 * and inode etc).
354 */
355int simple_setsize(struct inode *inode, loff_t newsize)
356{
357 loff_t oldsize;
358 int error;
359
360 error = inode_newsize_ok(inode, newsize);
361 if (error)
362 return error;
363
364 oldsize = inode->i_size;
365 i_size_write(inode, newsize);
366 truncate_pagecache(inode, oldsize, newsize);
367
368 return error;
369}
370EXPORT_SYMBOL(simple_setsize);
371
372/**
373 * simple_setattr - setattr for simple in-memory filesystem
374 * @dentry: dentry 329 * @dentry: dentry
375 * @iattr: iattr structure 330 * @iattr: iattr structure
376 * 331 *
377 * Returns 0 on success, -error on failure. 332 * Returns 0 on success, -error on failure.
378 * 333 *
379 * simple_setattr implements setattr for an in-memory filesystem which 334 * simple_setattr is a simple ->setattr implementation without a proper
380 * does not store its own file data or metadata (eg. uses the page cache 335 * implementation of size changes.
381 * and inode cache as its data store). 336 *
337 * It can either be used for in-memory filesystems or special files
338 * on simple regular filesystems. Anything that needs to change on-disk
339 * or wire state on size changes needs its own setattr method.
382 */ 340 */
383int simple_setattr(struct dentry *dentry, struct iattr *iattr) 341int simple_setattr(struct dentry *dentry, struct iattr *iattr)
384{ 342{
385 struct inode *inode = dentry->d_inode; 343 struct inode *inode = dentry->d_inode;
386 int error; 344 int error;
387 345
346 WARN_ON_ONCE(inode->i_op->truncate);
347
388 error = inode_change_ok(inode, iattr); 348 error = inode_change_ok(inode, iattr);
389 if (error) 349 if (error)
390 return error; 350 return error;
391 351
392 if (iattr->ia_valid & ATTR_SIZE) { 352 if (iattr->ia_valid & ATTR_SIZE)
393 error = simple_setsize(inode, iattr->ia_size); 353 truncate_setsize(inode, iattr->ia_size);
394 if (error) 354 setattr_copy(inode, iattr);
395 return error; 355 mark_inode_dirty(inode);
396 } 356 return 0;
397
398 generic_setattr(inode, iattr);
399
400 return error;
401} 357}
402EXPORT_SYMBOL(simple_setattr); 358EXPORT_SYMBOL(simple_setattr);
403 359
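
simple_setsize() does not vanish so much as move: its body survives as truncate_setsize() in mm/truncate.c, and the inode_newsize_ok() check is folded into inode_change_ok(), which is why simple_setattr() above can call truncate_setsize() unconditionally after inode_change_ok() succeeds. The helper as introduced in this window, to the best of this reading:

void truncate_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize;

	oldsize = inode->i_size;
	i_size_write(inode, newsize);

	truncate_pagecache(inode, oldsize, newsize);
}
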
@@ -934,10 +890,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
934 */ 890 */
935int generic_file_fsync(struct file *file, int datasync) 891int generic_file_fsync(struct file *file, int datasync)
936{ 892{
937 struct writeback_control wbc = {
938 .sync_mode = WB_SYNC_ALL,
939 .nr_to_write = 0, /* metadata-only; caller takes care of data */
940 };
941 struct inode *inode = file->f_mapping->host; 893 struct inode *inode = file->f_mapping->host;
942 int err; 894 int err;
943 int ret; 895 int ret;
@@ -948,13 +900,42 @@ int generic_file_fsync(struct file *file, int datasync)
948 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 900 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
949 return ret; 901 return ret;
950 902
951 err = sync_inode(inode, &wbc); 903 err = sync_inode_metadata(inode, 1);
952 if (ret == 0) 904 if (ret == 0)
953 ret = err; 905 ret = err;
954 return ret; 906 return ret;
955} 907}
956EXPORT_SYMBOL(generic_file_fsync); 908EXPORT_SYMBOL(generic_file_fsync);
957 909
910/**
911 * generic_check_addressable - Check addressability of file system
912 * @blocksize_bits: log of file system block size
913 * @num_blocks: number of blocks in file system
914 *
915 * Determine whether a file system with @num_blocks blocks (and a
916 * block size of 2**@blocksize_bits) is addressable by the sector_t
917 * and page cache of the system. Return 0 if so and -EFBIG otherwise.
918 */
919int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
920{
921 u64 last_fs_block = num_blocks - 1;
922 u64 last_fs_page =
923 last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
924
925 if (unlikely(num_blocks == 0))
926 return 0;
927
928 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
929 return -EINVAL;
930
931 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
932 (last_fs_page > (pgoff_t)(~0ULL))) {
933 return -EFBIG;
934 }
935 return 0;
936}
937EXPORT_SYMBOL(generic_check_addressable);
938
958/* 939/*
959 * No-op implementation of ->fsync for in-memory filesystems. 940 * No-op implementation of ->fsync for in-memory filesystems.
960 */ 941 */
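
sync_inode_metadata() is the helper generic_file_fsync() switches to above; it packages exactly the writeback_control the function used to build by hand. Its definition (fs/fs-writeback.c in this series):

int sync_inode_metadata(struct inode *inode, int wait)
{
	struct writeback_control wbc = {
		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
		.nr_to_write = 0, /* metadata-only */
	};

	return sync_inode(inode, &wbc);
}
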
@@ -968,7 +949,7 @@ EXPORT_SYMBOL(dcache_dir_lseek);
968EXPORT_SYMBOL(dcache_dir_open); 949EXPORT_SYMBOL(dcache_dir_open);
969EXPORT_SYMBOL(dcache_readdir); 950EXPORT_SYMBOL(dcache_readdir);
970EXPORT_SYMBOL(generic_read_dir); 951EXPORT_SYMBOL(generic_read_dir);
971EXPORT_SYMBOL(get_sb_pseudo); 952EXPORT_SYMBOL(mount_pseudo);
972EXPORT_SYMBOL(simple_write_begin); 953EXPORT_SYMBOL(simple_write_begin);
973EXPORT_SYMBOL(simple_write_end); 954EXPORT_SYMBOL(simple_write_end);
974EXPORT_SYMBOL(simple_dir_inode_operations); 955EXPORT_SYMBOL(simple_dir_inode_operations);
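
The -EFBIG cutoff in generic_check_addressable() depends only on the widths of sector_t and pgoff_t, so it can be replayed in userspace. A standalone sketch with the type widths as parameters (the kernel uses its real types and PAGE_CACHE_SHIFT; 4 KiB pages are assumed here, and the blocksize check is done before shifting):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12			/* assume 4 KiB pages */

static int check_addressable(unsigned bits, uint64_t num_blocks,
			     unsigned sector_bits, unsigned pgoff_bits)
{
	uint64_t max_sector = UINT64_MAX >> (64 - sector_bits);
	uint64_t max_pgoff  = UINT64_MAX >> (64 - pgoff_bits);
	uint64_t last_block, last_page;

	if (num_blocks == 0)
		return 0;
	if (bits < 9 || bits > PAGE_SHIFT)
		return -22;			/* -EINVAL */

	last_block = num_blocks - 1;
	last_page  = last_block >> (PAGE_SHIFT - bits);
	if (last_block > max_sector >> (bits - 9) || last_page > max_pgoff)
		return -27;			/* -EFBIG */
	return 0;
}

int main(void)
{
	/* 2^32 blocks of 4 KiB = 16 TiB: fine with 64-bit types,
	 * -EFBIG when sector_t and pgoff_t are 32 bits wide */
	printf("64-bit types: %d\n", check_addressable(12, 1ULL << 32, 64, 64));
	printf("32-bit types: %d\n", check_addressable(12, 1ULL << 32, 32, 32));
	return 0;
}
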
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 64fd427c993..d5bb86866e6 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -42,6 +42,7 @@ struct nlm_wait {
42}; 42};
43 43
44static LIST_HEAD(nlm_blocked); 44static LIST_HEAD(nlm_blocked);
45static DEFINE_SPINLOCK(nlm_blocked_lock);
45 46
46/** 47/**
47 * nlmclnt_init - Set up per-NFS mount point lockd data structures 48 * nlmclnt_init - Set up per-NFS mount point lockd data structures
@@ -97,7 +98,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *
97 block->b_lock = fl; 98 block->b_lock = fl;
98 init_waitqueue_head(&block->b_wait); 99 init_waitqueue_head(&block->b_wait);
99 block->b_status = nlm_lck_blocked; 100 block->b_status = nlm_lck_blocked;
101
102 spin_lock(&nlm_blocked_lock);
100 list_add(&block->b_list, &nlm_blocked); 103 list_add(&block->b_list, &nlm_blocked);
104 spin_unlock(&nlm_blocked_lock);
101 } 105 }
102 return block; 106 return block;
103} 107}
@@ -106,7 +110,9 @@ void nlmclnt_finish_block(struct nlm_wait *block)
106{ 110{
107 if (block == NULL) 111 if (block == NULL)
108 return; 112 return;
113 spin_lock(&nlm_blocked_lock);
109 list_del(&block->b_list); 114 list_del(&block->b_list);
115 spin_unlock(&nlm_blocked_lock);
110 kfree(block); 116 kfree(block);
111} 117}
112 118
@@ -154,6 +160,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
154 * Look up blocked request based on arguments. 160 * Look up blocked request based on arguments.
155 * Warning: must not use cookie to match it! 161 * Warning: must not use cookie to match it!
156 */ 162 */
163 spin_lock(&nlm_blocked_lock);
157 list_for_each_entry(block, &nlm_blocked, b_list) { 164 list_for_each_entry(block, &nlm_blocked, b_list) {
158 struct file_lock *fl_blocked = block->b_lock; 165 struct file_lock *fl_blocked = block->b_lock;
159 166
@@ -178,6 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
178 wake_up(&block->b_wait); 185 wake_up(&block->b_wait);
179 res = nlm_granted; 186 res = nlm_granted;
180 } 187 }
188 spin_unlock(&nlm_blocked_lock);
181 return res; 189 return res;
182} 190}
183 191
@@ -216,10 +224,6 @@ reclaimer(void *ptr)
216 allow_signal(SIGKILL); 224 allow_signal(SIGKILL);
217 225
218 down_write(&host->h_rwsem); 226 down_write(&host->h_rwsem);
219
220 /* This one ensures that our parent doesn't terminate while the
221 * reclaim is in progress */
222 lock_kernel();
223 lockd_up(); /* note: this cannot fail as lockd is already running */ 227 lockd_up(); /* note: this cannot fail as lockd is already running */
224 228
225 dprintk("lockd: reclaiming locks for host %s\n", host->h_name); 229 dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
@@ -260,16 +264,17 @@ restart:
260 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); 264 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
261 265
262 /* Now, wake up all processes that sleep on a blocked lock */ 266 /* Now, wake up all processes that sleep on a blocked lock */
267 spin_lock(&nlm_blocked_lock);
263 list_for_each_entry(block, &nlm_blocked, b_list) { 268 list_for_each_entry(block, &nlm_blocked, b_list) {
264 if (block->b_host == host) { 269 if (block->b_host == host) {
265 block->b_status = nlm_lck_denied_grace_period; 270 block->b_status = nlm_lck_denied_grace_period;
266 wake_up(&block->b_wait); 271 wake_up(&block->b_wait);
267 } 272 }
268 } 273 }
274 spin_unlock(&nlm_blocked_lock);
269 275
270 /* Release host handle after use */ 276 /* Release host handle after use */
271 nlm_release_host(host); 277 nlm_release_host(host);
272 lockd_down(); 278 lockd_down();
273 unlock_kernel();
274 return 0; 279 return 0;
275} 280}
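
The nlm_blocked changes above are the series' standard recipe for retiring the BKL: give the global list its own spinlock and bracket every add, delete and traversal with it. Reduced to a sketch with hypothetical names:

struct foo_wait {
	struct list_head b_list;
	/* ... */
};

static LIST_HEAD(foo_blocked);
static DEFINE_SPINLOCK(foo_blocked_lock);

static void foo_block_enqueue(struct foo_wait *block)
{
	spin_lock(&foo_blocked_lock);
	list_add(&block->b_list, &foo_blocked);
	spin_unlock(&foo_blocked_lock);
}

static void foo_block_dequeue(struct foo_wait *block)
{
	spin_lock(&foo_blocked_lock);
	list_del(&block->b_list);
	spin_unlock(&foo_blocked_lock);
}
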
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7932c399fab..47ea1e1925b 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -166,7 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
166 /* Set up the argument struct */ 166 /* Set up the argument struct */
167 nlmclnt_setlockargs(call, fl); 167 nlmclnt_setlockargs(call, fl);
168 168
169 lock_kernel();
170 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { 169 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
171 if (fl->fl_type != F_UNLCK) { 170 if (fl->fl_type != F_UNLCK) {
172 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; 171 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -177,10 +176,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
177 status = nlmclnt_test(call, fl); 176 status = nlmclnt_test(call, fl);
178 else 177 else
179 status = -EINVAL; 178 status = -EINVAL;
180
181 fl->fl_ops->fl_release_private(fl); 179 fl->fl_ops->fl_release_private(fl);
182 fl->fl_ops = NULL; 180 fl->fl_ops = NULL;
183 unlock_kernel();
184 181
185 dprintk("lockd: clnt proc returns %d\n", status); 182 dprintk("lockd: clnt proc returns %d\n", status);
186 return status; 183 return status;
@@ -226,9 +223,7 @@ void nlm_release_call(struct nlm_rqst *call)
226 223
227static void nlmclnt_rpc_release(void *data) 224static void nlmclnt_rpc_release(void *data)
228{ 225{
229 lock_kernel();
230 nlm_release_call(data); 226 nlm_release_call(data);
231 unlock_kernel();
232} 227}
233 228
234static int nlm_wait_on_grace(wait_queue_head_t *queue) 229static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -448,14 +443,18 @@ out:
448 443
449static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) 444static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
450{ 445{
446 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
451 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; 447 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
452 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); 448 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
453 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); 449 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
450 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
454} 451}
455 452
456static void nlmclnt_locks_release_private(struct file_lock *fl) 453static void nlmclnt_locks_release_private(struct file_lock *fl)
457{ 454{
455 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
458 list_del(&fl->fl_u.nfs_fl.list); 456 list_del(&fl->fl_u.nfs_fl.list);
457 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
459 nlm_put_lockowner(fl->fl_u.nfs_fl.owner); 458 nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
460} 459}
461 460
@@ -721,9 +720,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
721die: 720die:
722 return; 721 return;
723 retry_rebind: 722 retry_rebind:
724 lock_kernel();
725 nlm_rebind_host(req->a_host); 723 nlm_rebind_host(req->a_host);
726 unlock_kernel();
727 retry_unlock: 724 retry_unlock:
728 rpc_restart_call(task); 725 rpc_restart_call(task);
729} 726}
@@ -801,9 +798,7 @@ retry_cancel:
801 /* Don't ever retry more than 3 times */ 798 /* Don't ever retry more than 3 times */
802 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) 799 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
803 goto die; 800 goto die;
804 lock_kernel();
805 nlm_rebind_host(req->a_host); 801 nlm_rebind_host(req->a_host);
806 unlock_kernel();
807 rpc_restart_call(task); 802 rpc_restart_call(task);
808 rpc_delay(task, 30 * HZ); 803 rpc_delay(task, 30 * HZ);
809} 804}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index bb464d12104..25e21e4023b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -353,6 +353,7 @@ nlm_bind_host(struct nlm_host *host)
353 .to_retries = 5U, 353 .to_retries = 5U,
354 }; 354 };
355 struct rpc_create_args args = { 355 struct rpc_create_args args = {
356 .net = &init_net,
356 .protocol = host->h_proto, 357 .protocol = host->h_proto,
357 .address = nlm_addr(host), 358 .address = nlm_addr(host),
358 .addrsize = host->h_addrlen, 359 .addrsize = host->h_addrlen,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e3015464fba..e0c91894964 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void)
69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK), 69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
70 }; 70 };
71 struct rpc_create_args args = { 71 struct rpc_create_args args = {
72 .net = &init_net,
72 .protocol = XPRT_TRANSPORT_UDP, 73 .protocol = XPRT_TRANSPORT_UDP,
73 .address = (struct sockaddr *)&sin, 74 .address = (struct sockaddr *)&sin,
74 .addrsize = sizeof(sin), 75 .addrsize = sizeof(sin),
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index f1bacf1a039..abfff9d7979 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -22,7 +22,6 @@
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/smp.h> 24#include <linux/smp.h>
25#include <linux/smp_lock.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/kthread.h> 26#include <linux/kthread.h>
28#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -130,15 +129,6 @@ lockd(void *vrqstp)
130 129
131 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); 130 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
132 131
133 /*
134 * FIXME: it would be nice if lockd didn't spend its entire life
135 * running under the BKL. At the very least, it would be good to
136 * have someone clarify what it's intended to protect here. I've
137 * seen some handwavy posts about posix locking needing to be
138 * done under the BKL, but it's far from clear.
139 */
140 lock_kernel();
141
142 if (!nlm_timeout) 132 if (!nlm_timeout)
143 nlm_timeout = LOCKD_DFLT_TIMEO; 133 nlm_timeout = LOCKD_DFLT_TIMEO;
144 nlmsvc_timeout = nlm_timeout * HZ; 134 nlmsvc_timeout = nlm_timeout * HZ;
@@ -195,7 +185,6 @@ lockd(void *vrqstp)
195 if (nlmsvc_ops) 185 if (nlmsvc_ops)
196 nlmsvc_invalidate_all(); 186 nlmsvc_invalidate_all();
197 nlm_shutdown_hosts(); 187 nlm_shutdown_hosts();
198 unlock_kernel();
199 return 0; 188 return 0;
200} 189}
201 190
@@ -206,7 +195,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
206 195
207 xprt = svc_find_xprt(serv, name, family, 0); 196 xprt = svc_find_xprt(serv, name, family, 0);
208 if (xprt == NULL) 197 if (xprt == NULL)
209 return svc_create_xprt(serv, name, family, port, 198 return svc_create_xprt(serv, name, &init_net, family, port,
210 SVC_SOCK_DEFAULTS); 199 SVC_SOCK_DEFAULTS);
211 svc_xprt_put(xprt); 200 svc_xprt_put(xprt);
212 return 0; 201 return 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 031c6569a13..a336e832475 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -230,9 +230,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
230 230
231static void nlm4svc_callback_release(void *data) 231static void nlm4svc_callback_release(void *data)
232{ 232{
233 lock_kernel();
234 nlm_release_call(data); 233 nlm_release_call(data);
235 unlock_kernel();
236} 234}
237 235
238static const struct rpc_call_ops nlm4svc_callback_ops = { 236static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84055d31bfc..c462d346acb 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -52,12 +52,13 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
52 * The list of blocked locks to retry 52 * The list of blocked locks to retry
53 */ 53 */
54static LIST_HEAD(nlm_blocked); 54static LIST_HEAD(nlm_blocked);
55static DEFINE_SPINLOCK(nlm_blocked_lock);
55 56
56/* 57/*
57 * Insert a blocked lock into the global list 58 * Insert a blocked lock into the global list
58 */ 59 */
59static void 60static void
60nlmsvc_insert_block(struct nlm_block *block, unsigned long when) 61nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
61{ 62{
62 struct nlm_block *b; 63 struct nlm_block *b;
63 struct list_head *pos; 64 struct list_head *pos;
@@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
87 block->b_when = when; 88 block->b_when = when;
88} 89}
89 90
91static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
92{
93 spin_lock(&nlm_blocked_lock);
94 nlmsvc_insert_block_locked(block, when);
95 spin_unlock(&nlm_blocked_lock);
96}
97
90/* 98/*
91 * Remove a block from the global list 99 * Remove a block from the global list
92 */ 100 */
@@ -94,7 +102,9 @@ static inline void
94nlmsvc_remove_block(struct nlm_block *block) 102nlmsvc_remove_block(struct nlm_block *block)
95{ 103{
96 if (!list_empty(&block->b_list)) { 104 if (!list_empty(&block->b_list)) {
105 spin_lock(&nlm_blocked_lock);
97 list_del_init(&block->b_list); 106 list_del_init(&block->b_list);
107 spin_unlock(&nlm_blocked_lock);
98 nlmsvc_release_block(block); 108 nlmsvc_release_block(block);
99 } 109 }
100} 110}
@@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
651 struct nlm_block *block; 661 struct nlm_block *block;
652 int rc = -ENOENT; 662 int rc = -ENOENT;
653 663
654 lock_kernel(); 664 spin_lock(&nlm_blocked_lock);
655 list_for_each_entry(block, &nlm_blocked, b_list) { 665 list_for_each_entry(block, &nlm_blocked, b_list) {
656 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 666 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
657 dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", 667 dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
@@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
665 } else if (result == 0) 675 } else if (result == 0)
666 block->b_granted = 1; 676 block->b_granted = 1;
667 677
668 nlmsvc_insert_block(block, 0); 678 nlmsvc_insert_block_locked(block, 0);
669 svc_wake_up(block->b_daemon); 679 svc_wake_up(block->b_daemon);
670 rc = 0; 680 rc = 0;
671 break; 681 break;
672 } 682 }
673 } 683 }
674 unlock_kernel(); 684 spin_unlock(&nlm_blocked_lock);
675 if (rc == -ENOENT) 685 if (rc == -ENOENT)
676 printk(KERN_WARNING "lockd: grant for unknown block\n"); 686 printk(KERN_WARNING "lockd: grant for unknown block\n");
677 return rc; 687 return rc;
@@ -690,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl)
690 struct nlm_block *block; 700 struct nlm_block *block;
691 701
692 dprintk("lockd: VFS unblock notification for block %p\n", fl); 702 dprintk("lockd: VFS unblock notification for block %p\n", fl);
703 spin_lock(&nlm_blocked_lock);
693 list_for_each_entry(block, &nlm_blocked, b_list) { 704 list_for_each_entry(block, &nlm_blocked, b_list) {
694 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 705 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
695 nlmsvc_insert_block(block, 0); 706 nlmsvc_insert_block_locked(block, 0);
707 spin_unlock(&nlm_blocked_lock);
696 svc_wake_up(block->b_daemon); 708 svc_wake_up(block->b_daemon);
697 return; 709 return;
698 } 710 }
699 } 711 }
700 712 spin_unlock(&nlm_blocked_lock);
701 printk(KERN_WARNING "lockd: notification for unknown block!\n"); 713 printk(KERN_WARNING "lockd: notification for unknown block!\n");
702} 714}
703 715
@@ -803,7 +815,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
803 815
804 dprintk("lockd: GRANT_MSG RPC callback\n"); 816 dprintk("lockd: GRANT_MSG RPC callback\n");
805 817
806 lock_kernel(); 818 spin_lock(&nlm_blocked_lock);
807 /* if the block is not on a list at this point then it has 819 /* if the block is not on a list at this point then it has
808 * been invalidated. Don't try to requeue it. 820 * been invalidated. Don't try to requeue it.
809 * 821 *
@@ -825,19 +837,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
825 /* Call was successful, now wait for client callback */ 837 /* Call was successful, now wait for client callback */
826 timeout = 60 * HZ; 838 timeout = 60 * HZ;
827 } 839 }
828 nlmsvc_insert_block(block, timeout); 840 nlmsvc_insert_block_locked(block, timeout);
829 svc_wake_up(block->b_daemon); 841 svc_wake_up(block->b_daemon);
830out: 842out:
831 unlock_kernel(); 843 spin_unlock(&nlm_blocked_lock);
832} 844}
833 845
846/*
847 * FIXME: nlmsvc_release_block() grabs a mutex. This is not allowed for an
848 * .rpc_release rpc_call_op
849 */
834static void nlmsvc_grant_release(void *data) 850static void nlmsvc_grant_release(void *data)
835{ 851{
836 struct nlm_rqst *call = data; 852 struct nlm_rqst *call = data;
837
838 lock_kernel();
839 nlmsvc_release_block(call->a_block); 853 nlmsvc_release_block(call->a_block);
840 unlock_kernel();
841} 854}
842 855
843static const struct rpc_call_ops nlmsvc_grant_ops = { 856static const struct rpc_call_ops nlmsvc_grant_ops = {
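
Unlike the BKL, spin_lock() does not nest, so nlmsvc_insert_block() is split above: the body becomes a *_locked() variant and a thin wrapper takes the lock, letting the grant paths that already hold nlm_blocked_lock call the _locked form directly instead of deadlocking. The shape of the split, as a sketch:

static void foo_insert_locked(struct foo_block *block, unsigned long when)
{
	/* caller holds foo_blocked_lock: do the real list surgery here */
	list_add(&block->b_list, &foo_blocked);
	block->b_when = when;
}

static void foo_insert(struct foo_block *block, unsigned long when)
{
	/* wrapper for callers that do not hold the lock; a path that
	 * already holds it must use foo_insert_locked() instead */
	spin_lock(&foo_blocked_lock);
	foo_insert_locked(block, when);
	spin_unlock(&foo_blocked_lock);
}
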
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0f2ab741ae7..c3069f38d60 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -260,9 +260,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
260 260
261static void nlmsvc_callback_release(void *data) 261static void nlmsvc_callback_release(void *data)
262{ 262{
263 lock_kernel();
264 nlm_release_call(data); 263 nlm_release_call(data);
265 unlock_kernel();
266} 264}
267 265
268static const struct rpc_call_ops nlmsvc_callback_ops = { 266static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d0ef94cfb3d..1ca0679c80b 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
170 170
171again: 171again:
172 file->f_locks = 0; 172 file->f_locks = 0;
173 lock_flocks(); /* protects i_flock list */
173 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 174 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
174 if (fl->fl_lmops != &nlmsvc_lock_operations) 175 if (fl->fl_lmops != &nlmsvc_lock_operations)
175 continue; 176 continue;
@@ -181,6 +182,7 @@ again:
181 if (match(lockhost, host)) { 182 if (match(lockhost, host)) {
182 struct file_lock lock = *fl; 183 struct file_lock lock = *fl;
183 184
185 unlock_flocks();
184 lock.fl_type = F_UNLCK; 186 lock.fl_type = F_UNLCK;
185 lock.fl_start = 0; 187 lock.fl_start = 0;
186 lock.fl_end = OFFSET_MAX; 188 lock.fl_end = OFFSET_MAX;
@@ -192,6 +194,7 @@ again:
192 goto again; 194 goto again;
193 } 195 }
194 } 196 }
197 unlock_flocks();
195 198
196 return 0; 199 return 0;
197} 200}
@@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file)
226 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) 229 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
227 return 1; 230 return 1;
228 231
232 lock_flocks();
229 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 233 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
230 if (fl->fl_lmops == &nlmsvc_lock_operations) 234 if (fl->fl_lmops == &nlmsvc_lock_operations) {
235 unlock_flocks();
231 return 1; 236 return 1;
237 }
232 } 238 }
239 unlock_flocks();
233 file->f_locks = 0; 240 file->f_locks = 0;
234 return 0; 241 return 0;
235} 242}
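
A spinlock also forbids sleeping, so nlm_traverse_locks() above can no longer issue the blocking unlock while walking i_flock under lock_flocks(). Instead it copies the lock, drops the lock, does the blocking work, and restarts the walk because the list may have changed meanwhile. The pattern as a sketch; the foo_* predicate and unlock call are hypothetical:

static void foo_release_matching_locks(struct inode *inode)
{
	struct file_lock *fl, lock;

again:
	lock_flocks();
	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
		if (!foo_needs_unlocking(fl))
			continue;
		lock = *fl;			/* copy while still held */
		unlock_flocks();		/* cannot sleep under it... */
		foo_blocking_unlock(&lock);	/* ...but this call may */
		goto again;			/* list may have changed */
	}
	unlock_flocks();
}
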
diff --git a/fs/locks.c b/fs/locks.c
index ab24d49fc04..65765cb6afe 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -142,14 +142,32 @@ int lease_break_time = 45;
142 142
143static LIST_HEAD(file_lock_list); 143static LIST_HEAD(file_lock_list);
144static LIST_HEAD(blocked_list); 144static LIST_HEAD(blocked_list);
145static DEFINE_SPINLOCK(file_lock_lock);
146
147/*
148 * Protects the two list heads above, plus the inode->i_flock list
 149 * (formerly serialized by the BKL; a spinlock now that lockd is converted)
150 */
151void lock_flocks(void)
152{
153 spin_lock(&file_lock_lock);
154}
155EXPORT_SYMBOL_GPL(lock_flocks);
156
157void unlock_flocks(void)
158{
159 spin_unlock(&file_lock_lock);
160}
161EXPORT_SYMBOL_GPL(unlock_flocks);
145 162
146static struct kmem_cache *filelock_cache __read_mostly; 163static struct kmem_cache *filelock_cache __read_mostly;
147 164
148/* Allocate an empty lock structure. */ 165/* Allocate an empty lock structure. */
149static struct file_lock *locks_alloc_lock(void) 166struct file_lock *locks_alloc_lock(void)
150{ 167{
151 return kmem_cache_alloc(filelock_cache, GFP_KERNEL); 168 return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
152} 169}
170EXPORT_SYMBOL_GPL(locks_alloc_lock);
153 171
154void locks_release_private(struct file_lock *fl) 172void locks_release_private(struct file_lock *fl)
155{ 173{
@@ -168,7 +186,7 @@ void locks_release_private(struct file_lock *fl)
168EXPORT_SYMBOL_GPL(locks_release_private); 186EXPORT_SYMBOL_GPL(locks_release_private);
169 187
170/* Free a lock which is not in use. */ 188/* Free a lock which is not in use. */
171static void locks_free_lock(struct file_lock *fl) 189void locks_free_lock(struct file_lock *fl)
172{ 190{
173 BUG_ON(waitqueue_active(&fl->fl_wait)); 191 BUG_ON(waitqueue_active(&fl->fl_wait));
174 BUG_ON(!list_empty(&fl->fl_block)); 192 BUG_ON(!list_empty(&fl->fl_block));
@@ -177,6 +195,7 @@ static void locks_free_lock(struct file_lock *fl)
177 locks_release_private(fl); 195 locks_release_private(fl);
178 kmem_cache_free(filelock_cache, fl); 196 kmem_cache_free(filelock_cache, fl);
179} 197}
198EXPORT_SYMBOL(locks_free_lock);
180 199
181void locks_init_lock(struct file_lock *fl) 200void locks_init_lock(struct file_lock *fl)
182{ 201{
@@ -216,11 +235,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
216 fl->fl_ops->fl_copy_lock(new, fl); 235 fl->fl_ops->fl_copy_lock(new, fl);
217 new->fl_ops = fl->fl_ops; 236 new->fl_ops = fl->fl_ops;
218 } 237 }
219 if (fl->fl_lmops) { 238 if (fl->fl_lmops)
220 if (fl->fl_lmops->fl_copy_lock)
221 fl->fl_lmops->fl_copy_lock(new, fl);
222 new->fl_lmops = fl->fl_lmops; 239 new->fl_lmops = fl->fl_lmops;
223 }
224} 240}
225 241
226/* 242/*
@@ -511,9 +527,9 @@ static void __locks_delete_block(struct file_lock *waiter)
511 */ 527 */
512static void locks_delete_block(struct file_lock *waiter) 528static void locks_delete_block(struct file_lock *waiter)
513{ 529{
514 lock_kernel(); 530 lock_flocks();
515 __locks_delete_block(waiter); 531 __locks_delete_block(waiter);
516 unlock_kernel(); 532 unlock_flocks();
517} 533}
518 534
519/* Insert waiter into blocker's block list. 535/* Insert waiter into blocker's block list.
@@ -644,7 +660,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
644{ 660{
645 struct file_lock *cfl; 661 struct file_lock *cfl;
646 662
647 lock_kernel(); 663 lock_flocks();
648 for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { 664 for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
649 if (!IS_POSIX(cfl)) 665 if (!IS_POSIX(cfl))
650 continue; 666 continue;
@@ -657,7 +673,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
657 fl->fl_pid = pid_vnr(cfl->fl_nspid); 673 fl->fl_pid = pid_vnr(cfl->fl_nspid);
658 } else 674 } else
659 fl->fl_type = F_UNLCK; 675 fl->fl_type = F_UNLCK;
660 unlock_kernel(); 676 unlock_flocks();
661 return; 677 return;
662} 678}
663EXPORT_SYMBOL(posix_test_lock); 679EXPORT_SYMBOL(posix_test_lock);
@@ -730,18 +746,16 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
730 int error = 0; 746 int error = 0;
731 int found = 0; 747 int found = 0;
732 748
733 lock_kernel(); 749 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
734 if (request->fl_flags & FL_ACCESS)
735 goto find_conflict;
736
737 if (request->fl_type != F_UNLCK) {
738 error = -ENOMEM;
739 new_fl = locks_alloc_lock(); 750 new_fl = locks_alloc_lock();
740 if (new_fl == NULL) 751 if (!new_fl)
741 goto out; 752 return -ENOMEM;
742 error = 0;
743 } 753 }
744 754
755 lock_flocks();
756 if (request->fl_flags & FL_ACCESS)
757 goto find_conflict;
758
745 for_each_lock(inode, before) { 759 for_each_lock(inode, before) {
746 struct file_lock *fl = *before; 760 struct file_lock *fl = *before;
747 if (IS_POSIX(fl)) 761 if (IS_POSIX(fl))
@@ -767,8 +781,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
767 * If a higher-priority process was blocked on the old file lock, 781 * If a higher-priority process was blocked on the old file lock,
768 * give it the opportunity to lock the file. 782 * give it the opportunity to lock the file.
769 */ 783 */
770 if (found) 784 if (found) {
785 unlock_flocks();
771 cond_resched(); 786 cond_resched();
787 lock_flocks();
788 }
772 789
773find_conflict: 790find_conflict:
774 for_each_lock(inode, before) { 791 for_each_lock(inode, before) {
@@ -794,7 +811,7 @@ find_conflict:
794 error = 0; 811 error = 0;
795 812
796out: 813out:
797 unlock_kernel(); 814 unlock_flocks();
798 if (new_fl) 815 if (new_fl)
799 locks_free_lock(new_fl); 816 locks_free_lock(new_fl);
800 return error; 817 return error;
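
The reshuffle at the top of flock_lock_file() follows from the same no-sleeping rule: locks_alloc_lock() allocates with GFP_KERNEL and may sleep, so it must now run before lock_flocks() is taken, with any unused allocation returned afterwards. In outline, with hypothetical foo_* names:

static int foo_lock_file(struct inode *inode, int need_new_lock)
{
	struct file_lock *new_fl = NULL;

	if (need_new_lock) {
		new_fl = locks_alloc_lock();	/* GFP_KERNEL: may sleep */
		if (!new_fl)
			return -ENOMEM;
	}

	lock_flocks();				/* atomic from here on */
	/* ... scan inode->i_flock; if new_fl is linked in, NULL it ... */
	unlock_flocks();

	if (new_fl)
		locks_free_lock(new_fl);	/* not used: give it back */
	return 0;
}
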
@@ -823,7 +840,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
823 new_fl2 = locks_alloc_lock(); 840 new_fl2 = locks_alloc_lock();
824 } 841 }
825 842
826 lock_kernel(); 843 lock_flocks();
827 if (request->fl_type != F_UNLCK) { 844 if (request->fl_type != F_UNLCK) {
828 for_each_lock(inode, before) { 845 for_each_lock(inode, before) {
829 fl = *before; 846 fl = *before;
@@ -991,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
991 locks_wake_up_blocks(left); 1008 locks_wake_up_blocks(left);
992 } 1009 }
993 out: 1010 out:
994 unlock_kernel(); 1011 unlock_flocks();
995 /* 1012 /*
996 * Free any unused locks. 1013 * Free any unused locks.
997 */ 1014 */
@@ -1066,14 +1083,14 @@ int locks_mandatory_locked(struct inode *inode)
1066 /* 1083 /*
1067 * Search the lock list for this inode for any POSIX locks. 1084 * Search the lock list for this inode for any POSIX locks.
1068 */ 1085 */
1069 lock_kernel(); 1086 lock_flocks();
1070 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1087 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1071 if (!IS_POSIX(fl)) 1088 if (!IS_POSIX(fl))
1072 continue; 1089 continue;
1073 if (fl->fl_owner != owner) 1090 if (fl->fl_owner != owner)
1074 break; 1091 break;
1075 } 1092 }
1076 unlock_kernel(); 1093 unlock_flocks();
1077 return fl ? -EAGAIN : 0; 1094 return fl ? -EAGAIN : 0;
1078} 1095}
1079 1096
@@ -1186,7 +1203,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 
 	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
 
-	lock_kernel();
+	lock_flocks();
 
 	time_out_leases(inode);
 
@@ -1247,8 +1264,10 @@ restart:
 			break_time++;
 	}
 	locks_insert_block(flock, new_fl);
+	unlock_flocks();
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
+	lock_flocks();
 	__locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
@@ -1263,7 +1282,7 @@ restart:
 	}
 
 out:
-	unlock_kernel();
+	unlock_flocks();
 	if (!IS_ERR(new_fl))
 		locks_free_lock(new_fl);
 	return error;
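__break_lease() shows the other recurring shape of this conversion: the lock is dropped explicitly around the sleeping wait and re-taken afterwards, after which the state must be re-validated (the function loops back through restart). Under the BKL this was implicit, since the BKL was silently released whenever the task slept. Schematically, with the names from the hunk above:

	locks_insert_block(flock, new_fl);
	unlock_flocks();			/* cannot sleep under a spinlock */
	error = wait_event_interruptible_timeout(new_fl->fl_wait,
						 !new_fl->fl_next, break_time);
	lock_flocks();				/* re-acquire, then re-check state */
	__locks_delete_block(new_fl);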
@@ -1319,7 +1338,7 @@ int fcntl_getlease(struct file *filp)
 	struct file_lock *fl;
 	int type = F_UNLCK;
 
-	lock_kernel();
+	lock_flocks();
 	time_out_leases(filp->f_path.dentry->d_inode);
 	for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
@@ -1328,7 +1347,7 @@ int fcntl_getlease(struct file *filp)
 			break;
 		}
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return type;
 }
 
@@ -1341,36 +1360,32 @@ int fcntl_getlease(struct file *filp)
  * The (input) flp->fl_lmops->fl_break function is required
  * by break_lease().
  *
- * Called with kernel lock held.
+ * Called with file_lock_lock held.
  */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
-	struct file_lock *new_fl = NULL;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	int error, rdlease_count = 0, wrlease_count = 0;
 
+	lease = *flp;
+
+	error = -EACCES;
 	if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
-		return -EACCES;
+		goto out;
+	error = -EINVAL;
 	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
+		goto out;
 	error = security_file_lock(filp, arg);
 	if (error)
-		return error;
+		goto out;
 
 	time_out_leases(inode);
 
 	BUG_ON(!(*flp)->fl_lmops->fl_break);
 
-	lease = *flp;
-
 	if (arg != F_UNLCK) {
-		error = -ENOMEM;
-		new_fl = locks_alloc_lock();
-		if (new_fl == NULL)
-			goto out;
-
 		error = -EAGAIN;
 		if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 			goto out;
@@ -1410,12 +1425,12 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 		goto out;
 
 	if (my_before != NULL) {
-		*flp = *my_before;
 		error = lease->fl_lmops->fl_change(my_before, arg);
+		if (!error)
+			*flp = *my_before;
 		goto out;
 	}
 
-	error = 0;
 	if (arg == F_UNLCK)
 		goto out;
 
@@ -1423,20 +1438,23 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	if (!leases_enable)
 		goto out;
 
-	locks_copy_lock(new_fl, lease);
-	locks_insert_lock(before, new_fl);
-
-	*flp = new_fl;
+	locks_insert_lock(before, lease);
 	return 0;
 
 out:
-	if (new_fl != NULL)
-		locks_free_lock(new_fl);
 	return error;
 }
 EXPORT_SYMBOL(generic_setlease);
 
-/**
+static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
+{
+	if (filp->f_op && filp->f_op->setlease)
+		return filp->f_op->setlease(filp, arg, lease);
+	else
+		return generic_setlease(filp, arg, lease);
+}
+
+/**
  * vfs_setlease - sets a lease on an open file
  * @filp: file pointer
 * @arg: type of lease to obtain
@@ -1467,60 +1485,90 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
 	int error;
 
-	lock_kernel();
-	if (filp->f_op && filp->f_op->setlease)
-		error = filp->f_op->setlease(filp, arg, lease);
-	else
-		error = generic_setlease(filp, arg, lease);
-	unlock_kernel();
+	lock_flocks();
+	error = __vfs_setlease(filp, arg, lease);
+	unlock_flocks();
 
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_setlease);
 
-/**
- * fcntl_setlease - sets a lease on an open file
- * @fd: open file descriptor
- * @filp: file pointer
- * @arg: type of lease to obtain
- *
- * Call this fcntl to establish a lease on the file.
- * Note that you also need to call %F_SETSIG to
- * receive a signal when the lease is broken.
- */
-int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+static int do_fcntl_delete_lease(struct file *filp)
 {
 	struct file_lock fl, *flp = &fl;
+
+	lease_init(filp, F_UNLCK, flp);
+
+	return vfs_setlease(filp, F_UNLCK, &flp);
+}
+
+static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
+{
+	struct file_lock *fl;
+	struct fasync_struct *new;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int error;
 
-	locks_init_lock(&fl);
-	error = lease_init(filp, arg, &fl);
-	if (error)
-		return error;
+	fl = lease_alloc(filp, arg);
+	if (IS_ERR(fl))
+		return PTR_ERR(fl);
 
-	lock_kernel();
+	new = fasync_alloc();
+	if (!new) {
+		locks_free_lock(fl);
+		return -ENOMEM;
+	}
+	lock_flocks();
+	error = __vfs_setlease(filp, arg, &fl);
+	if (error) {
+		unlock_flocks();
+		locks_free_lock(fl);
+		goto out_free_fasync;
+	}
 
-	error = vfs_setlease(filp, arg, &flp);
-	if (error || arg == F_UNLCK)
-		goto out_unlock;
+	/*
+	 * fasync_insert_entry() returns the old entry if any.
+	 * If there was no old entry, then it used 'new' and
+	 * inserted it into the fasync list. Clear new so that
+	 * we don't release it here.
+	 */
+	if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
+		new = NULL;
 
-	error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
 	if (error < 0) {
 		/* remove lease just inserted by setlease */
-		flp->fl_type = F_UNLCK | F_INPROGRESS;
-		flp->fl_break_time = jiffies - 10;
+		fl->fl_type = F_UNLCK | F_INPROGRESS;
+		fl->fl_break_time = jiffies - 10;
 		time_out_leases(inode);
-		goto out_unlock;
+	} else {
+		error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 	}
+	unlock_flocks();
 
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-out_unlock:
-	unlock_kernel();
+out_free_fasync:
+	if (new)
+		fasync_free(new);
 	return error;
 }
 
 /**
+ * fcntl_setlease - sets a lease on an open file
+ * @fd: open file descriptor
+ * @filp: file pointer
+ * @arg: type of lease to obtain
+ *
+ * Call this fcntl to establish a lease on the file.
+ * Note that you also need to call %F_SETSIG to
+ * receive a signal when the lease is broken.
+ */
+int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+{
+	if (arg == F_UNLCK)
+		return do_fcntl_delete_lease(filp);
+	return do_fcntl_add_lease(fd, filp, arg);
+}
+
+/**
 * flock_lock_file_wait - Apply a FLOCK-style lock to a file
 * @filp: The file to apply the lock to
 * @fl: The lock to be applied
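The rewritten lease path also pre-allocates the fasync entry with fasync_alloc() before taking the lock, inserts it with fasync_insert_entry() while locked, and frees the unused copy with fasync_free() afterwards. A condensed sketch of that protocol (error handling trimmed relative to the code above):

	struct fasync_struct *new = fasync_alloc();	/* sleeping allocation */
	if (!new)
		return -ENOMEM;

	lock_flocks();
	if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
		new = NULL;		/* consumed: it was linked into the list */
	unlock_flocks();

	if (new)
		fasync_free(new);	/* an entry already existed; drop the spare */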
@@ -2020,7 +2068,7 @@ void locks_remove_flock(struct file *filp)
 		fl.fl_ops->fl_release_private(&fl);
 	}
 
-	lock_kernel();
+	lock_flocks();
 	before = &inode->i_flock;
 
 	while ((fl = *before) != NULL) {
@@ -2038,7 +2086,7 @@ void locks_remove_flock(struct file *filp)
 		}
 		before = &fl->fl_next;
 	}
-	unlock_kernel();
+	unlock_flocks();
 }
 
 /**
@@ -2053,12 +2101,12 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
 {
 	int status = 0;
 
-	lock_kernel();
+	lock_flocks();
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
 	else
 		status = -ENOENT;
-	unlock_kernel();
+	unlock_flocks();
 	return status;
 }
 
@@ -2085,7 +2133,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/seq_file.h>
 
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
-							int id, char *pfx)
+							loff_t id, char *pfx)
 {
 	struct inode *inode = NULL;
 	unsigned int fl_pid;
@@ -2098,7 +2146,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	if (fl->fl_file != NULL)
 		inode = fl->fl_file->f_path.dentry->d_inode;
 
-	seq_printf(f, "%d:%s ", id, pfx);
+	seq_printf(f, "%lld:%s ", id, pfx);
 	if (IS_POSIX(fl)) {
 		seq_printf(f, "%6s %s ",
 			     (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
@@ -2161,30 +2209,33 @@ static int locks_show(struct seq_file *f, void *v)
 
 	fl = list_entry(v, struct file_lock, fl_link);
 
-	lock_get_status(f, fl, (long)f->private, "");
+	lock_get_status(f, fl, *((loff_t *)f->private), "");
 
 	list_for_each_entry(bfl, &fl->fl_block, fl_block)
-		lock_get_status(f, bfl, (long)f->private, " ->");
+		lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
 
-	f->private++;
 	return 0;
 }
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
-	lock_kernel();
-	f->private = (void *)1;
+	loff_t *p = f->private;
+
+	lock_flocks();
+	*p = (*pos + 1);
 	return seq_list_start(&file_lock_list, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
+	loff_t *p = f->private;
+	++*p;
 	return seq_list_next(v, &file_lock_list, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 {
-	unlock_kernel();
+	unlock_flocks();
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2196,14 +2247,14 @@ static const struct seq_operations locks_seq_operations = {
 
 static int locks_open(struct inode *inode, struct file *filp)
 {
-	return seq_open(filp, &locks_seq_operations);
+	return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
 }
 
 static const struct file_operations proc_locks_operations = {
 	.open		= locks_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 
 static int __init proc_locks_init(void)
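The /proc/locks cursor also changes representation: the old code abused the f->private pointer itself as an integer line counter, which is fragile once a traversal restarts partway through a read. The new code lets seq_open_private() allocate a real loff_t for it, released again by seq_release_private(). The pairing is the important part — a sketch of the two ends of the contract (names with "example" are illustrative):

	static int example_open(struct inode *inode, struct file *filp)
	{
		/* allocates a zeroed sizeof(loff_t) area at f->private */
		return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
	}

	static const struct file_operations example_fops = {
		.open		= example_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= seq_release_private,	/* frees f->private again */
	};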
@@ -2231,7 +2282,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if (fl->fl_type == F_RDLCK)
@@ -2248,7 +2299,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 			result = 0;
 			break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return result;
 }
 
@@ -2271,7 +2322,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2286,7 +2337,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 			result = 0;
 			break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return result;
 }
 
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9bd2ce2a304..92ca6fbe09b 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
 	return sync_request(page, bdev, WRITE);
 }
 
-static void bdev_put_device(struct super_block *sb)
+static void bdev_put_device(struct logfs_super *s)
 {
-	close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
+	close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
 }
 
 static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -320,8 +320,8 @@ static const struct logfs_device_ops bd_devops = {
 	.put_device	= bdev_put_device,
 };
 
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
-		const char *devname, struct vfsmount *mnt)
+int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
+		const char *devname)
 {
 	struct block_device *bdev;
 
@@ -332,8 +332,11 @@ int logfs_get_sb_bdev(struct file_system_type *type, int flags,
 	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
 		int mtdnr = MINOR(bdev->bd_dev);
 		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
-		return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+		return logfs_get_sb_mtd(p, mtdnr);
 	}
 
-	return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
+	p->s_bdev = bdev;
+	p->s_mtd = NULL;
+	p->s_devops = &bd_devops;
+	return 0;
}
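The control flow around device attachment is inverted here: instead of each device back end calling into the generic mount path (and needing flags and a vfsmount), it only binds the device to a caller-provided logfs_super and returns. A sketch of how a caller now strings the pieces together (error handling elided; the real sequence is in fs/logfs/super.c further down):

	struct dentry *dentry;
	struct logfs_super *super = kzalloc(sizeof(*super), GFP_KERNEL);
	int err = logfs_get_sb_bdev(super, type, devname);	/* fills s_bdev/s_mtd/s_devops */

	if (!err)
		dentry = logfs_get_sb_device(super, type, flags);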
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index a85d47d13e4..7466e9dcc8c 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
 	__mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
 }
 
-static void mtd_put_device(struct super_block *sb)
+static void mtd_put_device(struct logfs_super *s)
 {
-	put_mtd_device(logfs_super(sb)->s_mtd);
+	put_mtd_device(s->s_mtd);
 }
 
 static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
@@ -265,14 +265,14 @@ static const struct logfs_device_ops mtd_devops = {
 	.put_device	= mtd_put_device,
 };
 
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
-		int mtdnr, struct vfsmount *mnt)
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 {
-	struct mtd_info *mtd;
-	const struct logfs_device_ops *devops = &mtd_devops;
-
-	mtd = get_mtd_device(NULL, mtdnr);
+	struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
 	if (IS_ERR(mtd))
 		return PTR_ERR(mtd);
-	return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
+
+	s->s_bdev = NULL;
+	s->s_mtd = mtd;
+	s->s_devops = &mtd_devops;
+	return 0;
 }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 72d1893ddd3..409dfd65e9a 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -434,8 +434,11 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
 	int ret;
 
 	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
-	if (!ta)
+	if (!ta) {
+		inode->i_nlink--;
+		iput(inode);
 		return -ENOMEM;
+	}
 
 	ta->state = CREATE_1;
 	ta->ino = inode->i_ino;
@@ -566,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
 		return -EMLINK;
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	inode->i_nlink++;
 	mark_inode_dirty_sync(inode);
 
@@ -821,7 +824,8 @@ const struct inode_operations logfs_dir_iops = {
 };
 const struct file_operations logfs_dir_fops = {
 	.fsync			= logfs_fsync,
-	.ioctl			= logfs_ioctl,
+	.unlocked_ioctl		= logfs_ioctl,
 	.readdir		= logfs_readdir,
 	.read			= generic_read_dir,
+	.llseek			= default_llseek,
 };
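Three small migrations in this file: the error path of __logfs_create() now drops the half-created inode instead of leaking it; ihold() replaces the open-coded atomic_inc(&inode->i_count) as the helper for taking an extra inode reference; and with the BKL-era implicit default gone, the directory file_operations names both .unlocked_ioctl and an explicit .llseek (default_llseek). The link path keeps the usual shape — roughly, as a sketch:

	/* common ->link pattern with the ihold() helper */
	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	ihold(inode);			/* reference for the new dentry */
	inode->i_nlink++;
	mark_inode_dirty_sync(inode);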
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index abe1cafbd4c..e86376b87af 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -181,9 +181,9 @@ static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
 }
 
 
-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		unsigned long arg)
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct logfs_inode *li = logfs_inode(inode);
 	unsigned int oldflags, flags;
 	int err;
@@ -232,15 +232,19 @@ static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = dentry->d_inode;
 	int err = 0;
 
-	if (attr->ia_valid & ATTR_SIZE)
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (attr->ia_valid & ATTR_SIZE) {
 		err = logfs_truncate(inode, attr->ia_size);
-	attr->ia_valid &= ~ATTR_SIZE;
+		if (err)
+			return err;
+	}
 
-	if (!err)
-		err = inode_change_ok(inode, attr);
-	if (!err)
-		err = inode_setattr(inode, attr);
-	return err;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations logfs_reg_iops = {
@@ -251,7 +255,7 @@ const struct file_operations logfs_reg_fops = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.fsync		= logfs_fsync,
-	.ioctl		= logfs_ioctl,
+	.unlocked_ioctl	= logfs_ioctl,
 	.llseek		= generic_file_llseek,
 	.mmap		= generic_file_readonly_mmap,
 	.open		= generic_file_open,
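logfs_setattr() above adopts the idiom that replaced inode_setattr(): validate first with inode_change_ok(), handle ATTR_SIZE yourself, then copy the remaining attributes with setattr_copy() and mark the inode dirty. The generic shape, as a sketch (the truncate step is filesystem-specific):

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int err = inode_change_ok(inode, attr);	/* permission/validity checks */

		if (err)
			return err;

		if (attr->ia_valid & ATTR_SIZE) {
			/* filesystem-specific size change goes here */
		}

		setattr_copy(inode, attr);	/* times, mode, uid/gid, ... */
		mark_inode_dirty(inode);
		return 0;
	}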
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index f602e230e16..d8c71ece098 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -235,33 +235,21 @@ static struct inode *logfs_alloc_inode(struct super_block *sb)
 * purpose is to create a new inode that will not trigger the warning if such
 * an inode is still in use. An ugly hack, no doubt. Suggections for
 * improvement are welcome.
+ *
+ * AV: that's what ->put_super() is for...
 */
 struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
 {
 	struct inode *inode;
 
-	inode = logfs_alloc_inode(sb);
+	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
 	inode->i_mode = S_IFREG;
 	inode->i_ino = ino;
-	inode->i_sb = sb;
-
-	/* This is a blatant copy of alloc_inode code. We'd need alloc_inode
-	 * to be nonstatic, alas. */
-	{
-		struct address_space * const mapping = &inode->i_data;
-
-		mapping->a_ops = &logfs_reg_aops;
-		mapping->host = inode;
-		mapping->flags = 0;
-		mapping_set_gfp_mask(mapping, GFP_NOFS);
-		mapping->assoc_mapping = NULL;
-		mapping->backing_dev_info = &default_backing_dev_info;
-		inode->i_mapping = mapping;
-		inode->i_nlink = 1;
-	}
+	inode->i_data.a_ops = &logfs_reg_aops;
+	mapping_set_gfp_mask(&inode->i_data, GFP_NOFS);
 
 	return inode;
 }
@@ -277,7 +265,7 @@ struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
 
 	err = logfs_read_inode(inode);
 	if (err) {
-		destroy_meta_inode(inode);
+		iput(inode);
 		return ERR_PTR(err);
 	}
 	logfs_inode_setops(inode);
@@ -298,18 +286,8 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-void destroy_meta_inode(struct inode *inode)
-{
-	if (inode) {
-		if (inode->i_data.nrpages)
-			truncate_inode_pages(&inode->i_data, 0);
-		logfs_clear_inode(inode);
-		kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
-	}
-}
-
 /* called with inode_lock held */
-static void logfs_drop_inode(struct inode *inode)
+static int logfs_drop_inode(struct inode *inode)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
 	struct logfs_inode *li = logfs_inode(inode);
@@ -317,7 +295,7 @@ static void logfs_drop_inode(struct inode *inode)
 	spin_lock(&logfs_inode_lock);
 	list_move(&li->li_freeing_list, &super->s_freeing_list);
 	spin_unlock(&logfs_inode_lock);
-	generic_drop_inode(inode);
+	return generic_drop_inode(inode);
 }
 
 static void logfs_set_ino_generation(struct super_block *sb,
@@ -384,12 +362,21 @@ static int logfs_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static void logfs_put_super(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	/* kill the meta-inodes */
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
+}
+
 const struct super_operations logfs_super_operations = {
 	.alloc_inode	= logfs_alloc_inode,
-	.clear_inode	= logfs_clear_inode,
-	.delete_inode	= logfs_delete_inode,
 	.destroy_inode	= logfs_destroy_inode,
+	.evict_inode	= logfs_evict_inode,
 	.drop_inode	= logfs_drop_inode,
+	.put_super	= logfs_put_super,
 	.write_inode	= logfs_write_inode,
 	.statfs		= logfs_statfs,
 	.sync_fs	= logfs_sync_fs,
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 4b0e0616b35..f46ee8b0e13 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -889,8 +889,6 @@ void logfs_cleanup_journal(struct super_block *sb)
 	struct logfs_super *super = logfs_super(sb);
 
 	btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
-	destroy_meta_inode(super->s_master_inode);
-	super->s_master_inode = NULL;
 
 	kfree(super->s_compressed_je);
 	kfree(super->s_je);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index c838c4d7211..cd51a36b37f 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -136,6 +136,7 @@ struct logfs_area_ops {
 	int	(*erase_segment)(struct logfs_area *area);
 };
 
+struct logfs_super;	/* forward */
 /**
 * struct logfs_device_ops - device access operations
 *
@@ -156,7 +157,7 @@ struct logfs_device_ops {
 			int ensure_write);
 	int (*can_write_buf)(struct super_block *sb, u64 ofs);
 	void (*sync)(struct super_block *sb);
-	void (*put_device)(struct super_block *sb);
+	void (*put_device)(struct logfs_super *s);
 };
 
 /**
@@ -471,11 +472,13 @@ void logfs_compr_exit(void);
 
 /* dev_bdev.c */
 #ifdef CONFIG_BLOCK
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
-		const char *devname, struct vfsmount *mnt);
+int logfs_get_sb_bdev(struct logfs_super *s,
+		struct file_system_type *type,
+		const char *devname);
 #else
-static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
-		const char *devname, struct vfsmount *mnt)
+static inline int logfs_get_sb_bdev(struct logfs_super *s,
+		struct file_system_type *type,
+		const char *devname)
 {
 	return -ENODEV;
 }
@@ -483,11 +486,9 @@ static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
 
 /* dev_mtd.c */
 #ifdef CONFIG_MTD
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
-		int mtdnr, struct vfsmount *mnt);
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
 #else
-static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
-		int mtdnr, struct vfsmount *mnt)
+static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 {
 	return -ENODEV;
 }
@@ -504,8 +505,7 @@ extern const struct inode_operations logfs_reg_iops;
 extern const struct file_operations logfs_reg_fops;
 extern const struct address_space_operations logfs_reg_aops;
 int logfs_readpage(struct file *file, struct page *page);
-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		unsigned long arg);
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 int logfs_fsync(struct file *file, int datasync);
 
 /* gc.c */
@@ -525,13 +525,11 @@ struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
 struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
 int logfs_init_inode_cache(void);
 void logfs_destroy_inode_cache(void);
-void destroy_meta_inode(struct inode *inode);
 void logfs_set_blocks(struct inode *inode, u64 no);
 /* these logically belong into inode.c but actually reside in readwrite.c */
 int logfs_read_inode(struct inode *inode);
 int __logfs_write_inode(struct inode *inode, long flags);
-void logfs_delete_inode(struct inode *inode);
-void logfs_clear_inode(struct inode *inode);
+void logfs_evict_inode(struct inode *inode);
 
 /* journal.c */
 void logfs_write_anchor(struct super_block *sb);
@@ -622,9 +620,6 @@ void emergency_read_end(struct page *page);
 void logfs_crash_dump(struct super_block *sb);
 void *memchr_inv(const void *s, int c, size_t n);
 int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
-int logfs_get_sb_device(struct file_system_type *type, int flags,
-		struct mtd_info *mtd, struct block_device *bdev,
-		const struct logfs_device_ops *devops, struct vfsmount *mnt);
 int logfs_check_ds(struct logfs_disk_super *ds);
 int logfs_write_sb(struct super_block *sb);
 
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 0718d112a1a..6127baf0e18 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1972,31 +1972,6 @@ static struct page *inode_to_page(struct inode *inode)
 	return page;
 }
 
-/* Cheaper version of write_inode. All changes are concealed in
- * aliases, which are moved back. No write to the medium happens.
- */
-void logfs_clear_inode(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct logfs_inode *li = logfs_inode(inode);
-	struct logfs_block *block = li->li_block;
-	struct page *page;
-
-	/* Only deleted files may be dirty at this point */
-	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
-	if (!block)
-		return;
-	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
-		block->ops->free_block(inode->i_sb, block);
-		return;
-	}
-
-	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
-	page = inode_to_page(inode);
-	BUG_ON(!page); /* FIXME: Use emergency page */
-	logfs_put_write_page(page);
-}
-
 static int do_write_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
@@ -2164,18 +2139,40 @@ static int do_delete_inode(struct inode *inode)
 * ZOMBIE inodes have already been deleted before and should remain dead,
 * if it weren't for valid checking. No need to kill them again here.
 */
-void logfs_delete_inode(struct inode *inode)
+void logfs_evict_inode(struct inode *inode)
 {
+	struct super_block *sb = inode->i_sb;
 	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = li->li_block;
+	struct page *page;
 
-	if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
-		li->li_flags |= LOGFS_IF_ZOMBIE;
-		if (i_size_read(inode) > 0)
-			logfs_truncate(inode, 0);
-		do_delete_inode(inode);
+	if (!inode->i_nlink) {
+		if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
+			li->li_flags |= LOGFS_IF_ZOMBIE;
+			if (i_size_read(inode) > 0)
+				logfs_truncate(inode, 0);
+			do_delete_inode(inode);
+		}
 	}
 	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
+	end_writeback(inode);
+
+	/* Cheaper version of write_inode. All changes are concealed in
+	 * aliases, which are moved back. No write to the medium happens.
+	 */
+	/* Only deleted files may be dirty at this point */
+	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
+	if (!block)
+		return;
+	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
+		block->ops->free_block(inode->i_sb, block);
+		return;
+	}
+
+	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
+	page = inode_to_page(inode);
+	BUG_ON(!page); /* FIXME: Use emergency page */
+	logfs_put_write_page(page);
 }
 
 void btree_write_block(struct logfs_block *block)
@@ -2272,7 +2269,6 @@ void logfs_cleanup_rw(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);
 
-	destroy_meta_inode(super->s_segfile_inode);
 	logfs_mempool_destroy(super->s_block_pool);
 	logfs_mempool_destroy(super->s_shadow_pool);
 }
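This is the standard ->evict_inode() conversion: the old ->delete_inode()/->clear_inode() pair collapses into a single method that runs for every inode leaving memory, with !inode->i_nlink distinguishing real deletion from plain eviction. The obligatory skeleton the logfs version follows — a sketch:

	static void example_evict_inode(struct inode *inode)
	{
		if (!inode->i_nlink) {
			/* on-disk deletion work (old ->delete_inode part) */
		}
		truncate_inode_pages(&inode->i_data, 0);
		end_writeback(inode);	/* detach from writeback; inode is now dead */
		/* tear down in-memory private state (old ->clear_inode part) */
	}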
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index a9657afb70a..9d518735325 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -929,5 +929,4 @@ void logfs_cleanup_areas(struct super_block *sb)
 	for_each_area(i)
 		free_area(super->s_area[i]);
 	free_area(super->s_journal_area);
-	destroy_meta_inode(super->s_mapping_inode);
 }
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index d651e10a1e9..33435e4b14d 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb)
 	return 0;
 }
 
-static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
+static int logfs_get_sb_final(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct inode *rootdir;
@@ -342,24 +342,26 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 		goto fail;
 	}
 
+	/* at that point we know that ->put_super() will be called */
 	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
 	if (!super->s_erase_page)
-		goto fail;
+		return -ENOMEM;
 	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
 
 	/* FIXME: check for read-only mounts */
 	err = logfs_make_writeable(sb);
-	if (err)
-		goto fail1;
+	if (err) {
+		__free_page(super->s_erase_page);
+		return err;
+	}
 
 	log_super("LogFS: Finished mounting\n");
-	simple_set_mnt(mnt, sb);
 	return 0;
 
-fail1:
-	__free_page(super->s_erase_page);
 fail:
-	iput(logfs_super(sb)->s_master_inode);
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
 	return -EIO;
 }
 
@@ -526,43 +528,37 @@ static void logfs_kill_sb(struct super_block *sb)
 	logfs_cleanup_rw(sb);
 	if (super->s_erase_page)
 		__free_page(super->s_erase_page);
-	super->s_devops->put_device(sb);
+	super->s_devops->put_device(super);
 	logfs_mempool_destroy(super->s_btree_pool);
 	logfs_mempool_destroy(super->s_alias_pool);
 	kfree(super);
 	log_super("LogFS: Finished unmounting\n");
 }
 
-int logfs_get_sb_device(struct file_system_type *type, int flags,
-		struct mtd_info *mtd, struct block_device *bdev,
-		const struct logfs_device_ops *devops, struct vfsmount *mnt)
+static struct dentry *logfs_get_sb_device(struct logfs_super *super,
+		struct file_system_type *type, int flags)
 {
-	struct logfs_super *super;
 	struct super_block *sb;
 	int err = -ENOMEM;
 	static int mount_count;
 
 	log_super("LogFS: Start mount %x\n", mount_count++);
-	super = kzalloc(sizeof(*super), GFP_KERNEL);
-	if (!super)
-		goto err0;
 
-	super->s_mtd = mtd;
-	super->s_bdev = bdev;
 	err = -EINVAL;
 	sb = sget(type, logfs_sb_test, logfs_sb_set, super);
-	if (IS_ERR(sb))
-		goto err0;
+	if (IS_ERR(sb)) {
+		super->s_devops->put_device(super);
+		kfree(super);
+		return ERR_CAST(sb);
+	}
 
 	if (sb->s_root) {
 		/* Device is already in use */
-		err = 0;
-		simple_set_mnt(mnt, sb);
-		goto err0;
+		super->s_devops->put_device(super);
+		kfree(super);
+		return dget(sb->s_root);
 	}
 
-	super->s_devops = devops;
-
 	/*
 	 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
 	 * only covers 16TB and the upper 8TB are used for indirect blocks.
@@ -578,44 +574,58 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
 		goto err1;
 
 	sb->s_flags |= MS_ACTIVE;
-	err = logfs_get_sb_final(sb, mnt);
-	if (err)
-		goto err1;
-	return 0;
+	err = logfs_get_sb_final(sb);
+	if (err) {
+		deactivate_locked_super(sb);
+		return ERR_PTR(err);
+	}
+	return dget(sb->s_root);
 
 err1:
+	/* no ->s_root, no ->put_super() */
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
 	deactivate_locked_super(sb);
-	return err;
-err0:
-	kfree(super);
-	//devops->put_device(sb);
-	return err;
+	return ERR_PTR(err);
 }
 
-static int logfs_get_sb(struct file_system_type *type, int flags,
-		const char *devname, void *data, struct vfsmount *mnt)
+static struct dentry *logfs_mount(struct file_system_type *type, int flags,
+		const char *devname, void *data)
 {
 	ulong mtdnr;
+	struct logfs_super *super;
+	int err;
 
-	if (!devname)
-		return logfs_get_sb_bdev(type, flags, devname, mnt);
-	if (strncmp(devname, "mtd", 3))
-		return logfs_get_sb_bdev(type, flags, devname, mnt);
+	super = kzalloc(sizeof(*super), GFP_KERNEL);
+	if (!super)
+		return ERR_PTR(-ENOMEM);
 
-	{
+	if (!devname)
+		err = logfs_get_sb_bdev(super, type, devname);
+	else if (strncmp(devname, "mtd", 3))
+		err = logfs_get_sb_bdev(super, type, devname);
+	else {
 		char *garbage;
 		mtdnr = simple_strtoul(devname+3, &garbage, 0);
 		if (*garbage)
-			return -EINVAL;
+			err = -EINVAL;
+		else
+			err = logfs_get_sb_mtd(super, mtdnr);
+	}
+
+	if (err) {
+		kfree(super);
+		return ERR_PTR(err);
 	}
 
-	return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+	return logfs_get_sb_device(super, type, flags);
 }
 
 static struct file_system_type logfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "logfs",
-	.get_sb		= logfs_get_sb,
+	.mount		= logfs_mount,
 	.kill_sb	= logfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 
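logfs is also switched from ->get_sb to the then-new ->mount convention: the method returns the root dentry (or an ERR_PTR) and no longer touches a vfsmount. Stripped of the logfs specifics, the shape is — a sketch, with test_fn/set_fn/priv as stand-ins for filesystem-specific arguments:

	static struct dentry *example_mount(struct file_system_type *type,
					    int flags, const char *dev, void *data)
	{
		struct super_block *sb = sget(type, test_fn, set_fn, priv);

		if (IS_ERR(sb))
			return ERR_CAST(sb);
		if (!sb->s_root) {
			/* first mount: fill the superblock, set MS_ACTIVE */
		}
		return dget(sb->s_root);	/* reference handed to the VFS */
	}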
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e28f21b9534..93444747237 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -79,15 +79,12 @@ EXPORT_SYMBOL(mb_cache_entry_find_next);
79struct mb_cache { 79struct mb_cache {
80 struct list_head c_cache_list; 80 struct list_head c_cache_list;
81 const char *c_name; 81 const char *c_name;
82 struct mb_cache_op c_op;
83 atomic_t c_entry_count; 82 atomic_t c_entry_count;
83 int c_max_entries;
84 int c_bucket_bits; 84 int c_bucket_bits;
85#ifndef MB_CACHE_INDEXES_COUNT 85 struct kmem_cache *c_entry_cache;
86 int c_indexes_count;
87#endif
88 struct kmem_cache *c_entry_cache;
89 struct list_head *c_block_hash; 86 struct list_head *c_block_hash;
90 struct list_head *c_indexes_hash[0]; 87 struct list_head *c_index_hash;
91}; 88};
92 89
93 90
@@ -101,16 +98,6 @@ static LIST_HEAD(mb_cache_list);
101static LIST_HEAD(mb_cache_lru_list); 98static LIST_HEAD(mb_cache_lru_list);
102static DEFINE_SPINLOCK(mb_cache_spinlock); 99static DEFINE_SPINLOCK(mb_cache_spinlock);
103 100
104static inline int
105mb_cache_indexes(struct mb_cache *cache)
106{
107#ifdef MB_CACHE_INDEXES_COUNT
108 return MB_CACHE_INDEXES_COUNT;
109#else
110 return cache->c_indexes_count;
111#endif
112}
113
114/* 101/*
115 * What the mbcache registers as to get shrunk dynamically. 102 * What the mbcache registers as to get shrunk dynamically.
116 */ 103 */
@@ -132,12 +119,9 @@ __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
132static void 119static void
133__mb_cache_entry_unhash(struct mb_cache_entry *ce) 120__mb_cache_entry_unhash(struct mb_cache_entry *ce)
134{ 121{
135 int n;
136
137 if (__mb_cache_entry_is_hashed(ce)) { 122 if (__mb_cache_entry_is_hashed(ce)) {
138 list_del_init(&ce->e_block_list); 123 list_del_init(&ce->e_block_list);
139 for (n=0; n<mb_cache_indexes(ce->e_cache); n++) 124 list_del(&ce->e_index.o_list);
140 list_del(&ce->e_indexes[n].o_list);
141 } 125 }
142} 126}
143 127
@@ -148,16 +132,8 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
148 struct mb_cache *cache = ce->e_cache; 132 struct mb_cache *cache = ce->e_cache;
149 133
150 mb_assert(!(ce->e_used || ce->e_queued)); 134 mb_assert(!(ce->e_used || ce->e_queued));
151 if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) { 135 kmem_cache_free(cache->c_entry_cache, ce);
152 /* free failed -- put back on the lru list 136 atomic_dec(&cache->c_entry_count);
153 for freeing later. */
154 spin_lock(&mb_cache_spinlock);
155 list_add(&ce->e_lru_list, &mb_cache_lru_list);
156 spin_unlock(&mb_cache_spinlock);
157 } else {
158 kmem_cache_free(cache->c_entry_cache, ce);
159 atomic_dec(&cache->c_entry_count);
160 }
161} 137}
162 138
163 139
@@ -201,22 +177,12 @@ static int
201mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 177mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
202{ 178{
203 LIST_HEAD(free_list); 179 LIST_HEAD(free_list);
204 struct list_head *l, *ltmp; 180 struct mb_cache *cache;
181 struct mb_cache_entry *entry, *tmp;
205 int count = 0; 182 int count = 0;
206 183
207 spin_lock(&mb_cache_spinlock);
208 list_for_each(l, &mb_cache_list) {
209 struct mb_cache *cache =
210 list_entry(l, struct mb_cache, c_cache_list);
211 mb_debug("cache %s (%d)", cache->c_name,
212 atomic_read(&cache->c_entry_count));
213 count += atomic_read(&cache->c_entry_count);
214 }
215 mb_debug("trying to free %d entries", nr_to_scan); 184 mb_debug("trying to free %d entries", nr_to_scan);
216 if (nr_to_scan == 0) { 185 spin_lock(&mb_cache_spinlock);
217 spin_unlock(&mb_cache_spinlock);
218 goto out;
219 }
220 while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { 186 while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
221 struct mb_cache_entry *ce = 187 struct mb_cache_entry *ce =
222 list_entry(mb_cache_lru_list.next, 188 list_entry(mb_cache_lru_list.next,
@@ -224,12 +190,15 @@ mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
224 list_move_tail(&ce->e_lru_list, &free_list); 190 list_move_tail(&ce->e_lru_list, &free_list);
225 __mb_cache_entry_unhash(ce); 191 __mb_cache_entry_unhash(ce);
226 } 192 }
193 list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
194 mb_debug("cache %s (%d)", cache->c_name,
195 atomic_read(&cache->c_entry_count));
196 count += atomic_read(&cache->c_entry_count);
197 }
227 spin_unlock(&mb_cache_spinlock); 198 spin_unlock(&mb_cache_spinlock);
228 list_for_each_safe(l, ltmp, &free_list) { 199 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
229 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 200 __mb_cache_entry_forget(entry, gfp_mask);
230 e_lru_list), gfp_mask);
231 } 201 }
232out:
233 return (count / 100) * sysctl_vfs_cache_pressure; 202 return (count / 100) * sysctl_vfs_cache_pressure;
234} 203}
235 204
@@ -243,72 +212,55 @@ out:
243 * memory was available. 212 * memory was available.
244 * 213 *
245 * @name: name of the cache (informal) 214 * @name: name of the cache (informal)
246 * @cache_op: contains the callback called when freeing a cache entry
247 * @entry_size: The size of a cache entry, including
248 * struct mb_cache_entry
249 * @indexes_count: number of additional indexes in the cache. Must equal
250 * MB_CACHE_INDEXES_COUNT if the number of indexes is
251 * hardwired.
252 * @bucket_bits: log2(number of hash buckets) 215 * @bucket_bits: log2(number of hash buckets)
253 */ 216 */
254struct mb_cache * 217struct mb_cache *
255mb_cache_create(const char *name, struct mb_cache_op *cache_op, 218mb_cache_create(const char *name, int bucket_bits)
256 size_t entry_size, int indexes_count, int bucket_bits)
257{ 219{
258 int m=0, n, bucket_count = 1 << bucket_bits; 220 int n, bucket_count = 1 << bucket_bits;
259 struct mb_cache *cache = NULL; 221 struct mb_cache *cache = NULL;
260 222
261 if(entry_size < sizeof(struct mb_cache_entry) + 223 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
262 indexes_count * sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]))
263 return NULL;
264
265 cache = kmalloc(sizeof(struct mb_cache) +
266 indexes_count * sizeof(struct list_head), GFP_KERNEL);
267 if (!cache) 224 if (!cache)
268 goto fail; 225 return NULL;
269 cache->c_name = name; 226 cache->c_name = name;
270 cache->c_op.free = NULL;
271 if (cache_op)
272 cache->c_op.free = cache_op->free;
273 atomic_set(&cache->c_entry_count, 0); 227 atomic_set(&cache->c_entry_count, 0);
274 cache->c_bucket_bits = bucket_bits; 228 cache->c_bucket_bits = bucket_bits;
275#ifdef MB_CACHE_INDEXES_COUNT
276 mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
277#else
278 cache->c_indexes_count = indexes_count;
279#endif
280 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), 229 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
281 GFP_KERNEL); 230 GFP_KERNEL);
282 if (!cache->c_block_hash) 231 if (!cache->c_block_hash)
283 goto fail; 232 goto fail;
284 for (n=0; n<bucket_count; n++) 233 for (n=0; n<bucket_count; n++)
285 INIT_LIST_HEAD(&cache->c_block_hash[n]); 234 INIT_LIST_HEAD(&cache->c_block_hash[n]);
286 for (m=0; m<indexes_count; m++) { 235 cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
287 cache->c_indexes_hash[m] = kmalloc(bucket_count * 236 GFP_KERNEL);
288 sizeof(struct list_head), 237 if (!cache->c_index_hash)
289 GFP_KERNEL); 238 goto fail;
290 if (!cache->c_indexes_hash[m]) 239 for (n=0; n<bucket_count; n++)
291 goto fail; 240 INIT_LIST_HEAD(&cache->c_index_hash[n]);
292 for (n=0; n<bucket_count; n++) 241 cache->c_entry_cache = kmem_cache_create(name,
293 INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]); 242 sizeof(struct mb_cache_entry), 0,
294 }
295 cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
296 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 243 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
297 if (!cache->c_entry_cache) 244 if (!cache->c_entry_cache)
298 goto fail; 245 goto fail2;
246
247 /*
248 * Set an upper limit on the number of cache entries so that the hash
249 * chains won't grow too long.
250 */
251 cache->c_max_entries = bucket_count << 4;
299 252
300 spin_lock(&mb_cache_spinlock); 253 spin_lock(&mb_cache_spinlock);
301 list_add(&cache->c_cache_list, &mb_cache_list); 254 list_add(&cache->c_cache_list, &mb_cache_list);
302 spin_unlock(&mb_cache_spinlock); 255 spin_unlock(&mb_cache_spinlock);
303 return cache; 256 return cache;
304 257
258fail2:
259 kfree(cache->c_index_hash);
260
305fail: 261fail:
306 if (cache) { 262 kfree(cache->c_block_hash);
307 while (--m >= 0) 263 kfree(cache);
308 kfree(cache->c_indexes_hash[m]);
309 kfree(cache->c_block_hash);
310 kfree(cache);
311 }
312 return NULL; 264 return NULL;
313} 265}
314 266
@@ -357,7 +309,6 @@ mb_cache_destroy(struct mb_cache *cache)
357{ 309{
358 LIST_HEAD(free_list); 310 LIST_HEAD(free_list);
359 struct list_head *l, *ltmp; 311 struct list_head *l, *ltmp;
360 int n;
361 312
362 spin_lock(&mb_cache_spinlock); 313 spin_lock(&mb_cache_spinlock);
363 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 314 list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
@@ -384,13 +335,11 @@ mb_cache_destroy(struct mb_cache *cache)
384 335
385 kmem_cache_destroy(cache->c_entry_cache); 336 kmem_cache_destroy(cache->c_entry_cache);
386 337
387 for (n=0; n < mb_cache_indexes(cache); n++) 338 kfree(cache->c_index_hash);
388 kfree(cache->c_indexes_hash[n]);
389 kfree(cache->c_block_hash); 339 kfree(cache->c_block_hash);
390 kfree(cache); 340 kfree(cache);
391} 341}
392 342
393
394/* 343/*
395 * mb_cache_entry_alloc() 344 * mb_cache_entry_alloc()
396 * 345 *
@@ -402,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache)
402struct mb_cache_entry * 351struct mb_cache_entry *
403mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 352mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
404{ 353{
405 struct mb_cache_entry *ce; 354 struct mb_cache_entry *ce = NULL;
406 355
407 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 356 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
408 if (ce) { 357 spin_lock(&mb_cache_spinlock);
358 if (!list_empty(&mb_cache_lru_list)) {
359 ce = list_entry(mb_cache_lru_list.next,
360 struct mb_cache_entry, e_lru_list);
361 list_del_init(&ce->e_lru_list);
362 __mb_cache_entry_unhash(ce);
363 }
364 spin_unlock(&mb_cache_spinlock);
365 }
366 if (!ce) {
367 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
368 if (!ce)
369 return NULL;
409 atomic_inc(&cache->c_entry_count); 370 atomic_inc(&cache->c_entry_count);
410 INIT_LIST_HEAD(&ce->e_lru_list); 371 INIT_LIST_HEAD(&ce->e_lru_list);
411 INIT_LIST_HEAD(&ce->e_block_list); 372 INIT_LIST_HEAD(&ce->e_block_list);
412 ce->e_cache = cache; 373 ce->e_cache = cache;
413 ce->e_used = 1 + MB_CACHE_WRITER;
414 ce->e_queued = 0; 374 ce->e_queued = 0;
415 } 375 }
376 ce->e_used = 1 + MB_CACHE_WRITER;
416 return ce; 377 return ce;
417} 378}
418 379
@@ -429,17 +390,16 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
  *
  * @bdev: device the cache entry belongs to
  * @block: block number
- * @keys: array of additional keys. There must be indexes_count entries
- *        in the array (as specified when creating the cache).
+ * @key: lookup key
  */
 int
 mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
-		      sector_t block, unsigned int keys[])
+		      sector_t block, unsigned int key)
 {
 	struct mb_cache *cache = ce->e_cache;
 	unsigned int bucket;
 	struct list_head *l;
-	int error = -EBUSY, n;
+	int error = -EBUSY;
 
 	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
 			   cache->c_bucket_bits);
@@ -454,12 +414,9 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
 	ce->e_bdev = bdev;
 	ce->e_block = block;
 	list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
-	for (n=0; n<mb_cache_indexes(cache); n++) {
-		ce->e_indexes[n].o_key = keys[n];
-		bucket = hash_long(keys[n], cache->c_bucket_bits);
-		list_add(&ce->e_indexes[n].o_list,
-			 &cache->c_indexes_hash[n][bucket]);
-	}
+	ce->e_index.o_key = key;
+	bucket = hash_long(key, cache->c_bucket_bits);
+	list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
 	error = 0;
 out:
 	spin_unlock(&mb_cache_spinlock);
@@ -555,13 +512,12 @@ cleanup:
 
 static struct mb_cache_entry *
 __mb_cache_entry_find(struct list_head *l, struct list_head *head,
-		      int index, struct block_device *bdev, unsigned int key)
+		      struct block_device *bdev, unsigned int key)
 {
 	while (l != head) {
 		struct mb_cache_entry *ce =
-			list_entry(l, struct mb_cache_entry,
-				   e_indexes[index].o_list);
-		if (ce->e_bdev == bdev && ce->e_indexes[index].o_key == key) {
+			list_entry(l, struct mb_cache_entry, e_index.o_list);
+		if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
 			DEFINE_WAIT(wait);
 
 			if (!list_empty(&ce->e_lru_list))
@@ -603,23 +559,20 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
  * returned cache entry is locked for shared access ("multiple readers").
  *
  * @cache: the cache to search
- * @index: the number of the additonal index to search (0<=index<indexes_count)
  * @bdev: the device the cache entry should belong to
  * @key: the key in the index
  */
 struct mb_cache_entry *
-mb_cache_entry_find_first(struct mb_cache *cache, int index,
-			  struct block_device *bdev, unsigned int key)
+mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
+			  unsigned int key)
 {
 	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
 	struct list_head *l;
 	struct mb_cache_entry *ce;
 
-	mb_assert(index < mb_cache_indexes(cache));
 	spin_lock(&mb_cache_spinlock);
-	l = cache->c_indexes_hash[index][bucket].next;
-	ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-				   index, bdev, key);
+	l = cache->c_index_hash[bucket].next;
+	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
 	spin_unlock(&mb_cache_spinlock);
 	return ce;
 }
@@ -640,12 +593,11 @@ mb_cache_entry_find_first(struct mb_cache *cache, int index,
  * }
  *
  * @prev: The previous match
- * @index: the number of the additonal index to search (0<=index<indexes_count)
 * @bdev: the device the cache entry should belong to
 * @key: the key in the index
 */
 struct mb_cache_entry *
-mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
+mb_cache_entry_find_next(struct mb_cache_entry *prev,
 			 struct block_device *bdev, unsigned int key)
 {
 	struct mb_cache *cache = prev->e_cache;
@@ -653,11 +605,9 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
 	struct list_head *l;
 	struct mb_cache_entry *ce;
 
-	mb_assert(index < mb_cache_indexes(cache));
 	spin_lock(&mb_cache_spinlock);
-	l = prev->e_indexes[index].o_list.next;
-	ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-				   index, bdev, key);
+	l = prev->e_index.o_list.next;
+	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
 	__mb_cache_entry_release_unlock(prev);
 	return ce;
 }
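With the extra index number gone, callers iterate every entry for a (bdev, key) pair with mb_cache_entry_find_first()/mb_cache_entry_find_next(), as the kernel-doc excerpt above suggests. A rough userspace analogue of that bucket walk (invented types, no locking):

#include <stdio.h>

struct centry {
	struct centry *next;		/* hash-chain link */
	int bdev, key;
};

/* like __mb_cache_entry_find(): scan forward for the next match */
static struct centry *find_next_match(struct centry *e, int bdev, int key)
{
	for (; e; e = e->next)
		if (e->bdev == bdev && e->key == key)
			return e;
	return NULL;
}

int main(void)
{
	struct centry c = { NULL, 1, 7 };
	struct centry b = { &c, 1, 7 };
	struct centry a = { &b, 2, 7 };	/* same key, different device */
	struct centry *e;

	/* find_first, then find_next until exhausted */
	for (e = find_next_match(&a, 1, 7); e;
	     e = find_next_match(e->next, 1, 7))
		printf("match: bdev=%d key=%d\n", e->bdev, e->key);
	return 0;
}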
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 482779fe4e7..3f32bcb0d9b 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -200,13 +200,13 @@ void minix_free_inode(struct inode * inode)
 	ino = inode->i_ino;
 	if (ino < 1 || ino > sbi->s_ninodes) {
 		printk("minix_free_inode: inode 0 or nonexistent inode\n");
-		goto out;
+		return;
 	}
 	bit = ino & ((1<<k) - 1);
 	ino >>= k;
 	if (ino >= sbi->s_imap_blocks) {
 		printk("minix_free_inode: nonexistent imap in superblock\n");
-		goto out;
+		return;
 	}
 
 	minix_clear_inode(inode);	/* clear on-disk copy */
@@ -217,8 +217,6 @@ void minix_free_inode(struct inode * inode)
 		printk("minix_free_inode: bit %lu already cleared\n", bit);
 	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
- out:
-	clear_inode(inode);		/* clear in-memory copy */
 }
 
 struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 1dbf921ca44..085a9262c69 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -271,8 +271,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
 
 got_it:
 	pos = page_offset(page) + p - (char *)page_address(page);
-	err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
-					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
 	if (err)
 		goto out_unlock;
 	memcpy (namx, name, namelen);
@@ -297,8 +296,7 @@ out_unlock:
 
 int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = (struct inode*)mapping->host;
+	struct inode *inode = page->mapping->host;
 	char *kaddr = page_address(page);
 	loff_t pos = page_offset(page) + (char*)de - kaddr;
 	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
@@ -306,8 +304,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
 	int err;
 
 	lock_page(page);
-	err = __minix_write_begin(NULL, mapping, pos, len,
-					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = minix_prepare_chunk(page, pos, len);
 	if (err == 0) {
 		if (sbi->s_version == MINIX_V3)
 			((minix3_dirent *) de)->inode = 0;
@@ -325,16 +322,14 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
 
 int minix_make_empty(struct inode *inode, struct inode *dir)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page = grab_cache_page(mapping, 0);
+	struct page *page = grab_cache_page(inode->i_mapping, 0);
 	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
 	char *kaddr;
 	int err;
 
 	if (!page)
 		return -ENOMEM;
-	err = __minix_write_begin(NULL, mapping, 0, 2 * sbi->s_dirsize,
-					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize);
 	if (err) {
 		unlock_page(page);
 		goto fail;
@@ -425,8 +420,7 @@ not_empty:
 void minix_set_link(struct minix_dir_entry *de, struct page *page,
 	struct inode *inode)
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *dir = mapping->host;
+	struct inode *dir = page->mapping->host;
 	struct minix_sb_info *sbi = minix_sb(dir->i_sb);
 	loff_t pos = page_offset(page) +
 			(char *)de-(char*)page_address(page);
@@ -434,8 +428,7 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page,
 
 	lock_page(page);
 
-	err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize,
-					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
 	if (err == 0) {
 		if (sbi->s_version == MINIX_V3)
 			((minix3_dirent *) de)->inode = inode->i_ino;
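Each conversion above follows the same shape: compute the byte position of a fixed-size directory record within a page, call minix_prepare_chunk() to get the backing blocks mapped, then write the record in place. A loose userspace analogue, where a plain file stands in for the directory page cache and extending the file stands in for block instantiation (names invented):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define DIRSIZE 32			/* stand-in for sbi->s_dirsize */

static int write_record(int fd, off_t pos, const char *name)
{
	char rec[DIRSIZE] = { 0 };
	off_t end = lseek(fd, 0, SEEK_END);

	strncpy(rec, name, sizeof(rec) - 1);
	/* "prepare the chunk": ensure [pos, pos + DIRSIZE) is backed */
	if (end < pos + DIRSIZE && ftruncate(fd, pos + DIRSIZE) < 0)
		return -1;
	/* then write the fixed-size record in place */
	return pwrite(fd, rec, sizeof(rec), pos) == (ssize_t)sizeof(rec) ? 0 : -1;
}

int main(void)
{
	int fd = open("dir.img", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || write_record(fd, 2 * DIRSIZE, "hello") < 0)
		perror("write_record");
	if (fd >= 0)
		close(fd);
	return 0;
}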
diff --git a/fs/minix/file.c b/fs/minix/file.c
index d5320ff23fa..4493ce695ab 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -23,7 +23,29 @@ const struct file_operations minix_file_operations = {
 	.splice_read	= generic_file_splice_read,
 };
 
+static int minix_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
 const struct inode_operations minix_file_inode_operations = {
 	.truncate	= minix_truncate,
+	.setattr	= minix_setattr,
 	.getattr	= minix_getattr,
 };
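minix_setattr() follows the standard ->setattr shape of this era: validate the request, apply a size change first via vmtruncate(), then copy the remaining attributes and dirty the inode. A toy userspace model of that ordering (struct names and flag bits are invented here):

#include <stdio.h>

#define WANT_SIZE 1			/* like ATTR_SIZE */
#define WANT_MODE 2			/* like ATTR_MODE */

struct attrs { long size; int mode; };
struct obj   { long size; int mode; };

static int obj_setattr(struct obj *o, const struct attrs *a, int valid)
{
	/* 1) validate, like inode_change_ok() */
	if ((valid & WANT_SIZE) && a->size < 0)
		return -1;
	/* 2) size change first, like vmtruncate() */
	if ((valid & WANT_SIZE) && a->size != o->size)
		o->size = a->size;
	/* 3) copy the rest, like setattr_copy() + mark_inode_dirty() */
	if (valid & WANT_MODE)
		o->mode = a->mode;
	return 0;
}

int main(void)
{
	struct obj o = { 100, 0644 };
	struct attrs a = { 10, 0600 };

	obj_setattr(&o, &a, WANT_SIZE | WANT_MODE);
	printf("size=%ld mode=%o\n", o.size, o.mode);
	return 0;
}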
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 756f8c93780..fb2020858a3 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -24,12 +24,17 @@ static int minix_write_inode(struct inode *inode,
 static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 
-static void minix_delete_inode(struct inode *inode)
+static void minix_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
-	inode->i_size = 0;
-	minix_truncate(inode);
-	minix_free_inode(inode);
+	if (!inode->i_nlink) {
+		inode->i_size = 0;
+		minix_truncate(inode);
+	}
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
+	if (!inode->i_nlink)
+		minix_free_inode(inode);
 }
 
 static void minix_put_super(struct super_block *sb)
@@ -96,7 +101,7 @@ static const struct super_operations minix_sops = {
 	.alloc_inode	= minix_alloc_inode,
 	.destroy_inode	= minix_destroy_inode,
 	.write_inode	= minix_write_inode,
-	.delete_inode	= minix_delete_inode,
+	.evict_inode	= minix_evict_inode,
 	.put_super	= minix_put_super,
 	.statfs		= minix_statfs,
 	.remount_fs	= minix_remount,
@@ -357,20 +362,26 @@ static int minix_readpage(struct file *file, struct page *page)
 	return block_read_full_page(page,minix_get_block);
 }
 
-int __minix_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata)
+int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
 {
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				minix_get_block);
+	return __block_write_begin(page, pos, len, minix_get_block);
 }
 
 static int minix_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return __minix_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				minix_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t minix_bmap(struct address_space *mapping, sector_t block)
@@ -603,17 +614,16 @@ void minix_truncate(struct inode * inode)
 		V2_minix_truncate(inode);
 }
 
-static int minix_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *minix_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
 }
 
 static struct file_system_type minix_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "minix",
-	.get_sb		= minix_get_sb,
+	.mount		= minix_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
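The new minix_write_begin() error path trims the file back to its old i_size when instantiating blocks past EOF fails, so a failed extending write leaves no half-mapped tail. A small userspace analogue of that all-or-nothing extension, using POSIX file calls as stand-ins:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* extend the file by len bytes, or leave its size untouched on failure */
static int append_all_or_nothing(int fd, const void *buf, size_t len)
{
	off_t isize = lseek(fd, 0, SEEK_END);	/* the old "i_size" */
	ssize_t ret = write(fd, buf, len);

	if (ret == (ssize_t)len)
		return 0;
	/* short or failed write past the old size: roll the size back */
	if (ftruncate(fd, isize) < 0)
		perror("ftruncate");
	return -1;
}

int main(void)
{
	int fd = open("data.img", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || append_all_or_nothing(fd, "abc", 3) < 0)
		perror("append");
	if (fd >= 0)
		close(fd);
	return 0;
}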
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 111f34ee9e3..407b1c84911 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -53,9 +53,7 @@ extern int minix_new_block(struct inode * inode);
 extern void minix_free_block(struct inode *inode, unsigned long block);
 extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi);
 extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
-extern int __minix_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata);
+extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
 
 extern void V1_minix_truncate(struct inode *);
 extern void V2_minix_truncate(struct inode *);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index e20ee85955d..c0d35a3acce 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	return add_nondir(dentry, inode);
 }
 
@@ -115,7 +115,7 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
 
 	inode_inc_link_count(dir);
 
-	inode = minix_new_inode(dir, mode, &err);
+	inode = minix_new_inode(dir, S_IFDIR | mode, &err);
 	if (!inode)
 		goto out_dir;
 
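The minix_mkdir() hunk matters because minix_new_inode() must receive the full mode, including the file-type bits, not just the permission bits a mkdir(2) caller supplies. A two-line demonstration of the difference:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	int mode = 0755;		/* what a mkdir(2) caller passes down */

	printf("S_ISDIR(mode)           = %d\n", S_ISDIR(mode));
	printf("S_ISDIR(S_IFDIR | mode) = %d\n", S_ISDIR(S_IFDIR | mode));
	return 0;
}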
diff --git a/fs/namei.c b/fs/namei.c
index 868d0cb9d47..5362af9b737 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -282,8 +282,7 @@ int inode_permission(struct inode *inode, int mask)
 	if (retval)
 		return retval;
 
-	return security_inode_permission(inode,
-			mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
+	return security_inode_permission(inode, mask);
 }
 
 /**
@@ -484,13 +483,8 @@ ok:
 
 static __always_inline void set_root(struct nameidata *nd)
 {
-	if (!nd->root.mnt) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->root = fs->root;
-		path_get(&nd->root);
-		read_unlock(&fs->lock);
-	}
+	if (!nd->root.mnt)
+		get_fs_root(current->fs, &nd->root);
 }
 
 static int link_path_walk(const char *, struct nameidata *);
@@ -601,15 +595,16 @@ int follow_up(struct path *path)
 {
 	struct vfsmount *parent;
 	struct dentry *mountpoint;
-	spin_lock(&vfsmount_lock);
+
+	br_read_lock(vfsmount_lock);
 	parent = path->mnt->mnt_parent;
 	if (parent == path->mnt) {
-		spin_unlock(&vfsmount_lock);
+		br_read_unlock(vfsmount_lock);
 		return 0;
 	}
 	mntget(parent);
 	mountpoint = dget(path->mnt->mnt_mountpoint);
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
@@ -692,6 +687,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 }
 
 /*
+ * Allocate a dentry with name and parent, and perform a parent
+ * directory ->lookup on it. Returns the new dentry, or ERR_PTR
+ * on error. parent->d_inode->i_mutex must be held. d_lookup must
+ * have verified that no child exists while under i_mutex.
+ */
+static struct dentry *d_alloc_and_lookup(struct dentry *parent,
+				struct qstr *name, struct nameidata *nd)
+{
+	struct inode *inode = parent->d_inode;
+	struct dentry *dentry;
+	struct dentry *old;
+
+	/* Don't create child dentry for a dead directory. */
+	if (unlikely(IS_DEADDIR(inode)))
+		return ERR_PTR(-ENOENT);
+
+	dentry = d_alloc(parent, name);
+	if (unlikely(!dentry))
+		return ERR_PTR(-ENOMEM);
+
+	old = inode->i_op->lookup(inode, dentry, nd);
+	if (unlikely(old)) {
+		dput(dentry);
+		dentry = old;
+	}
+	return dentry;
+}
+
+/*
 * It's more convoluted than I'd like it to be, but... it's still fairly
 * small and for now I'd prefer to have fast path as straight as possible.
 * It _is_ time-critical.
@@ -712,9 +736,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 			return err;
 	}
 
+	/*
+	 * Rename seqlock is not required here because in the off chance
+	 * of a false negative due to a concurrent rename, we're going to
+	 * do the non-racy lookup, below.
+	 */
 	dentry = __d_lookup(nd->path.dentry, name);
 	if (!dentry)
 		goto need_lookup;
+found:
 	if (dentry->d_op && dentry->d_op->d_revalidate)
 		goto need_revalidate;
 done:
@@ -730,56 +760,28 @@ need_lookup:
 	mutex_lock(&dir->i_mutex);
 	/*
 	 * First re-do the cached lookup just in case it was created
-	 * while we waited for the directory semaphore..
+	 * while we waited for the directory semaphore, or the first
+	 * lookup failed due to an unrelated rename.
 	 *
-	 * FIXME! This could use version numbering or similar to
-	 * avoid unnecessary cache lookups.
-	 *
-	 * The "dcache_lock" is purely to protect the RCU list walker
-	 * from concurrent renames at this point (we mustn't get false
-	 * negatives from the RCU list walk here, unlike the optimistic
-	 * fast walk).
-	 *
-	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
+	 * This could use version numbering or similar to avoid unnecessary
+	 * cache lookups, but then we'd have to do the first lookup in the
+	 * non-racy way. However in the common case here, everything should
+	 * be hot in cache, so would it be a big win?
 	 */
 	dentry = d_lookup(parent, name);
-	if (!dentry) {
-		struct dentry *new;
-
-		/* Don't create child dentry for a dead directory. */
-		dentry = ERR_PTR(-ENOENT);
-		if (IS_DEADDIR(dir))
-			goto out_unlock;
-
-		new = d_alloc(parent, name);
-		dentry = ERR_PTR(-ENOMEM);
-		if (new) {
-			dentry = dir->i_op->lookup(dir, new, nd);
-			if (dentry)
-				dput(new);
-			else
-				dentry = new;
-		}
-out_unlock:
+	if (likely(!dentry)) {
+		dentry = d_alloc_and_lookup(parent, name, nd);
 		mutex_unlock(&dir->i_mutex);
 		if (IS_ERR(dentry))
 			goto fail;
 		goto done;
 	}
-
 	/*
 	 * Uhhuh! Nasty case: the cache was re-populated while
 	 * we waited on the semaphore. Need to revalidate.
 	 */
 	mutex_unlock(&dir->i_mutex);
-	if (dentry->d_op && dentry->d_op->d_revalidate) {
-		dentry = do_revalidate(dentry, nd);
-		if (!dentry)
-			dentry = ERR_PTR(-ENOENT);
-	}
-	if (IS_ERR(dentry))
-		goto fail;
-	goto done;
+	goto found;
 
 need_revalidate:
 	dentry = do_revalidate(dentry, nd);
@@ -1016,11 +1018,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 		nd->path = nd->root;
 		path_get(&nd->root);
 	} else if (dfd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->path = fs->pwd;
-		path_get(&fs->pwd);
-		read_unlock(&fs->lock);
+		get_fs_pwd(current->fs, &nd->path);
 	} else {
 		struct dentry *dentry;
 
@@ -1123,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 static struct dentry *__lookup_hash(struct qstr *name,
 		struct dentry *base, struct nameidata *nd)
 {
+	struct inode *inode = base->d_inode;
 	struct dentry *dentry;
-	struct inode *inode;
 	int err;
 
-	inode = base->d_inode;
+	err = exec_permission(inode);
+	if (err)
+		return ERR_PTR(err);
 
 	/*
 	 * See if the low-level filesystem might want
@@ -1140,35 +1140,18 @@ static struct dentry *__lookup_hash(struct qstr *name,
 			goto out;
 	}
 
-	dentry = __d_lookup(base, name);
-
-	/* lockess __d_lookup may fail due to concurrent d_move()
-	 * in some unrelated directory, so try with d_lookup
+	/*
+	 * Don't bother with __d_lookup: callers are for creat as
+	 * well as unlink, so a lot of the time it would cost
+	 * a double lookup.
 	 */
-	if (!dentry)
-		dentry = d_lookup(base, name);
+	dentry = d_lookup(base, name);
 
 	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
 		dentry = do_revalidate(dentry, nd);
 
-	if (!dentry) {
-		struct dentry *new;
-
-		/* Don't create child dentry for a dead directory. */
-		dentry = ERR_PTR(-ENOENT);
-		if (IS_DEADDIR(inode))
-			goto out;
-
-		new = d_alloc(base, name);
-		dentry = ERR_PTR(-ENOMEM);
-		if (!new)
-			goto out;
-		dentry = inode->i_op->lookup(inode, new, nd);
-		if (!dentry)
-			dentry = new;
-		else
-			dput(new);
-	}
+	if (!dentry)
+		dentry = d_alloc_and_lookup(base, name, nd);
out:
 	return dentry;
 }
@@ -1180,11 +1163,6 @@ out:
 */
 static struct dentry *lookup_hash(struct nameidata *nd)
 {
-	int err;
-
-	err = exec_permission(nd->path.dentry->d_inode);
-	if (err)
-		return ERR_PTR(err);
 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
 
@@ -1232,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	if (err)
 		return ERR_PTR(err);
 
-	err = exec_permission(base->d_inode);
-	if (err)
-		return ERR_PTR(err);
 	return __lookup_hash(&this, base, NULL);
 }
 
@@ -1484,8 +1459,7 @@ static int handle_truncate(struct path *path)
 	 */
 	error = locks_verify_locked(inode);
 	if (!error)
-		error = security_path_truncate(path, 0,
-				       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+		error = security_path_truncate(path);
 	if (!error) {
 		error = do_truncate(path->dentry, 0,
 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -1600,6 +1574,7 @@ static struct file *finish_open(struct nameidata *nd,
 	 */
 	if (will_truncate)
 		mnt_drop_write(nd->path.mnt);
+	path_put(&nd->path);
 	return filp;
 
 exit:
@@ -1701,6 +1676,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	}
 	filp = nameidata_to_filp(nd);
 	mnt_drop_write(nd->path.mnt);
+	path_put(&nd->path);
 	if (!IS_ERR(filp)) {
 		error = ima_file_check(filp, acc_mode);
 		if (error) {
@@ -2311,7 +2287,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 		goto slashes;
 	inode = dentry->d_inode;
 	if (inode)
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto exit2;
@@ -2635,7 +2611,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	int error;
 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
-	const char *old_name;
+	const unsigned char *old_name;
 
 	if (old_dentry->d_inode == new_dentry->d_inode)
 		return 0;
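The do_lookup() restructuring keeps one idea: try the lock-free __d_lookup() first, and on a miss (which may be a false negative caused by a concurrent rename) retake i_mutex, retry with d_lookup(), and only then allocate via d_alloc_and_lookup(). A compressed userspace sketch of that optimistic-then-locked pattern (pthread-based, all names invented; compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t dir_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic(const char *) slot;	/* one-entry "dcache" */

static const char *fast_lookup(const char *name)
{
	const char *s = atomic_load(&slot);	/* lock-free, may miss */

	return (s && strcmp(s, name) == 0) ? s : NULL;
}

static const char *slow_lookup(const char *name)
{
	const char *s;

	pthread_mutex_lock(&dir_lock);
	s = atomic_load(&slot);			/* non-racy retry */
	if (!s || strcmp(s, name) != 0) {
		atomic_store(&slot, name);	/* like d_alloc_and_lookup() */
		s = name;
	}
	pthread_mutex_unlock(&dir_lock);
	return s;
}

static const char *lookup(const char *name)
{
	const char *d = fast_lookup(name);

	return d ? d : slow_lookup(name);
}

int main(void)
{
	printf("%s\n", lookup("foo"));	/* slow path populates */
	printf("%s\n", lookup("foo"));	/* fast path hits */
	return 0;
}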
diff --git a/fs/namespace.c b/fs/namespace.c
index 88058de59c7..8a415c9c5e5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -29,6 +31,7 @@
 #include <linux/log2.h>
 #include <linux/idr.h>
 #include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
@@ -37,12 +40,10 @@
 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
 #define HASH_SIZE (1UL << HASH_SHIFT)
 
-/* spinlock for vfsmount related operations, inplace of dcache_lock */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
-
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
 static int mnt_id_start = 0;
 static int mnt_group_start = 1;
 
@@ -54,6 +55,16 @@ static struct rw_semaphore namespace_sem;
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
 
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, ie. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+DEFINE_BRLOCK(vfsmount_lock);
+
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -64,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 
 #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
 
-/* allocation is serialized by namespace_sem */
+/*
+ * allocation is serialized by namespace_sem, but we need the spinlock to
+ * serialize with freeing.
+ */
 static int mnt_alloc_id(struct vfsmount *mnt)
 {
 	int res;
 
retry:
 	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
-	spin_lock(&vfsmount_lock);
+	spin_lock(&mnt_id_lock);
 	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
 	if (!res)
 		mnt_id_start = mnt->mnt_id + 1;
-	spin_unlock(&vfsmount_lock);
+	spin_unlock(&mnt_id_lock);
 	if (res == -EAGAIN)
 		goto retry;
 
@@ -85,11 +99,11 @@ retry:
 static void mnt_free_id(struct vfsmount *mnt)
 {
 	int id = mnt->mnt_id;
-	spin_lock(&vfsmount_lock);
+	spin_lock(&mnt_id_lock);
 	ida_remove(&mnt_id_ida, id);
 	if (mnt_id_start > id)
 		mnt_id_start = id;
-	spin_unlock(&vfsmount_lock);
+	spin_unlock(&mnt_id_lock);
 }
 
 /*
@@ -150,6 +164,9 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
+#ifdef CONFIG_FSNOTIFY
+		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
+#endif
 #ifdef CONFIG_SMP
 		mnt->mnt_writers = alloc_percpu(int);
 		if (!mnt->mnt_writers)
@@ -344,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -378,15 +395,15 @@
 	 */
 	smp_wmb();
 	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	return ret;
 }
 
 static void __mnt_unmake_readonly(struct vfsmount *mnt)
 {
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	mnt->mnt_flags &= ~MNT_READONLY;
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 }
 
 void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -410,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
 /*
 * find the first or last mount at @dentry on vfsmount @mnt depending on
 * @dir. If @dir is set return the first mount else return the last mount.
+ * vfsmount_lock must be held for read or write.
 */
 struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 			      int dir)
@@ -439,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct vfsmount *child_mnt;
-	spin_lock(&vfsmount_lock);
+
+	br_read_lock(vfsmount_lock);
 	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
 		mntget(child_mnt);
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	return child_mnt;
 }
 
@@ -451,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
 	return mnt->mnt_ns == current->nsproxy->mnt_ns;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void touch_mnt_namespace(struct mnt_namespace *ns)
 {
 	if (ns) {
@@ -459,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
 	}
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void __touch_mnt_namespace(struct mnt_namespace *ns)
 {
 	if (ns && ns->event != event) {
@@ -467,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 	}
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 {
 	old_path->dentry = mnt->mnt_mountpoint;
@@ -478,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 	old_path->dentry->d_mounted--;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
 			struct vfsmount *child_mnt)
 {
@@ -486,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
 	dentry->d_mounted++;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void attach_mnt(struct vfsmount *mnt, struct path *path)
 {
 	mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -495,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
 }
 
 /*
- * the caller must hold vfsmount_lock
+ * vfsmount lock must be held for write
 */
 static void commit_tree(struct vfsmount *mnt)
 {
@@ -561,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 			goto out_free;
 	}
 
-	mnt->mnt_flags = old->mnt_flags;
+	mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
 	atomic_inc(&sb->s_active);
 	mnt->mnt_sb = sb;
 	mnt->mnt_root = dget(root);
@@ -610,6 +644,7 @@ static inline void __mntput(struct vfsmount *mnt)
 	 * provides barriers, so count_mnt_writers() below is safe. AV
 	 */
 	WARN_ON(count_mnt_writers(mnt));
+	fsnotify_vfsmount_delete(mnt);
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
@@ -618,39 +653,43 @@
 void mntput_no_expire(struct vfsmount *mnt)
 {
repeat:
-	if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
-		if (likely(!mnt->mnt_pinned)) {
-			spin_unlock(&vfsmount_lock);
-			__mntput(mnt);
-			return;
-		}
-		atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
-		mnt->mnt_pinned = 0;
-		spin_unlock(&vfsmount_lock);
-		acct_auto_close_mnt(mnt);
-		goto repeat;
+	if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+		return;
+	br_write_lock(vfsmount_lock);
+	if (!atomic_dec_and_test(&mnt->mnt_count)) {
+		br_write_unlock(vfsmount_lock);
+		return;
 	}
+	if (likely(!mnt->mnt_pinned)) {
+		br_write_unlock(vfsmount_lock);
+		__mntput(mnt);
+		return;
+	}
+	atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+	mnt->mnt_pinned = 0;
+	br_write_unlock(vfsmount_lock);
+	acct_auto_close_mnt(mnt);
+	goto repeat;
 }
-
 EXPORT_SYMBOL(mntput_no_expire);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	mnt->mnt_pinned++;
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 }
 
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *mnt)
 {
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	if (mnt->mnt_pinned) {
 		atomic_inc(&mnt->mnt_count);
 		mnt->mnt_pinned--;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 }
 
 EXPORT_SYMBOL(mnt_unpin);
@@ -741,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
 	struct mnt_namespace *ns = p->ns;
 	int res = 0;
 
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	if (p->event != ns->event) {
 		p->event = ns->event;
 		res = 1;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 
 	return res;
 }
@@ -783,7 +822,6 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 		{ MNT_NOATIME, ",noatime" },
 		{ MNT_NODIRATIME, ",nodiratime" },
 		{ MNT_RELATIME, ",relatime" },
-		{ MNT_STRICTATIME, ",strictatime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
@@ -948,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
 	int minimum_refs = 0;
 	struct vfsmount *p;
 
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += atomic_read(&p->mnt_count);
 		minimum_refs += 2;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -980,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	if (propagate_mount_busy(mnt, 2))
 		ret = 0;
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -999,13 +1037,14 @@ void release_mounts(struct list_head *head)
 		if (mnt->mnt_parent != mnt) {
 			struct dentry *dentry;
 			struct vfsmount *m;
-			spin_lock(&vfsmount_lock);
+
+			br_write_lock(vfsmount_lock);
 			dentry = mnt->mnt_mountpoint;
 			m = mnt->mnt_parent;
 			mnt->mnt_mountpoint = mnt->mnt_root;
 			mnt->mnt_parent = mnt;
 			m->mnt_ghosts--;
-			spin_unlock(&vfsmount_lock);
+			br_write_unlock(vfsmount_lock);
 			dput(dentry);
 			mntput(m);
 		}
@@ -1013,6 +1052,10 @@ void release_mounts(struct list_head *head)
 	}
 }
 
+/*
+ * vfsmount lock must be held for write
+ * namespace_sem must be held for write
+ */
 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
 	struct vfsmount *p;
@@ -1103,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	}
 
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	event++;
 
 	if (!(flags & MNT_DETACH))
@@ -1115,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		umount_tree(mnt, 1, &umount_list);
 		retval = 0;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
@@ -1227,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 			q = clone_mnt(p, p->mnt_root, flag);
 			if (!q)
 				goto Enomem;
-			spin_lock(&vfsmount_lock);
+			br_write_lock(vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, &path);
-			spin_unlock(&vfsmount_lock);
+			br_write_unlock(vfsmount_lock);
 		}
 	}
 	return res;
Enomem:
 	if (res) {
 		LIST_HEAD(umount_list);
-		spin_lock(&vfsmount_lock);
+		br_write_lock(vfsmount_lock);
 		umount_tree(res, 0, &umount_list);
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 		release_mounts(&umount_list);
 	}
 	return NULL;
@@ -1258,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
 {
 	LIST_HEAD(umount_list);
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	umount_tree(mnt, 0, &umount_list);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 }
@@ -1388,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	if (err)
 		goto out_cleanup_ids;
 
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1407,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 		list_del_init(&child->mnt_hash);
 		commit_tree(child);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
+
 	return 0;
 
 out_cleanup_ids:
@@ -1440,13 +1484,30 @@ out_unlock:
 }
 
 /*
+ * Sanity check the flags to change_mnt_propagation.
+ */
+
+static int flags_to_propagation_type(int flags)
+{
+	int type = flags & ~MS_REC;
+
+	/* Fail if any non-propagation flags are set */
+	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+		return 0;
+	/* Only one propagation flag should be set */
+	if (!is_power_of_2(type))
+		return 0;
+	return type;
+}
+
+/*
 * recursively change the type of the mountpoint.
 */
 static int do_change_type(struct path *path, int flag)
 {
 	struct vfsmount *m, *mnt = path->mnt;
 	int recurse = flag & MS_REC;
-	int type = flag & ~MS_REC;
+	int type;
 	int err = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -1455,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
+	type = flags_to_propagation_type(flag);
+	if (!type)
+		return -EINVAL;
+
 	down_write(&namespace_sem);
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
@@ -1462,10 +1527,10 @@ static int do_change_type(struct path *path, int flag)
 			goto out_unlock;
 	}
 
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 
 out_unlock:
 	up_write(&namespace_sem);
@@ -1509,9 +1574,10 @@ static int do_loopback(struct path *path, char *old_name,
 	err = graft_tree(mnt, path);
 	if (err) {
 		LIST_HEAD(umount_list);
-		spin_lock(&vfsmount_lock);
+
+		br_write_lock(vfsmount_lock);
 		umount_tree(mnt, 0, &umount_list);
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 		release_mounts(&umount_list);
 	}
 
@@ -1564,16 +1630,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
-		spin_lock(&vfsmount_lock);
+		br_write_lock(vfsmount_lock);
 		mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
 		path->mnt->mnt_flags = mnt_flags;
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 	}
 	up_write(&sb->s_umount);
 	if (!err) {
-		spin_lock(&vfsmount_lock);
+		br_write_lock(vfsmount_lock);
 		touch_mnt_namespace(path->mnt->mnt_ns);
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 	}
 	return err;
 }
@@ -1678,9 +1744,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
-	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -1750,7 +1814,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		return;
 
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -1769,7 +1833,7 @@
 		touch_mnt_namespace(mnt->mnt_ns);
 		umount_tree(mnt, 1, &umounts);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 
 	release_mounts(&umounts);
@@ -1826,6 +1890,8 @@ resume:
 /*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
+ *
+ * vfsmount_lock must be held for write
 */
 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 {
@@ -1984,7 +2050,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (flags & MS_RDONLY)
 		mnt_flags |= MNT_READONLY;
 
-	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
+	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
 		   MS_STRICTATIME);
 
@@ -2044,9 +2110,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		kfree(new_ns);
 		return ERR_PTR(-ENOMEM);
 	}
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2208,10 +2274,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out1;
 	}
 
-	read_lock(&current->fs->lock);
-	root = current->fs->root;
-	path_get(&current->fs->root);
-	read_unlock(&current->fs->lock);
+	get_fs_root(current->fs, &root);
 	down_write(&namespace_sem);
 	mutex_lock(&old.dentry->d_inode->i_mutex);
 	error = -EINVAL;
@@ -2243,7 +2306,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out2; /* not attached */
 	/* make sure we can reach put_old from new_root */
 	tmp = old.mnt;
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	if (tmp != new.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
@@ -2263,7 +2326,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	/* mount new_root on / */
 	attach_mnt(new.mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	chroot_fs_refs(&root, &new);
 	error = 0;
 	path_put(&root_parent);
@@ -2278,7 +2341,7 @@ out1:
out0:
 	return error;
out3:
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	goto out2;
 }
 
@@ -2325,6 +2388,8 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
 
+	br_lock_init(vfsmount_lock);
+
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2343,9 +2408,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	if (!atomic_dec_and_test(&ns->count))
 		return;
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	umount_tree(ns->root, 0, &umount_list);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	kfree(ns);
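Throughout fs/namespace.c the global vfsmount spinlock becomes a big-reader ("br") lock: mount-hash readers such as lookup_mnt() stay cheap and scale across CPUs, while writers that change the mount tree exclude everyone. Modeled in userspace with a pthread rwlock (real brlocks use per-CPU spinlocks, so this is only an analogue; compile with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t vfsmount_lock = PTHREAD_RWLOCK_INITIALIZER;
static int mount_generation;

static int read_mount_generation(void)
{
	int g;

	pthread_rwlock_rdlock(&vfsmount_lock);	/* like br_read_lock() */
	g = mount_generation;
	pthread_rwlock_unlock(&vfsmount_lock);	/* like br_read_unlock() */
	return g;
}

static void change_mount_tree(void)
{
	pthread_rwlock_wrlock(&vfsmount_lock);	/* like br_write_lock() */
	mount_generation++;
	pthread_rwlock_unlock(&vfsmount_lock);	/* like br_write_unlock() */
}

int main(void)
{
	change_mount_tree();
	printf("generation=%d\n", read_mount_generation());
	return 0;
}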
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9578cbe0cd5..aac8832e919 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -95,6 +95,34 @@ const struct dentry_operations ncp_root_dentry_operations =
95}; 95};
96 96
97 97
98#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
99
100static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
101{
102#ifdef CONFIG_NCPFS_SMALLDOS
103 int ns = ncp_namespace(i);
104
105 if ((ns == NW_NS_DOS)
106#ifdef CONFIG_NCPFS_OS2_NS
107 || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
108#endif /* CONFIG_NCPFS_OS2_NS */
109 )
110 return 0;
111#endif /* CONFIG_NCPFS_SMALLDOS */
112 return 1;
113}
114
115#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS)
116
117static inline int ncp_case_sensitive(struct dentry *dentry)
118{
119#ifdef CONFIG_NCPFS_NFS_NS
120 return ncp_namespace(dentry->d_inode) == NW_NS_NFS;
121#else
122 return 0;
123#endif /* CONFIG_NCPFS_NFS_NS */
124}
125
98/* 126/*
99 * Note: leave the hash unchanged if the directory 127 * Note: leave the hash unchanged if the directory
100 * is case-sensitive. 128 * is case-sensitive.
@@ -102,13 +130,12 @@ const struct dentry_operations ncp_root_dentry_operations =
102static int 130static int
103ncp_hash_dentry(struct dentry *dentry, struct qstr *this) 131ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
104{ 132{
105 struct nls_table *t; 133 if (!ncp_case_sensitive(dentry)) {
106 unsigned long hash; 134 struct nls_table *t;
107 int i; 135 unsigned long hash;
108 136 int i;
109 t = NCP_IO_TABLE(dentry);
110 137
111 if (!ncp_case_sensitive(dentry->d_inode)) { 138 t = NCP_IO_TABLE(dentry);
112 hash = init_name_hash(); 139 hash = init_name_hash();
113 for (i=0; i<this->len ; i++) 140 for (i=0; i<this->len ; i++)
114 hash = partial_name_hash(ncp_tolower(t, this->name[i]), 141 hash = partial_name_hash(ncp_tolower(t, this->name[i]),
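
[annotation] The restructured ncp_hash_dentry() only folds case when the volume is case-insensitive. The invariant that matters is that this hash and ncp_compare_dentry() below must agree: any two names the compare function treats as equal have to hash identically, or d_lookup() will miss cached entries. A condensed sketch of the case-folding hash (ncp_fold_hash is a hypothetical name):

    static unsigned long ncp_fold_hash(struct nls_table *t,
                                       const unsigned char *name,
                                       unsigned int len)
    {
        unsigned long hash = init_name_hash();
        unsigned int i;

        for (i = 0; i < len; i++)
            hash = partial_name_hash(ncp_tolower(t, name[i]), hash);
        return end_name_hash(hash);
    }
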
@@ -124,7 +151,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
124 if (a->len != b->len) 151 if (a->len != b->len)
125 return 1; 152 return 1;
126 153
127 if (ncp_case_sensitive(dentry->d_inode)) 154 if (ncp_case_sensitive(dentry))
128 return strncmp(a->name, b->name, a->len); 155 return strncmp(a->name, b->name, a->len);
129 156
130 return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len); 157 return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len);
@@ -266,7 +293,7 @@ leave_me:;
266 293
267 294
268static int 295static int
269__ncp_lookup_validate(struct dentry *dentry) 296ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
270{ 297{
271 struct ncp_server *server; 298 struct ncp_server *server;
272 struct dentry *parent; 299 struct dentry *parent;
@@ -283,9 +310,6 @@ __ncp_lookup_validate(struct dentry *dentry)
283 310
284 server = NCP_SERVER(dir); 311 server = NCP_SERVER(dir);
285 312
286 if (!ncp_conn_valid(server))
287 goto finished;
288
289 /* 313 /*
290 * Inspired by smbfs: 314 * Inspired by smbfs:
291 * The default validation is based on dentry age: 315 * The default validation is based on dentry age:
@@ -304,8 +328,11 @@ __ncp_lookup_validate(struct dentry *dentry)
304 if (ncp_is_server_root(dir)) { 328 if (ncp_is_server_root(dir)) {
305 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 329 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
306 dentry->d_name.len, 1); 330 dentry->d_name.len, 1);
307 if (!res) 331 if (!res) {
308 res = ncp_lookup_volume(server, __name, &(finfo.i)); 332 res = ncp_lookup_volume(server, __name, &(finfo.i));
333 if (!res)
334 ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
335 }
309 } else { 336 } else {
310 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 337 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
311 dentry->d_name.len, !ncp_preserve_case(dir)); 338 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -320,13 +347,17 @@ __ncp_lookup_validate(struct dentry *dentry)
320 * what we remember, it's not valid any more. 347 * what we remember, it's not valid any more.
321 */ 348 */
322 if (!res) { 349 if (!res) {
323 if (finfo.i.dirEntNum == NCP_FINFO(dentry->d_inode)->dirEntNum) { 350 struct inode *inode = dentry->d_inode;
351
352 mutex_lock(&inode->i_mutex);
353 if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
324 ncp_new_dentry(dentry); 354 ncp_new_dentry(dentry);
325 val=1; 355 val=1;
326 } else 356 } else
327 DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n"); 357 DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
328 358
329 ncp_update_inode2(dentry->d_inode, &finfo); 359 ncp_update_inode2(inode, &finfo);
360 mutex_unlock(&inode->i_mutex);
330 } 361 }
331 362
332finished: 363finished:
@@ -335,16 +366,6 @@ finished:
335 return val; 366 return val;
336} 367}
337 368
338static int
339ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
340{
341 int res;
342 lock_kernel();
343 res = __ncp_lookup_validate(dentry);
344 unlock_kernel();
345 return res;
346}
347
348static struct dentry * 369static struct dentry *
349ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) 370ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
350{ 371{
@@ -411,8 +432,6 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
411 int result, mtime_valid = 0; 432 int result, mtime_valid = 0;
412 time_t mtime = 0; 433 time_t mtime = 0;
413 434
414 lock_kernel();
415
416 ctl.page = NULL; 435 ctl.page = NULL;
417 ctl.cache = NULL; 436 ctl.cache = NULL;
418 437
@@ -421,6 +440,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
421 (int) filp->f_pos); 440 (int) filp->f_pos);
422 441
423 result = -EIO; 442 result = -EIO;
443 /* Do not generate '.' and '..' when server is dead. */
424 if (!ncp_conn_valid(server)) 444 if (!ncp_conn_valid(server))
425 goto out; 445 goto out;
426 446
@@ -532,6 +552,12 @@ read_really:
532 ctl.head.end = ctl.fpos - 1; 552 ctl.head.end = ctl.fpos - 1;
533 ctl.head.eof = ctl.valid; 553 ctl.head.eof = ctl.valid;
534finished: 554finished:
555 if (ctl.page) {
556 kunmap(ctl.page);
557 SetPageUptodate(ctl.page);
558 unlock_page(ctl.page);
559 page_cache_release(ctl.page);
560 }
535 if (page) { 561 if (page) {
536 cache->head = ctl.head; 562 cache->head = ctl.head;
537 kunmap(page); 563 kunmap(page);
@@ -539,23 +565,17 @@ finished:
539 unlock_page(page); 565 unlock_page(page);
540 page_cache_release(page); 566 page_cache_release(page);
541 } 567 }
542 if (ctl.page) {
543 kunmap(ctl.page);
544 SetPageUptodate(ctl.page);
545 unlock_page(ctl.page);
546 page_cache_release(ctl.page);
547 }
548out: 568out:
549 unlock_kernel();
550 return result; 569 return result;
551} 570}
552 571
553static int 572static int
554ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 573ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
555 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry) 574 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
575 int inval_childs)
556{ 576{
557 struct dentry *newdent, *dentry = filp->f_path.dentry; 577 struct dentry *newdent, *dentry = filp->f_path.dentry;
558 struct inode *newino, *inode = dentry->d_inode; 578 struct inode *dir = dentry->d_inode;
559 struct ncp_cache_control ctl = *ctrl; 579 struct ncp_cache_control ctl = *ctrl;
560 struct qstr qname; 580 struct qstr qname;
561 int valid = 0; 581 int valid = 0;
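
[annotation] Two things change in this pair of hunks: ncp_fill_cache() grows an inval_childs parameter (used further down to shrink stale subtrees when a volume's case sensitivity changes), and the cache's payload page (ctl.page) is now released before the header page (page), apparently so the payload is marked uptodate and unlocked before the header that advertises it is updated. The page lifecycle follows the usual page-cache scratch pattern; a sketch:

    struct page *pg = grab_cache_page(&dir->i_data, offset); /* locked page */
    if (pg) {
        void *buf = kmap(pg);     /* map it for CPU access */
        /* ... fill buf with cached directory entries ... */
        kunmap(pg);
        SetPageUptodate(pg);      /* contents are now valid */
        unlock_page(pg);
        page_cache_release(pg);   /* drop the grab_cache_page() reference */
    }
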
@@ -564,9 +584,9 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
564 __u8 __name[NCP_MAXPATHLEN + 1]; 584 __u8 __name[NCP_MAXPATHLEN + 1];
565 585
566 qname.len = sizeof(__name); 586 qname.len = sizeof(__name);
567 if (ncp_vol2io(NCP_SERVER(inode), __name, &qname.len, 587 if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len,
568 entry->i.entryName, entry->i.nameLen, 588 entry->i.entryName, entry->i.nameLen,
569 !ncp_preserve_entry_case(inode, entry->i.NSCreator))) 589 !ncp_preserve_entry_case(dir, entry->i.NSCreator)))
570 return 1; /* I'm not sure */ 590 return 1; /* I'm not sure */
571 591
572 qname.name = __name; 592 qname.name = __name;
@@ -584,22 +604,64 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
584 goto end_advance; 604 goto end_advance;
585 } else { 605 } else {
586 hashed = 1; 606 hashed = 1;
587 memcpy((char *) newdent->d_name.name, qname.name, 607
588 newdent->d_name.len); 608 /* If case sensitivity changed for this volume, all entries below this one
609 should be thrown away. This entry itself is not affected, as its case
610 sensitivity is controlled by its own parent. */
611 if (inval_childs)
612 shrink_dcache_parent(newdent);
613
614 /*
615 * It is not as dangerous as it looks. NetWare's OS2 namespace is
 616 * case preserving yet case insensitive. So we update the dentry's
 617 * name as received from the server. We found the dentry via d_lookup
 618 * with our hash, so we know the hash does not change, and replacing
 619 * the name should be reasonably safe.
620 */
621 if (qname.len == newdent->d_name.len &&
622 memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
623 struct inode *inode = newdent->d_inode;
624
625 /*
626 * Inside ncpfs all uses of d_name are either for debugging,
 627 * or in functions which acquire the inode mutex (mknod, creat,
 628 * lookup). So grab i_mutex here, to be sure. d_path
 629 * uses dcache_lock when generating the path, so we should too.
 630 * And finally, d_compare is protected by the dentry's d_lock, so
 631 * here we go.
632 */
633 if (inode)
634 mutex_lock(&inode->i_mutex);
635 spin_lock(&dcache_lock);
636 spin_lock(&newdent->d_lock);
637 memcpy((char *) newdent->d_name.name, qname.name,
638 newdent->d_name.len);
639 spin_unlock(&newdent->d_lock);
640 spin_unlock(&dcache_lock);
641 if (inode)
642 mutex_unlock(&inode->i_mutex);
643 }
589 } 644 }
590 645
591 if (!newdent->d_inode) { 646 if (!newdent->d_inode) {
647 struct inode *inode;
648
592 entry->opened = 0; 649 entry->opened = 0;
593 entry->ino = iunique(inode->i_sb, 2); 650 entry->ino = iunique(dir->i_sb, 2);
594 newino = ncp_iget(inode->i_sb, entry); 651 inode = ncp_iget(dir->i_sb, entry);
595 if (newino) { 652 if (inode) {
596 newdent->d_op = &ncp_dentry_operations; 653 newdent->d_op = &ncp_dentry_operations;
597 d_instantiate(newdent, newino); 654 d_instantiate(newdent, inode);
598 if (!hashed) 655 if (!hashed)
599 d_rehash(newdent); 656 d_rehash(newdent);
600 } 657 }
601 } else 658 } else {
602 ncp_update_inode2(newdent->d_inode, entry); 659 struct inode *inode = newdent->d_inode;
660
661 mutex_lock(&inode->i_mutex);
662 ncp_update_inode2(inode, entry);
663 mutex_unlock(&inode->i_mutex);
664 }
603 665
604 if (newdent->d_inode) { 666 if (newdent->d_inode) {
605 ino = newdent->d_inode->i_ino; 667 ino = newdent->d_inode->i_ino;
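
[annotation] The in-place memcpy() over d_name is only legal because of two invariants the surrounding code establishes: the new name has the same length as the old one (checked explicitly), and it produces the same hash (guaranteed because the dentry was found via d_lookup() with that hash, which is case-folded on these volumes). Note also the memcmp() test: a nonzero result means the names differ, which is exactly when the copy is needed. Given those invariants, the update is an equal-length byte swap performed under every lock a concurrent reader of d_name might hold; the locking ladder, condensed:

    if (inode)
        mutex_lock(&inode->i_mutex);   /* vs. mknod/creat/lookup users */
    spin_lock(&dcache_lock);           /* vs. d_path() */
    spin_lock(&newdent->d_lock);       /* vs. d_compare() */
    memcpy((char *)newdent->d_name.name, qname.name, newdent->d_name.len);
    spin_unlock(&newdent->d_lock);
    spin_unlock(&dcache_lock);
    if (inode)
        mutex_unlock(&inode->i_mutex);
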
@@ -617,7 +679,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
617 ctl.cache = NULL; 679 ctl.cache = NULL;
618 ctl.idx -= NCP_DIRCACHE_SIZE; 680 ctl.idx -= NCP_DIRCACHE_SIZE;
619 ctl.ofs += 1; 681 ctl.ofs += 1;
620 ctl.page = grab_cache_page(&inode->i_data, ctl.ofs); 682 ctl.page = grab_cache_page(&dir->i_data, ctl.ofs);
621 if (ctl.page) 683 if (ctl.page)
622 ctl.cache = kmap(ctl.page); 684 ctl.cache = kmap(ctl.page);
623 } 685 }
@@ -633,7 +695,7 @@ end_advance:
633 if (!ino) 695 if (!ino)
634 ino = find_inode_number(dentry, &qname); 696 ino = find_inode_number(dentry, &qname);
635 if (!ino) 697 if (!ino)
636 ino = iunique(inode->i_sb, 2); 698 ino = iunique(dir->i_sb, 2);
637 ctl.filled = filldir(dirent, qname.name, qname.len, 699 ctl.filled = filldir(dirent, qname.name, qname.len,
638 filp->f_pos, ino, DT_UNKNOWN); 700 filp->f_pos, ino, DT_UNKNOWN);
639 if (!ctl.filled) 701 if (!ctl.filled)
@@ -660,6 +722,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
660 (unsigned long) filp->f_pos); 722 (unsigned long) filp->f_pos);
661 723
662 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { 724 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
725 int inval_dentry;
663 726
664 if (ncp_get_volume_info_with_number(server, i, &info) != 0) 727 if (ncp_get_volume_info_with_number(server, i, &info) != 0)
665 return; 728 return;
@@ -675,8 +738,9 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
675 info.volume_name); 738 info.volume_name);
676 continue; 739 continue;
677 } 740 }
741 inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
678 entry.volume = entry.i.volNumber; 742 entry.volume = entry.i.volNumber;
679 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry)) 743 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
680 return; 744 return;
681 } 745 }
682} 746}
@@ -739,7 +803,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
739 rpl += onerpl; 803 rpl += onerpl;
740 rpls -= onerpl; 804 rpls -= onerpl;
741 entry.volume = entry.i.volNumber; 805 entry.volume = entry.i.volNumber;
742 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry)) 806 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
743 break; 807 break;
744 } 808 }
745 } while (more); 809 } while (more);
@@ -775,17 +839,19 @@ int ncp_conn_logged_in(struct super_block *sb)
775 if (dent) { 839 if (dent) {
776 struct inode* ino = dent->d_inode; 840 struct inode* ino = dent->d_inode;
777 if (ino) { 841 if (ino) {
842 ncp_update_known_namespace(server, volNumber, NULL);
778 NCP_FINFO(ino)->volNumber = volNumber; 843 NCP_FINFO(ino)->volNumber = volNumber;
779 NCP_FINFO(ino)->dirEntNum = dirEntNum; 844 NCP_FINFO(ino)->dirEntNum = dirEntNum;
780 NCP_FINFO(ino)->DosDirNum = DosDirNum; 845 NCP_FINFO(ino)->DosDirNum = DosDirNum;
846 result = 0;
781 } else { 847 } else {
782 DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n"); 848 DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
783 } 849 }
784 } else { 850 } else {
785 DPRINTK("ncpfs: sb->s_root == NULL!\n"); 851 DPRINTK("ncpfs: sb->s_root == NULL!\n");
786 } 852 }
787 } 853 } else
788 result = 0; 854 result = 0;
789 855
790out: 856out:
791 return result; 857 return result;
@@ -799,7 +865,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
799 int error, res, len; 865 int error, res, len;
800 __u8 __name[NCP_MAXPATHLEN + 1]; 866 __u8 __name[NCP_MAXPATHLEN + 1];
801 867
802 lock_kernel();
803 error = -EIO; 868 error = -EIO;
804 if (!ncp_conn_valid(server)) 869 if (!ncp_conn_valid(server))
805 goto finished; 870 goto finished;
@@ -813,6 +878,8 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
813 dentry->d_name.len, 1); 878 dentry->d_name.len, 1);
814 if (!res) 879 if (!res)
815 res = ncp_lookup_volume(server, __name, &(finfo.i)); 880 res = ncp_lookup_volume(server, __name, &(finfo.i));
881 if (!res)
882 ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
816 } else { 883 } else {
817 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 884 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
818 dentry->d_name.len, !ncp_preserve_case(dir)); 885 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -846,7 +913,6 @@ add_entry:
846 913
847finished: 914finished:
848 PPRINTK("ncp_lookup: result=%d\n", error); 915 PPRINTK("ncp_lookup: result=%d\n", error);
849 unlock_kernel();
850 return ERR_PTR(error); 916 return ERR_PTR(error);
851} 917}
852 918
@@ -887,11 +953,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
887 PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n", 953 PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n",
888 dentry->d_parent->d_name.name, dentry->d_name.name, mode); 954 dentry->d_parent->d_name.name, dentry->d_name.name, mode);
889 955
890 error = -EIO;
891 lock_kernel();
892 if (!ncp_conn_valid(server))
893 goto out;
894
895 ncp_age_dentry(server, dentry); 956 ncp_age_dentry(server, dentry);
896 len = sizeof(__name); 957 len = sizeof(__name);
897 error = ncp_io2vol(server, __name, &len, dentry->d_name.name, 958 error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -917,6 +978,8 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
917 if (result) { 978 if (result) {
918 if (result == 0x87) 979 if (result == 0x87)
919 error = -ENAMETOOLONG; 980 error = -ENAMETOOLONG;
981 else if (result < 0)
982 error = result;
920 DPRINTK("ncp_create: %s/%s failed\n", 983 DPRINTK("ncp_create: %s/%s failed\n",
921 dentry->d_parent->d_name.name, dentry->d_name.name); 984 dentry->d_parent->d_name.name, dentry->d_name.name);
922 goto out; 985 goto out;
@@ -935,7 +998,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
935 998
936 error = ncp_instantiate(dir, dentry, &finfo); 999 error = ncp_instantiate(dir, dentry, &finfo);
937out: 1000out:
938 unlock_kernel();
939 return error; 1001 return error;
940} 1002}
941 1003
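
[annotation] The new `error = result < 0 ? result : -EACCES` pattern in this and the following hunks reflects the return convention of the ncp_* request helpers: negative values are local errnos (dead connection, fault), zero is success, and positive values are NetWare completion codes from the server (0x87 is "name too long", as handled above). A hypothetical helper capturing the mapping these hunks now perform inline:

    static int ncp_map_error(int result)
    {
        if (result == 0)
            return 0;
        if (result < 0)
            return result;          /* already a local errno */
        if (result == 0x87)
            return -ENAMETOOLONG;   /* NetWare: name too long */
        return -EACCES;             /* generic server-side refusal */
    }
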
@@ -955,11 +1017,6 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
955 DPRINTK("ncp_mkdir: making %s/%s\n", 1017 DPRINTK("ncp_mkdir: making %s/%s\n",
956 dentry->d_parent->d_name.name, dentry->d_name.name); 1018 dentry->d_parent->d_name.name, dentry->d_name.name);
957 1019
958 error = -EIO;
959 lock_kernel();
960 if (!ncp_conn_valid(server))
961 goto out;
962
963 ncp_age_dentry(server, dentry); 1020 ncp_age_dentry(server, dentry);
964 len = sizeof(__name); 1021 len = sizeof(__name);
965 error = ncp_io2vol(server, __name, &len, dentry->d_name.name, 1022 error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -967,12 +1024,11 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
967 if (error) 1024 if (error)
968 goto out; 1025 goto out;
969 1026
970 error = -EACCES; 1027 error = ncp_open_create_file_or_subdir(server, dir, __name,
971 if (ncp_open_create_file_or_subdir(server, dir, __name,
972 OC_MODE_CREATE, aDIR, 1028 OC_MODE_CREATE, aDIR,
973 cpu_to_le16(0xffff), 1029 cpu_to_le16(0xffff),
974 &finfo) == 0) 1030 &finfo);
975 { 1031 if (error == 0) {
976 if (ncp_is_nfs_extras(server, finfo.volume)) { 1032 if (ncp_is_nfs_extras(server, finfo.volume)) {
977 mode |= S_IFDIR; 1033 mode |= S_IFDIR;
978 finfo.i.nfs.mode = mode; 1034 finfo.i.nfs.mode = mode;
@@ -983,9 +1039,10 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
983 goto out; 1039 goto out;
984 } 1040 }
985 error = ncp_instantiate(dir, dentry, &finfo); 1041 error = ncp_instantiate(dir, dentry, &finfo);
1042 } else if (error > 0) {
1043 error = -EACCES;
986 } 1044 }
987out: 1045out:
988 unlock_kernel();
989 return error; 1046 return error;
990} 1047}
991 1048
@@ -998,11 +1055,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
998 DPRINTK("ncp_rmdir: removing %s/%s\n", 1055 DPRINTK("ncp_rmdir: removing %s/%s\n",
999 dentry->d_parent->d_name.name, dentry->d_name.name); 1056 dentry->d_parent->d_name.name, dentry->d_name.name);
1000 1057
1001 error = -EIO;
1002 lock_kernel();
1003 if (!ncp_conn_valid(server))
1004 goto out;
1005
1006 error = -EBUSY; 1058 error = -EBUSY;
1007 if (!d_unhashed(dentry)) 1059 if (!d_unhashed(dentry))
1008 goto out; 1060 goto out;
@@ -1036,11 +1088,10 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
1036 error = -ENOENT; 1088 error = -ENOENT;
1037 break; 1089 break;
1038 default: 1090 default:
1039 error = -EACCES; 1091 error = result < 0 ? result : -EACCES;
1040 break; 1092 break;
1041 } 1093 }
1042out: 1094out:
1043 unlock_kernel();
1044 return error; 1095 return error;
1045} 1096}
1046 1097
@@ -1050,15 +1101,10 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
1050 struct ncp_server *server; 1101 struct ncp_server *server;
1051 int error; 1102 int error;
1052 1103
1053 lock_kernel();
1054 server = NCP_SERVER(dir); 1104 server = NCP_SERVER(dir);
1055 DPRINTK("ncp_unlink: unlinking %s/%s\n", 1105 DPRINTK("ncp_unlink: unlinking %s/%s\n",
1056 dentry->d_parent->d_name.name, dentry->d_name.name); 1106 dentry->d_parent->d_name.name, dentry->d_name.name);
1057 1107
1058 error = -EIO;
1059 if (!ncp_conn_valid(server))
1060 goto out;
1061
1062 /* 1108 /*
1063 * Check whether to close the file ... 1109 * Check whether to close the file ...
1064 */ 1110 */
@@ -1097,12 +1143,9 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
1097 error = -ENOENT; 1143 error = -ENOENT;
1098 break; 1144 break;
1099 default: 1145 default:
1100 error = -EACCES; 1146 error = error < 0 ? error : -EACCES;
1101 break; 1147 break;
1102 } 1148 }
1103
1104out:
1105 unlock_kernel();
1106 return error; 1149 return error;
1107} 1150}
1108 1151
@@ -1118,11 +1161,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1118 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1161 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1119 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1162 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1120 1163
1121 error = -EIO;
1122 lock_kernel();
1123 if (!ncp_conn_valid(server))
1124 goto out;
1125
1126 ncp_age_dentry(server, old_dentry); 1164 ncp_age_dentry(server, old_dentry);
1127 ncp_age_dentry(server, new_dentry); 1165 ncp_age_dentry(server, new_dentry);
1128 1166
@@ -1161,11 +1199,10 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1161 error = -ENOENT; 1199 error = -ENOENT;
1162 break; 1200 break;
1163 default: 1201 default:
1164 error = -EACCES; 1202 error = error < 0 ? error : -EACCES;
1165 break; 1203 break;
1166 } 1204 }
1167out: 1205out:
1168 unlock_kernel();
1169 return error; 1206 return error;
1170} 1207}
1171 1208
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3639cc5cbda..6c754f70c52 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -113,9 +113,6 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
113 DPRINTK("ncp_file_read: enter %s/%s\n", 113 DPRINTK("ncp_file_read: enter %s/%s\n",
114 dentry->d_parent->d_name.name, dentry->d_name.name); 114 dentry->d_parent->d_name.name, dentry->d_name.name);
115 115
116 if (!ncp_conn_valid(NCP_SERVER(inode)))
117 return -EIO;
118
119 pos = *ppos; 116 pos = *ppos;
120 117
121 if ((ssize_t) count < 0) { 118 if ((ssize_t) count < 0) {
@@ -192,13 +189,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
192 189
193 DPRINTK("ncp_file_write: enter %s/%s\n", 190 DPRINTK("ncp_file_write: enter %s/%s\n",
194 dentry->d_parent->d_name.name, dentry->d_name.name); 191 dentry->d_parent->d_name.name, dentry->d_name.name);
195 if (!ncp_conn_valid(NCP_SERVER(inode)))
196 return -EIO;
197 if ((ssize_t) count < 0) 192 if ((ssize_t) count < 0)
198 return -EINVAL; 193 return -EINVAL;
199 pos = *ppos; 194 pos = *ppos;
200 if (file->f_flags & O_APPEND) { 195 if (file->f_flags & O_APPEND) {
201 pos = inode->i_size; 196 pos = i_size_read(inode);
202 } 197 }
203 198
204 if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { 199 if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
@@ -264,8 +259,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
264 259
265 *ppos = pos; 260 *ppos = pos;
266 261
267 if (pos > inode->i_size) { 262 if (pos > i_size_read(inode)) {
268 inode->i_size = pos; 263 mutex_lock(&inode->i_mutex);
264 if (pos > i_size_read(inode))
265 i_size_write(inode, pos);
266 mutex_unlock(&inode->i_mutex);
269 } 267 }
270 DPRINTK("ncp_file_write: exit %s/%s\n", 268 DPRINTK("ncp_file_write: exit %s/%s\n",
271 dentry->d_parent->d_name.name, dentry->d_name.name); 269 dentry->d_parent->d_name.name, dentry->d_name.name);
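
[annotation] The direct i_size accesses become i_size_read()/i_size_write() because, on 32-bit SMP kernels, i_size is a 64-bit value protected by a seqcount: readers may peek locklessly, but writers must be serialized, here by i_mutex. The check/lock/recheck idiom keeps the common path (no file extension) lock-free; as a sketch:

    if (pos > i_size_read(inode)) {       /* cheap lockless peek */
        mutex_lock(&inode->i_mutex);
        if (pos > i_size_read(inode))     /* recheck under the lock */
            i_size_write(inode, pos);     /* writer side of the seqcount */
        mutex_unlock(&inode->i_mutex);
    }
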
@@ -281,18 +279,9 @@ static int ncp_release(struct inode *inode, struct file *file) {
281 return 0; 279 return 0;
282} 280}
283 281
284static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
285{
286 loff_t ret;
287 lock_kernel();
288 ret = generic_file_llseek_unlocked(file, offset, origin);
289 unlock_kernel();
290 return ret;
291}
292
293const struct file_operations ncp_file_operations = 282const struct file_operations ncp_file_operations =
294{ 283{
295 .llseek = ncp_remote_llseek, 284 .llseek = generic_file_llseek,
296 .read = ncp_file_read, 285 .read = ncp_file_read,
297 .write = ncp_file_write, 286 .write = ncp_file_write,
298 .unlocked_ioctl = ncp_ioctl, 287 .unlocked_ioctl = ncp_ioctl,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index fa338515402..d290545aa0c 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -43,7 +43,7 @@
43#define NCP_DEFAULT_TIME_OUT 10 43#define NCP_DEFAULT_TIME_OUT 10
44#define NCP_DEFAULT_RETRY_COUNT 20 44#define NCP_DEFAULT_RETRY_COUNT 20
45 45
46static void ncp_delete_inode(struct inode *); 46static void ncp_evict_inode(struct inode *);
47static void ncp_put_super(struct super_block *); 47static void ncp_put_super(struct super_block *);
48static int ncp_statfs(struct dentry *, struct kstatfs *); 48static int ncp_statfs(struct dentry *, struct kstatfs *);
49static int ncp_show_options(struct seq_file *, struct vfsmount *); 49static int ncp_show_options(struct seq_file *, struct vfsmount *);
@@ -100,7 +100,7 @@ static const struct super_operations ncp_sops =
100 .alloc_inode = ncp_alloc_inode, 100 .alloc_inode = ncp_alloc_inode,
101 .destroy_inode = ncp_destroy_inode, 101 .destroy_inode = ncp_destroy_inode,
102 .drop_inode = generic_delete_inode, 102 .drop_inode = generic_delete_inode,
103 .delete_inode = ncp_delete_inode, 103 .evict_inode = ncp_evict_inode,
104 .put_super = ncp_put_super, 104 .put_super = ncp_put_super,
105 .statfs = ncp_statfs, 105 .statfs = ncp_statfs,
106 .remount_fs = ncp_remount, 106 .remount_fs = ncp_remount,
@@ -139,7 +139,7 @@ static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
139 inode->i_mode = nwi->nfs.mode; 139 inode->i_mode = nwi->nfs.mode;
140 } 140 }
141 141
142 inode->i_blocks = (inode->i_size + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT; 142 inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
143 143
144 inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate); 144 inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate);
145 inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate); 145 inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate);
@@ -158,18 +158,21 @@ static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo)
158 inode->i_mode = server->m.dir_mode; 158 inode->i_mode = server->m.dir_mode;
159 /* for directories dataStreamSize seems to be some 159 /* for directories dataStreamSize seems to be some
160 Object ID ??? */ 160 Object ID ??? */
161 inode->i_size = NCP_BLOCK_SIZE; 161 i_size_write(inode, NCP_BLOCK_SIZE);
162 } else { 162 } else {
163 u32 size;
164
163 inode->i_mode = server->m.file_mode; 165 inode->i_mode = server->m.file_mode;
164 inode->i_size = le32_to_cpu(nwi->dataStreamSize); 166 size = le32_to_cpu(nwi->dataStreamSize);
167 i_size_write(inode, size);
165#ifdef CONFIG_NCPFS_EXTRAS 168#ifdef CONFIG_NCPFS_EXTRAS
166 if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) 169 if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS))
167 && (nwi->attributes & aSHARED)) { 170 && (nwi->attributes & aSHARED)) {
168 switch (nwi->attributes & (aHIDDEN|aSYSTEM)) { 171 switch (nwi->attributes & (aHIDDEN|aSYSTEM)) {
169 case aHIDDEN: 172 case aHIDDEN:
170 if (server->m.flags & NCP_MOUNT_SYMLINKS) { 173 if (server->m.flags & NCP_MOUNT_SYMLINKS) {
171 if (/* (inode->i_size >= NCP_MIN_SYMLINK_SIZE) 174 if (/* (size >= NCP_MIN_SYMLINK_SIZE)
172 && */ (inode->i_size <= NCP_MAX_SYMLINK_SIZE)) { 175 && */ (size <= NCP_MAX_SYMLINK_SIZE)) {
173 inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK; 176 inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
174 NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK; 177 NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK;
175 break; 178 break;
@@ -208,7 +211,7 @@ void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo)
208} 211}
209 212
210/* 213/*
211 * Fill in the inode based on the ncp_entry_info structure. 214 * Fill in the inode based on the ncp_entry_info structure. Used only for brand new inodes.
212 */ 215 */
213static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) 216static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
214{ 217{
@@ -254,6 +257,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
254 if (inode) { 257 if (inode) {
255 atomic_set(&NCP_FINFO(inode)->opened, info->opened); 258 atomic_set(&NCP_FINFO(inode)->opened, info->opened);
256 259
260 inode->i_mapping->backing_dev_info = sb->s_bdi;
257 inode->i_ino = info->ino; 261 inode->i_ino = info->ino;
258 ncp_set_attr(inode, info); 262 ncp_set_attr(inode, info);
259 if (S_ISREG(inode->i_mode)) { 263 if (S_ISREG(inode->i_mode)) {
@@ -282,27 +286,29 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
282} 286}
283 287
284static void 288static void
285ncp_delete_inode(struct inode *inode) 289ncp_evict_inode(struct inode *inode)
286{ 290{
287 truncate_inode_pages(&inode->i_data, 0); 291 truncate_inode_pages(&inode->i_data, 0);
292 end_writeback(inode);
288 293
289 if (S_ISDIR(inode->i_mode)) { 294 if (S_ISDIR(inode->i_mode)) {
290 DDPRINTK("ncp_delete_inode: put directory %ld\n", inode->i_ino); 295 DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino);
291 } 296 }
292 297
293 if (ncp_make_closed(inode) != 0) { 298 if (ncp_make_closed(inode) != 0) {
294 /* We can't do anything but complain. */ 299 /* We can't do anything but complain. */
295 printk(KERN_ERR "ncp_delete_inode: could not close\n"); 300 printk(KERN_ERR "ncp_evict_inode: could not close\n");
296 } 301 }
297 clear_inode(inode);
298} 302}
299 303
300static void ncp_stop_tasks(struct ncp_server *server) { 304static void ncp_stop_tasks(struct ncp_server *server) {
301 struct sock* sk = server->ncp_sock->sk; 305 struct sock* sk = server->ncp_sock->sk;
302 306
307 lock_sock(sk);
303 sk->sk_error_report = server->error_report; 308 sk->sk_error_report = server->error_report;
304 sk->sk_data_ready = server->data_ready; 309 sk->sk_data_ready = server->data_ready;
305 sk->sk_write_space = server->write_space; 310 sk->sk_write_space = server->write_space;
311 release_sock(sk);
306 del_timer_sync(&server->timeout_tm); 312 del_timer_sync(&server->timeout_tm);
307 flush_scheduled_work(); 313 flush_scheduled_work();
308} 314}
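
[annotation] ->delete_inode becomes ->evict_inode, following the 2.6.36 VFS change: the method is now responsible for truncating the page cache and calling end_writeback() (which replaces the old clear_inode() call) before releasing its private state. The minimal shape of such a method, assuming no extra writeback bookkeeping (names hypothetical):

    static void example_evict_inode(struct inode *inode)
    {
        truncate_inode_pages(&inode->i_data, 0); /* drop cached pages */
        end_writeback(inode);                    /* replaces clear_inode() */
        /* ... release filesystem-private resources, as ncp_make_closed()
         * does above ... */
    }
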
@@ -565,10 +571,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
565/* server->conn_status = 0; */ 571/* server->conn_status = 0; */
566/* server->root_dentry = NULL; */ 572/* server->root_dentry = NULL; */
567/* server->root_setuped = 0; */ 573/* server->root_setuped = 0; */
574 mutex_init(&server->root_setup_lock);
568#ifdef CONFIG_NCPFS_PACKET_SIGNING 575#ifdef CONFIG_NCPFS_PACKET_SIGNING
569/* server->sign_wanted = 0; */ 576/* server->sign_wanted = 0; */
570/* server->sign_active = 0; */ 577/* server->sign_active = 0; */
571#endif 578#endif
579 init_rwsem(&server->auth_rwsem);
572 server->auth.auth_type = NCP_AUTH_NONE; 580 server->auth.auth_type = NCP_AUTH_NONE;
573/* server->auth.object_name_len = 0; */ 581/* server->auth.object_name_len = 0; */
574/* server->auth.object_name = NULL; */ 582/* server->auth.object_name = NULL; */
@@ -593,16 +601,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
593 server->nls_io = load_nls_default(); 601 server->nls_io = load_nls_default();
594#endif /* CONFIG_NCPFS_NLS */ 602#endif /* CONFIG_NCPFS_NLS */
595 603
596 server->dentry_ttl = 0; /* no caching */ 604 atomic_set(&server->dentry_ttl, 0); /* no caching */
597 605
598 INIT_LIST_HEAD(&server->tx.requests); 606 INIT_LIST_HEAD(&server->tx.requests);
599 mutex_init(&server->rcv.creq_mutex); 607 mutex_init(&server->rcv.creq_mutex);
600 server->tx.creq = NULL; 608 server->tx.creq = NULL;
601 server->rcv.creq = NULL; 609 server->rcv.creq = NULL;
602 server->data_ready = sock->sk->sk_data_ready;
603 server->write_space = sock->sk->sk_write_space;
604 server->error_report = sock->sk->sk_error_report;
605 sock->sk->sk_user_data = server;
606 610
607 init_timer(&server->timeout_tm); 611 init_timer(&server->timeout_tm);
608#undef NCP_PACKET_SIZE 612#undef NCP_PACKET_SIZE
@@ -619,6 +623,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
619 if (server->rxbuf == NULL) 623 if (server->rxbuf == NULL)
620 goto out_txbuf; 624 goto out_txbuf;
621 625
626 lock_sock(sock->sk);
627 server->data_ready = sock->sk->sk_data_ready;
628 server->write_space = sock->sk->sk_write_space;
629 server->error_report = sock->sk->sk_error_report;
630 sock->sk->sk_user_data = server;
622 sock->sk->sk_data_ready = ncp_tcp_data_ready; 631 sock->sk->sk_data_ready = ncp_tcp_data_ready;
623 sock->sk->sk_error_report = ncp_tcp_error_report; 632 sock->sk->sk_error_report = ncp_tcp_error_report;
624 if (sock->type == SOCK_STREAM) { 633 if (sock->type == SOCK_STREAM) {
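
[annotation] Saving and replacing the socket callbacks now happens under lock_sock(), with the symmetric restore in ncp_stop_tasks() above, so the swap cannot race with the socket's own users of sk_data_ready and friends. The take-over pattern, condensed from the lines above:

    lock_sock(sk);
    server->data_ready   = sk->sk_data_ready;    /* save the originals... */
    server->write_space  = sk->sk_write_space;
    server->error_report = sk->sk_error_report;
    sk->sk_user_data     = server;               /* ...then install ours */
    sk->sk_data_ready    = ncp_tcp_data_ready;
    sk->sk_error_report  = ncp_tcp_error_report;
    release_sock(sk);
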
@@ -634,6 +643,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
634 server->timeout_tm.data = (unsigned long)server; 643 server->timeout_tm.data = (unsigned long)server;
635 server->timeout_tm.function = ncpdgram_timeout_call; 644 server->timeout_tm.function = ncpdgram_timeout_call;
636 } 645 }
646 release_sock(sock->sk);
637 647
638 ncp_lock_server(server); 648 ncp_lock_server(server);
639 error = ncp_connect(server); 649 error = ncp_connect(server);
@@ -658,8 +668,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
658 goto out_disconnect; 668 goto out_disconnect;
659 } 669 }
660 } 670 }
671 ncp_lock_server(server);
661 if (options & 2) 672 if (options & 2)
662 server->sign_wanted = 1; 673 server->sign_wanted = 1;
674 ncp_unlock_server(server);
663 } 675 }
664 else 676 else
665#endif /* CONFIG_NCPFS_PACKET_SIGNING */ 677#endif /* CONFIG_NCPFS_PACKET_SIGNING */
@@ -720,6 +732,9 @@ out_nls:
720 unload_nls(server->nls_io); 732 unload_nls(server->nls_io);
721 unload_nls(server->nls_vol); 733 unload_nls(server->nls_vol);
722#endif 734#endif
735 mutex_destroy(&server->rcv.creq_mutex);
736 mutex_destroy(&server->root_setup_lock);
737 mutex_destroy(&server->mutex);
723out_fput2: 738out_fput2:
724 if (server->info_filp) 739 if (server->info_filp)
725 fput(server->info_filp); 740 fput(server->info_filp);
@@ -728,8 +743,8 @@ out_fput:
728out_bdi: 743out_bdi:
729 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: 744 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
730 * 745 *
731 * The previously used put_filp(ncp_filp); was bogous, since 746 * The previously used put_filp(ncp_filp); was bogus, since
732 * it doesn't proper unlocking. 747 * it doesn't perform proper unlocking.
733 */ 748 */
734 fput(ncp_filp); 749 fput(ncp_filp);
735out: 750out:
@@ -743,8 +758,6 @@ static void ncp_put_super(struct super_block *sb)
743{ 758{
744 struct ncp_server *server = NCP_SBP(sb); 759 struct ncp_server *server = NCP_SBP(sb);
745 760
746 lock_kernel();
747
748 ncp_lock_server(server); 761 ncp_lock_server(server);
749 ncp_disconnect(server); 762 ncp_disconnect(server);
750 ncp_unlock_server(server); 763 ncp_unlock_server(server);
@@ -756,6 +769,9 @@ static void ncp_put_super(struct super_block *sb)
756 unload_nls(server->nls_vol); 769 unload_nls(server->nls_vol);
757 unload_nls(server->nls_io); 770 unload_nls(server->nls_io);
758#endif /* CONFIG_NCPFS_NLS */ 771#endif /* CONFIG_NCPFS_NLS */
772 mutex_destroy(&server->rcv.creq_mutex);
773 mutex_destroy(&server->root_setup_lock);
774 mutex_destroy(&server->mutex);
759 775
760 if (server->info_filp) 776 if (server->info_filp)
761 fput(server->info_filp); 777 fput(server->info_filp);
@@ -771,8 +787,6 @@ static void ncp_put_super(struct super_block *sb)
771 vfree(server->packet); 787 vfree(server->packet);
772 sb->s_fs_info = NULL; 788 sb->s_fs_info = NULL;
773 kfree(server); 789 kfree(server);
774
775 unlock_kernel();
776} 790}
777 791
778static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) 792static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -851,10 +865,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
851 865
852 result = -EIO; 866 result = -EIO;
853 867
854 lock_kernel();
855
856 server = NCP_SERVER(inode); 868 server = NCP_SERVER(inode);
 857 if ((!server) || !ncp_conn_valid(server)) 869 if (!server) /* How could this happen? */
858 goto out; 870 goto out;
859 871
860 /* ageing the dentry to force validation */ 872 /* ageing the dentry to force validation */
@@ -924,9 +936,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
924 tmpattr.ia_valid = ATTR_MODE; 936 tmpattr.ia_valid = ATTR_MODE;
925 tmpattr.ia_mode = attr->ia_mode; 937 tmpattr.ia_mode = attr->ia_mode;
926 938
927 result = inode_setattr(inode, &tmpattr); 939 setattr_copy(inode, &tmpattr);
928 if (result) 940 mark_inode_dirty(inode);
929 goto out;
930 } 941 }
931 } 942 }
932#endif 943#endif
@@ -954,15 +965,12 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
954 result = ncp_make_closed(inode); 965 result = ncp_make_closed(inode);
955 if (result) 966 if (result)
956 goto out; 967 goto out;
957 { 968
958 struct iattr tmpattr; 969 if (attr->ia_size != i_size_read(inode)) {
959 970 result = vmtruncate(inode, attr->ia_size);
960 tmpattr.ia_valid = ATTR_SIZE;
961 tmpattr.ia_size = attr->ia_size;
962
963 result = inode_setattr(inode, &tmpattr);
964 if (result) 971 if (result)
965 goto out; 972 goto out;
973 mark_inode_dirty(inode);
966 } 974 }
967 } 975 }
968 if ((attr->ia_valid & ATTR_CTIME) != 0) { 976 if ((attr->ia_valid & ATTR_CTIME) != 0) {
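
[annotation] These hunks track the removal of inode_setattr() from the VFS: its two jobs are now open-coded, with vmtruncate() handling the size change and setattr_copy() plus mark_inode_dirty() handling the remaining attributes. In the 2.6.36 idiom the sequence is roughly:

    if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) {
        result = vmtruncate(inode, attr->ia_size); /* adjust pages + i_size */
        if (result)
            goto out;
        mark_inode_dirty(inode);
    }
    setattr_copy(inode, attr);  /* copy uid/gid/mode/times into the inode */
    mark_inode_dirty(inode);
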
@@ -985,8 +993,6 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
985 result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode), 993 result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode),
986 inode, info_mask, &info); 994 inode, info_mask, &info);
987 if (result != 0) { 995 if (result != 0) {
988 result = -EACCES;
989
990 if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) { 996 if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) {
991 /* NetWare seems not to allow this. I 997 /* NetWare seems not to allow this. I
992 do not know why. So, just tell the 998 do not know why. So, just tell the
@@ -1002,23 +1008,28 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
1002 NCP_FINFO(inode)->nwattr = info.attributes; 1008 NCP_FINFO(inode)->nwattr = info.attributes;
1003#endif 1009#endif
1004 } 1010 }
1005 if (!result) 1011 if (result)
1006 result = inode_setattr(inode, attr); 1012 goto out;
1013
1014 setattr_copy(inode, attr);
1015 mark_inode_dirty(inode);
1016
1007out: 1017out:
1008 unlock_kernel(); 1018 if (result > 0)
1019 result = -EACCES;
1009 return result; 1020 return result;
1010} 1021}
1011 1022
1012static int ncp_get_sb(struct file_system_type *fs_type, 1023static struct dentry *ncp_mount(struct file_system_type *fs_type,
1013 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1024 int flags, const char *dev_name, void *data)
1014{ 1025{
1015 return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt); 1026 return mount_nodev(fs_type, flags, data, ncp_fill_super);
1016} 1027}
1017 1028
1018static struct file_system_type ncp_fs_type = { 1029static struct file_system_type ncp_fs_type = {
1019 .owner = THIS_MODULE, 1030 .owner = THIS_MODULE,
1020 .name = "ncpfs", 1031 .name = "ncpfs",
1021 .get_sb = ncp_get_sb, 1032 .mount = ncp_mount,
1022 .kill_sb = kill_anon_super, 1033 .kill_sb = kill_anon_super,
1023 .fs_flags = FS_BINARY_MOUNTDATA, 1034 .fs_flags = FS_BINARY_MOUNTDATA,
1024}; 1035};
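
[annotation] This hunk converts ncpfs to the new mount API: ->get_sb, which filled in a caller-provided vfsmount, is replaced by ->mount, which returns the root dentry (or an ERR_PTR). For a filesystem with no backing device the conversion is mechanical; a sketch with hypothetical names:

    static struct dentry *example_mount(struct file_system_type *fs_type,
                                        int flags, const char *dev_name,
                                        void *data)
    {
        return mount_nodev(fs_type, flags, data, example_fill_super);
    }

    static struct file_system_type example_fs_type = {
        .owner   = THIS_MODULE,
        .name    = "example",
        .mount   = example_mount,
        .kill_sb = kill_anon_super,
    };
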
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 023c03d0207..c2a1f9a155c 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,7 +20,6 @@
20#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/smp_lock.h>
24 23
25#include <linux/ncp_fs.h> 24#include <linux/ncp_fs.h>
26 25
@@ -36,16 +35,11 @@
36#define NCP_PACKET_SIZE_INTERNAL 65536 35#define NCP_PACKET_SIZE_INTERNAL 65536
37 36
38static int 37static int
39ncp_get_fs_info(struct ncp_server * server, struct file *file, 38ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
40 struct ncp_fs_info __user *arg) 39 struct ncp_fs_info __user *arg)
41{ 40{
42 struct inode *inode = file->f_path.dentry->d_inode;
43 struct ncp_fs_info info; 41 struct ncp_fs_info info;
44 42
45 if (file_permission(file, MAY_WRITE) != 0
46 && current_uid() != server->m.mounted_uid)
47 return -EACCES;
48
49 if (copy_from_user(&info, arg, sizeof(info))) 43 if (copy_from_user(&info, arg, sizeof(info)))
50 return -EFAULT; 44 return -EFAULT;
51 45
@@ -66,16 +60,11 @@ ncp_get_fs_info(struct ncp_server * server, struct file *file,
66} 60}
67 61
68static int 62static int
69ncp_get_fs_info_v2(struct ncp_server * server, struct file *file, 63ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
70 struct ncp_fs_info_v2 __user * arg) 64 struct ncp_fs_info_v2 __user * arg)
71{ 65{
72 struct inode *inode = file->f_path.dentry->d_inode;
73 struct ncp_fs_info_v2 info2; 66 struct ncp_fs_info_v2 info2;
74 67
75 if (file_permission(file, MAY_WRITE) != 0
76 && current_uid() != server->m.mounted_uid)
77 return -EACCES;
78
79 if (copy_from_user(&info2, arg, sizeof(info2))) 68 if (copy_from_user(&info2, arg, sizeof(info2)))
80 return -EFAULT; 69 return -EFAULT;
81 70
@@ -137,16 +126,11 @@ struct compat_ncp_privatedata_ioctl
137#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl) 126#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl)
138 127
139static int 128static int
140ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file, 129ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
141 struct compat_ncp_fs_info_v2 __user * arg) 130 struct compat_ncp_fs_info_v2 __user * arg)
142{ 131{
143 struct inode *inode = file->f_path.dentry->d_inode;
144 struct compat_ncp_fs_info_v2 info2; 132 struct compat_ncp_fs_info_v2 info2;
145 133
146 if (file_permission(file, MAY_WRITE) != 0
147 && current_uid() != server->m.mounted_uid)
148 return -EACCES;
149
150 if (copy_from_user(&info2, arg, sizeof(info2))) 134 if (copy_from_user(&info2, arg, sizeof(info2)))
151 return -EFAULT; 135 return -EFAULT;
152 136
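
[annotation] Each of these handlers loses its own file_permission()/mounted_uid test; the per-command permission checks are presumably consolidated at the single entry point (the ncp_ioctl() wrapper, outside this excerpt), which is also why the handlers can take a bare inode instead of a file. A hypothetical shape of such a centralized check:

    static long ncp_ioctl(struct file *filp, unsigned int cmd,
                          unsigned long arg)
    {
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct ncp_server *server = NCP_SERVER(inode);

        /* e.g. only writers or the mounting uid may issue requests */
        if (file_permission(filp, MAY_WRITE) != 0 &&
            current_uid() != server->m.mounted_uid)
            return -EACCES;
        return __ncp_ioctl(inode, cmd, arg);
    }
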
@@ -183,11 +167,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
183 struct nls_table *iocharset; 167 struct nls_table *iocharset;
184 struct nls_table *oldset_io; 168 struct nls_table *oldset_io;
185 struct nls_table *oldset_cp; 169 struct nls_table *oldset_cp;
186 170 int utf8;
187 if (!capable(CAP_SYS_ADMIN)) 171 int err;
188 return -EACCES;
189 if (server->root_setuped)
190 return -EBUSY;
191 172
192 if (copy_from_user(&user, arg, sizeof(user))) 173 if (copy_from_user(&user, arg, sizeof(user)))
193 return -EFAULT; 174 return -EFAULT;
@@ -207,28 +188,40 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
207 user.iocharset[NCP_IOCSNAME_LEN] = 0; 188 user.iocharset[NCP_IOCSNAME_LEN] = 0;
208 if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) { 189 if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) {
209 iocharset = load_nls_default(); 190 iocharset = load_nls_default();
210 NCP_CLR_FLAG(server, NCP_FLAG_UTF8); 191 utf8 = 0;
211 } else if (!strcmp(user.iocharset, "utf8")) { 192 } else if (!strcmp(user.iocharset, "utf8")) {
212 iocharset = load_nls_default(); 193 iocharset = load_nls_default();
213 NCP_SET_FLAG(server, NCP_FLAG_UTF8); 194 utf8 = 1;
214 } else { 195 } else {
215 iocharset = load_nls(user.iocharset); 196 iocharset = load_nls(user.iocharset);
216 if (!iocharset) { 197 if (!iocharset) {
217 unload_nls(codepage); 198 unload_nls(codepage);
218 return -EBADRQC; 199 return -EBADRQC;
219 } 200 }
220 NCP_CLR_FLAG(server, NCP_FLAG_UTF8); 201 utf8 = 0;
221 } 202 }
222 203
223 oldset_cp = server->nls_vol; 204 mutex_lock(&server->root_setup_lock);
224 server->nls_vol = codepage; 205 if (server->root_setuped) {
225 oldset_io = server->nls_io; 206 oldset_cp = codepage;
226 server->nls_io = iocharset; 207 oldset_io = iocharset;
227 208 err = -EBUSY;
209 } else {
210 if (utf8)
211 NCP_SET_FLAG(server, NCP_FLAG_UTF8);
212 else
213 NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
214 oldset_cp = server->nls_vol;
215 server->nls_vol = codepage;
216 oldset_io = server->nls_io;
217 server->nls_io = iocharset;
218 err = 0;
219 }
220 mutex_unlock(&server->root_setup_lock);
228 unload_nls(oldset_cp); 221 unload_nls(oldset_cp);
229 unload_nls(oldset_io); 222 unload_nls(oldset_io);
230 223
231 return 0; 224 return err;
232} 225}
233 226
234static int 227static int
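
[annotation] ncp_set_charsets() now publishes the new NLS tables under root_setup_lock and defers unload_nls() on the old tables (or, on -EBUSY, the newly loaded but rejected ones) until after the lock is dropped: once the pointers are swapped, no new user can pick up the old tables, so unloading them outside the lock is safe and keeps the critical section short. The pattern, condensed:

    struct nls_table *old_vol, *old_io;

    mutex_lock(&server->root_setup_lock);
    old_vol = server->nls_vol;  server->nls_vol = codepage;   /* publish */
    old_io  = server->nls_io;   server->nls_io  = iocharset;
    mutex_unlock(&server->root_setup_lock);
    unload_nls(old_vol);        /* safe: no new references possible */
    unload_nls(old_io);
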
@@ -238,6 +231,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
238 int len; 231 int len;
239 232
240 memset(&user, 0, sizeof(user)); 233 memset(&user, 0, sizeof(user));
234 mutex_lock(&server->root_setup_lock);
241 if (server->nls_vol && server->nls_vol->charset) { 235 if (server->nls_vol && server->nls_vol->charset) {
242 len = strlen(server->nls_vol->charset); 236 len = strlen(server->nls_vol->charset);
243 if (len > NCP_IOCSNAME_LEN) 237 if (len > NCP_IOCSNAME_LEN)
@@ -255,6 +249,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
255 strncpy(user.iocharset, server->nls_io->charset, len); 249 strncpy(user.iocharset, server->nls_io->charset, len);
256 user.iocharset[len] = 0; 250 user.iocharset[len] = 0;
257 } 251 }
252 mutex_unlock(&server->root_setup_lock);
258 253
259 if (copy_to_user(arg, &user, sizeof(user))) 254 if (copy_to_user(arg, &user, sizeof(user)))
260 return -EFAULT; 255 return -EFAULT;
@@ -262,25 +257,19 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
262} 257}
263#endif /* CONFIG_NCPFS_NLS */ 258#endif /* CONFIG_NCPFS_NLS */
264 259
265static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 260static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg)
266{ 261{
267 struct inode *inode = filp->f_dentry->d_inode;
268 struct ncp_server *server = NCP_SERVER(inode); 262 struct ncp_server *server = NCP_SERVER(inode);
269 int result; 263 int result;
270 struct ncp_ioctl_request request; 264 struct ncp_ioctl_request request;
271 char* bouncebuffer; 265 char* bouncebuffer;
272 void __user *argp = (void __user *)arg; 266 void __user *argp = (void __user *)arg;
273 uid_t uid = current_uid();
274 267
275 switch (cmd) { 268 switch (cmd) {
276#ifdef CONFIG_COMPAT 269#ifdef CONFIG_COMPAT
277 case NCP_IOC_NCPREQUEST_32: 270 case NCP_IOC_NCPREQUEST_32:
278#endif 271#endif
279 case NCP_IOC_NCPREQUEST: 272 case NCP_IOC_NCPREQUEST:
280 if (file_permission(filp, MAY_WRITE) != 0
281 && uid != server->m.mounted_uid)
282 return -EACCES;
283
284#ifdef CONFIG_COMPAT 273#ifdef CONFIG_COMPAT
285 if (cmd == NCP_IOC_NCPREQUEST_32) { 274 if (cmd == NCP_IOC_NCPREQUEST_32) {
286 struct compat_ncp_ioctl_request request32; 275 struct compat_ncp_ioctl_request request32;
@@ -315,7 +304,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
315 server->current_size = request.size; 304 server->current_size = request.size;
316 memcpy(server->packet, bouncebuffer, request.size); 305 memcpy(server->packet, bouncebuffer, request.size);
317 306
318 result = ncp_request2(server, request.function, 307 result = ncp_request2(server, request.function,
319 bouncebuffer, NCP_PACKET_SIZE_INTERNAL); 308 bouncebuffer, NCP_PACKET_SIZE_INTERNAL);
320 if (result < 0) 309 if (result < 0)
321 result = -EIO; 310 result = -EIO;
@@ -332,69 +321,69 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
332 321
333 case NCP_IOC_CONN_LOGGED_IN: 322 case NCP_IOC_CONN_LOGGED_IN:
334 323
335 if (!capable(CAP_SYS_ADMIN))
336 return -EACCES;
337 if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE)) 324 if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE))
338 return -EINVAL; 325 return -EINVAL;
326 mutex_lock(&server->root_setup_lock);
339 if (server->root_setuped) 327 if (server->root_setuped)
340 return -EBUSY; 328 result = -EBUSY;
341 server->root_setuped = 1; 329 else {
342 return ncp_conn_logged_in(inode->i_sb); 330 result = ncp_conn_logged_in(inode->i_sb);
331 if (result == 0)
332 server->root_setuped = 1;
333 }
334 mutex_unlock(&server->root_setup_lock);
335 return result;
343 336
344 case NCP_IOC_GET_FS_INFO: 337 case NCP_IOC_GET_FS_INFO:
345 return ncp_get_fs_info(server, filp, argp); 338 return ncp_get_fs_info(server, inode, argp);
346 339
347 case NCP_IOC_GET_FS_INFO_V2: 340 case NCP_IOC_GET_FS_INFO_V2:
348 return ncp_get_fs_info_v2(server, filp, argp); 341 return ncp_get_fs_info_v2(server, inode, argp);
349 342
350#ifdef CONFIG_COMPAT 343#ifdef CONFIG_COMPAT
351 case NCP_IOC_GET_FS_INFO_V2_32: 344 case NCP_IOC_GET_FS_INFO_V2_32:
352 return ncp_get_compat_fs_info_v2(server, filp, argp); 345 return ncp_get_compat_fs_info_v2(server, inode, argp);
353#endif 346#endif
354 /* we have too many combinations of CONFIG_COMPAT, 347 /* we have too many combinations of CONFIG_COMPAT,
355 * CONFIG_64BIT and CONFIG_UID16, so just handle 348 * CONFIG_64BIT and CONFIG_UID16, so just handle
356 * any of the possible ioctls */ 349 * any of the possible ioctls */
357 case NCP_IOC_GETMOUNTUID16: 350 case NCP_IOC_GETMOUNTUID16:
358 case NCP_IOC_GETMOUNTUID32: 351 {
359 case NCP_IOC_GETMOUNTUID64:
360 if (file_permission(filp, MAY_READ) != 0
361 && uid != server->m.mounted_uid)
362 return -EACCES;
363
364 if (cmd == NCP_IOC_GETMOUNTUID16) {
365 u16 uid; 352 u16 uid;
353
366 SET_UID(uid, server->m.mounted_uid); 354 SET_UID(uid, server->m.mounted_uid);
367 if (put_user(uid, (u16 __user *)argp)) 355 if (put_user(uid, (u16 __user *)argp))
368 return -EFAULT; 356 return -EFAULT;
369 } else if (cmd == NCP_IOC_GETMOUNTUID32) { 357 return 0;
370 if (put_user(server->m.mounted_uid,
371 (u32 __user *)argp))
372 return -EFAULT;
373 } else {
374 if (put_user(server->m.mounted_uid,
375 (u64 __user *)argp))
376 return -EFAULT;
377 } 358 }
359 case NCP_IOC_GETMOUNTUID32:
360 if (put_user(server->m.mounted_uid,
361 (u32 __user *)argp))
362 return -EFAULT;
363 return 0;
364 case NCP_IOC_GETMOUNTUID64:
365 if (put_user(server->m.mounted_uid,
366 (u64 __user *)argp))
367 return -EFAULT;
378 return 0; 368 return 0;
379 369
380 case NCP_IOC_GETROOT: 370 case NCP_IOC_GETROOT:
381 { 371 {
382 struct ncp_setroot_ioctl sr; 372 struct ncp_setroot_ioctl sr;
383 373
384 if (file_permission(filp, MAY_READ) != 0 374 result = -EACCES;
385 && uid != server->m.mounted_uid) 375 mutex_lock(&server->root_setup_lock);
386 return -EACCES;
387
388 if (server->m.mounted_vol[0]) { 376 if (server->m.mounted_vol[0]) {
389 struct dentry* dentry = inode->i_sb->s_root; 377 struct dentry* dentry = inode->i_sb->s_root;
390 378
391 if (dentry) { 379 if (dentry) {
392 struct inode* s_inode = dentry->d_inode; 380 struct inode* s_inode = dentry->d_inode;
393 381
394 if (s_inode) { 382 if (s_inode) {
395 sr.volNumber = NCP_FINFO(s_inode)->volNumber; 383 sr.volNumber = NCP_FINFO(s_inode)->volNumber;
396 sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum; 384 sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum;
397 sr.namespace = server->name_space[sr.volNumber]; 385 sr.namespace = server->name_space[sr.volNumber];
386 result = 0;
398 } else 387 } else
399 DPRINTK("ncpfs: s_root->d_inode==NULL\n"); 388 DPRINTK("ncpfs: s_root->d_inode==NULL\n");
400 } else 389 } else
@@ -403,10 +392,12 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
403 sr.volNumber = -1; 392 sr.volNumber = -1;
404 sr.namespace = 0; 393 sr.namespace = 0;
405 sr.dirEntNum = 0; 394 sr.dirEntNum = 0;
395 result = 0;
406 } 396 }
407 if (copy_to_user(argp, &sr, sizeof(sr))) 397 mutex_unlock(&server->root_setup_lock);
408 return -EFAULT; 398 if (!result && copy_to_user(argp, &sr, sizeof(sr)))
409 return 0; 399 result = -EFAULT;
400 return result;
410 } 401 }
411 402
412 case NCP_IOC_SETROOT: 403 case NCP_IOC_SETROOT:
@@ -417,103 +408,114 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
417 __le32 dosde; 408 __le32 dosde;
418 struct dentry* dentry; 409 struct dentry* dentry;
419 410
420 if (!capable(CAP_SYS_ADMIN))
421 {
422 return -EACCES;
423 }
424 if (server->root_setuped) return -EBUSY;
425 if (copy_from_user(&sr, argp, sizeof(sr))) 411 if (copy_from_user(&sr, argp, sizeof(sr)))
426 return -EFAULT; 412 return -EFAULT;
427 if (sr.volNumber < 0) { 413 mutex_lock(&server->root_setup_lock);
428 server->m.mounted_vol[0] = 0; 414 if (server->root_setuped)
429 vnum = NCP_NUMBER_OF_VOLUMES; 415 result = -EBUSY;
430 de = 0; 416 else {
431 dosde = 0; 417 if (sr.volNumber < 0) {
432 } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) { 418 server->m.mounted_vol[0] = 0;
433 return -EINVAL; 419 vnum = NCP_NUMBER_OF_VOLUMES;
434 } else if (ncp_mount_subdir(server, sr.volNumber, 420 de = 0;
435 sr.namespace, sr.dirEntNum, 421 dosde = 0;
436 &vnum, &de, &dosde)) { 422 result = 0;
437 return -ENOENT; 423 } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
438 } 424 result = -EINVAL;
439 425 } else if (ncp_mount_subdir(server, sr.volNumber,
440 dentry = inode->i_sb->s_root; 426 sr.namespace, sr.dirEntNum,
441 server->root_setuped = 1; 427 &vnum, &de, &dosde)) {
442 if (dentry) { 428 result = -ENOENT;
443 struct inode* s_inode = dentry->d_inode;
444
445 if (s_inode) {
446 NCP_FINFO(s_inode)->volNumber = vnum;
447 NCP_FINFO(s_inode)->dirEntNum = de;
448 NCP_FINFO(s_inode)->DosDirNum = dosde;
449 } else 429 } else
450 DPRINTK("ncpfs: s_root->d_inode==NULL\n"); 430 result = 0;
451 } else 431
452 DPRINTK("ncpfs: s_root==NULL\n"); 432 if (result == 0) {
433 dentry = inode->i_sb->s_root;
434 if (dentry) {
435 struct inode* s_inode = dentry->d_inode;
436
437 if (s_inode) {
438 NCP_FINFO(s_inode)->volNumber = vnum;
439 NCP_FINFO(s_inode)->dirEntNum = de;
440 NCP_FINFO(s_inode)->DosDirNum = dosde;
441 server->root_setuped = 1;
442 } else {
443 DPRINTK("ncpfs: s_root->d_inode==NULL\n");
444 result = -EIO;
445 }
446 } else {
447 DPRINTK("ncpfs: s_root==NULL\n");
448 result = -EIO;
449 }
450 }
451 result = 0;
452 }
453 mutex_unlock(&server->root_setup_lock);
453 454
454 return 0; 455 return result;
455 } 456 }
456 457
457#ifdef CONFIG_NCPFS_PACKET_SIGNING 458#ifdef CONFIG_NCPFS_PACKET_SIGNING
458 case NCP_IOC_SIGN_INIT: 459 case NCP_IOC_SIGN_INIT:
459 if (file_permission(filp, MAY_WRITE) != 0 460 {
460 && uid != server->m.mounted_uid) 461 struct ncp_sign_init sign;
461 return -EACCES;
462
463 if (argp) {
464 if (server->sign_wanted)
465 {
466 struct ncp_sign_init sign;
467 462
463 if (argp)
468 if (copy_from_user(&sign, argp, sizeof(sign))) 464 if (copy_from_user(&sign, argp, sizeof(sign)))
469 return -EFAULT; 465 return -EFAULT;
470 memcpy(server->sign_root,sign.sign_root,8); 466 ncp_lock_server(server);
471 memcpy(server->sign_last,sign.sign_last,16); 467 mutex_lock(&server->rcv.creq_mutex);
472 server->sign_active = 1; 468 if (argp) {
469 if (server->sign_wanted) {
470 memcpy(server->sign_root,sign.sign_root,8);
471 memcpy(server->sign_last,sign.sign_last,16);
472 server->sign_active = 1;
473 }
474 /* ignore when signatures not wanted */
475 } else {
476 server->sign_active = 0;
473 } 477 }
474 /* ignore when signatures not wanted */ 478 mutex_unlock(&server->rcv.creq_mutex);
475 } else { 479 ncp_unlock_server(server);
476 server->sign_active = 0; 480 return 0;
477 } 481 }
478 return 0; 482
479
480 case NCP_IOC_SIGN_WANTED: 483 case NCP_IOC_SIGN_WANTED:
481 if (file_permission(filp, MAY_READ) != 0 484 {
482 && uid != server->m.mounted_uid) 485 int state;
483 return -EACCES; 486
484 487 ncp_lock_server(server);
485 if (put_user(server->sign_wanted, (int __user *)argp)) 488 state = server->sign_wanted;
486 return -EFAULT; 489 ncp_unlock_server(server);
487 return 0; 490 if (put_user(state, (int __user *)argp))
491 return -EFAULT;
492 return 0;
493 }
488 494
489 case NCP_IOC_SET_SIGN_WANTED: 495 case NCP_IOC_SET_SIGN_WANTED:
490 { 496 {
491 int newstate; 497 int newstate;
492 498
493 if (file_permission(filp, MAY_WRITE) != 0
494 && uid != server->m.mounted_uid)
495 return -EACCES;
496
497 /* get only low 8 bits... */ 499 /* get only low 8 bits... */
498 if (get_user(newstate, (unsigned char __user *)argp)) 500 if (get_user(newstate, (unsigned char __user *)argp))
499 return -EFAULT; 501 return -EFAULT;
502 result = 0;
503 ncp_lock_server(server);
500 if (server->sign_active) { 504 if (server->sign_active) {
501 /* cannot turn signatures OFF when active */ 505 /* cannot turn signatures OFF when active */
502 if (!newstate) return -EINVAL; 506 if (!newstate)
507 result = -EINVAL;
503 } else { 508 } else {
504 server->sign_wanted = newstate != 0; 509 server->sign_wanted = newstate != 0;
505 } 510 }
506 return 0; 511 ncp_unlock_server(server);
512 return result;
507 } 513 }
508 514
509#endif /* CONFIG_NCPFS_PACKET_SIGNING */ 515#endif /* CONFIG_NCPFS_PACKET_SIGNING */
510 516
511#ifdef CONFIG_NCPFS_IOCTL_LOCKING 517#ifdef CONFIG_NCPFS_IOCTL_LOCKING
512 case NCP_IOC_LOCKUNLOCK: 518 case NCP_IOC_LOCKUNLOCK:
513 if (file_permission(filp, MAY_WRITE) != 0
514 && uid != server->m.mounted_uid)
515 return -EACCES;
516
517 { 519 {
518 struct ncp_lock_ioctl rqdata; 520 struct ncp_lock_ioctl rqdata;
519 521
@@ -542,16 +544,13 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
542 { 544 {
543 return result; 545 return result;
544 } 546 }
545 result = -EIO;
546 if (!ncp_conn_valid(server))
547 goto outrel;
548 result = -EISDIR; 547 result = -EISDIR;
549 if (!S_ISREG(inode->i_mode)) 548 if (!S_ISREG(inode->i_mode))
550 goto outrel; 549 goto outrel;
551 if (rqdata.cmd == NCP_LOCK_CLEAR) 550 if (rqdata.cmd == NCP_LOCK_CLEAR)
552 { 551 {
553 result = ncp_ClearPhysicalRecord(NCP_SERVER(inode), 552 result = ncp_ClearPhysicalRecord(NCP_SERVER(inode),
554 NCP_FINFO(inode)->file_handle, 553 NCP_FINFO(inode)->file_handle,
555 rqdata.offset, 554 rqdata.offset,
556 rqdata.length); 555 rqdata.length);
557 if (result > 0) result = 0; /* no such lock */ 556 if (result > 0) result = 0; /* no such lock */
@@ -574,7 +573,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
574 rqdata.timeout); 573 rqdata.timeout);
575 if (result > 0) result = -EAGAIN; 574 if (result > 0) result = -EAGAIN;
576 } 575 }
577outrel: 576outrel:
578 ncp_inode_close(inode); 577 ncp_inode_close(inode);
579 return result; 578 return result;
580 } 579 }
@@ -582,60 +581,62 @@ outrel:
582 581
583#ifdef CONFIG_COMPAT 582#ifdef CONFIG_COMPAT
584 case NCP_IOC_GETOBJECTNAME_32: 583 case NCP_IOC_GETOBJECTNAME_32:
585 if (uid != server->m.mounted_uid)
586 return -EACCES;
587 { 584 {
588 struct compat_ncp_objectname_ioctl user; 585 struct compat_ncp_objectname_ioctl user;
589 size_t outl; 586 size_t outl;
590 587
591 if (copy_from_user(&user, argp, sizeof(user))) 588 if (copy_from_user(&user, argp, sizeof(user)))
592 return -EFAULT; 589 return -EFAULT;
590 down_read(&server->auth_rwsem);
593 user.auth_type = server->auth.auth_type; 591 user.auth_type = server->auth.auth_type;
594 outl = user.object_name_len; 592 outl = user.object_name_len;
595 user.object_name_len = server->auth.object_name_len; 593 user.object_name_len = server->auth.object_name_len;
596 if (outl > user.object_name_len) 594 if (outl > user.object_name_len)
597 outl = user.object_name_len; 595 outl = user.object_name_len;
596 result = 0;
598 if (outl) { 597 if (outl) {
599 if (copy_to_user(compat_ptr(user.object_name), 598 if (copy_to_user(compat_ptr(user.object_name),
600 server->auth.object_name, 599 server->auth.object_name,
601 outl)) return -EFAULT; 600 outl))
601 result = -EFAULT;
602 } 602 }
603 if (copy_to_user(argp, &user, sizeof(user))) 603 up_read(&server->auth_rwsem);
604 return -EFAULT; 604 if (!result && copy_to_user(argp, &user, sizeof(user)))
605 return 0; 605 result = -EFAULT;
606 return result;
606 } 607 }
607#endif 608#endif
608 609
609 case NCP_IOC_GETOBJECTNAME: 610 case NCP_IOC_GETOBJECTNAME:
610 if (uid != server->m.mounted_uid)
611 return -EACCES;
612 { 611 {
613 struct ncp_objectname_ioctl user; 612 struct ncp_objectname_ioctl user;
614 size_t outl; 613 size_t outl;
615 614
616 if (copy_from_user(&user, argp, sizeof(user))) 615 if (copy_from_user(&user, argp, sizeof(user)))
617 return -EFAULT; 616 return -EFAULT;
617 down_read(&server->auth_rwsem);
618 user.auth_type = server->auth.auth_type; 618 user.auth_type = server->auth.auth_type;
619 outl = user.object_name_len; 619 outl = user.object_name_len;
620 user.object_name_len = server->auth.object_name_len; 620 user.object_name_len = server->auth.object_name_len;
621 if (outl > user.object_name_len) 621 if (outl > user.object_name_len)
622 outl = user.object_name_len; 622 outl = user.object_name_len;
623 result = 0;
623 if (outl) { 624 if (outl) {
624 if (copy_to_user(user.object_name, 625 if (copy_to_user(user.object_name,
625 server->auth.object_name, 626 server->auth.object_name,
626 outl)) return -EFAULT; 627 outl))
628 result = -EFAULT;
627 } 629 }
628 if (copy_to_user(argp, &user, sizeof(user))) 630 up_read(&server->auth_rwsem);
629 return -EFAULT; 631 if (!result && copy_to_user(argp, &user, sizeof(user)))
630 return 0; 632 result = -EFAULT;
633 return result;
631 } 634 }
632 635
633#ifdef CONFIG_COMPAT 636#ifdef CONFIG_COMPAT
634 case NCP_IOC_SETOBJECTNAME_32: 637 case NCP_IOC_SETOBJECTNAME_32:
635#endif 638#endif
636 case NCP_IOC_SETOBJECTNAME: 639 case NCP_IOC_SETOBJECTNAME:
637 if (uid != server->m.mounted_uid)
638 return -EACCES;
639 { 640 {
640 struct ncp_objectname_ioctl user; 641 struct ncp_objectname_ioctl user;
641 void* newname; 642 void* newname;
@@ -667,9 +668,7 @@ outrel:
667 } else { 668 } else {
668 newname = NULL; 669 newname = NULL;
669 } 670 }
670 /* enter critical section */ 671 down_write(&server->auth_rwsem);
671 /* maybe that kfree can sleep so do that this way */
672 /* it is at least more SMP friendly (in future...) */
673 oldname = server->auth.object_name; 672 oldname = server->auth.object_name;
674 oldnamelen = server->auth.object_name_len; 673 oldnamelen = server->auth.object_name_len;
675 oldprivate = server->priv.data; 674 oldprivate = server->priv.data;
@@ -679,7 +678,7 @@ outrel:
679 server->auth.object_name = newname; 678 server->auth.object_name = newname;
680 server->priv.len = 0; 679 server->priv.len = 0;
681 server->priv.data = NULL; 680 server->priv.data = NULL;
682 /* leave critical section */ 681 up_write(&server->auth_rwsem);
683 kfree(oldprivate); 682 kfree(oldprivate);
684 kfree(oldname); 683 kfree(oldname);
685 return 0; 684 return 0;
@@ -689,8 +688,6 @@ outrel:
689 case NCP_IOC_GETPRIVATEDATA_32: 688 case NCP_IOC_GETPRIVATEDATA_32:
690#endif 689#endif
691 case NCP_IOC_GETPRIVATEDATA: 690 case NCP_IOC_GETPRIVATEDATA:
692 if (uid != server->m.mounted_uid)
693 return -EACCES;
694 { 691 {
695 struct ncp_privatedata_ioctl user; 692 struct ncp_privatedata_ioctl user;
696 size_t outl; 693 size_t outl;
@@ -707,14 +704,20 @@ outrel:
707 if (copy_from_user(&user, argp, sizeof(user))) 704 if (copy_from_user(&user, argp, sizeof(user)))
708 return -EFAULT; 705 return -EFAULT;
709 706
707 down_read(&server->auth_rwsem);
710 outl = user.len; 708 outl = user.len;
711 user.len = server->priv.len; 709 user.len = server->priv.len;
712 if (outl > user.len) outl = user.len; 710 if (outl > user.len) outl = user.len;
711 result = 0;
713 if (outl) { 712 if (outl) {
714 if (copy_to_user(user.data, 713 if (copy_to_user(user.data,
715 server->priv.data, 714 server->priv.data,
716 outl)) return -EFAULT; 715 outl))
716 result = -EFAULT;
717 } 717 }
718 up_read(&server->auth_rwsem);
719 if (result)
720 return result;
718#ifdef CONFIG_COMPAT 721#ifdef CONFIG_COMPAT
719 if (cmd == NCP_IOC_GETPRIVATEDATA_32) { 722 if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
720 struct compat_ncp_privatedata_ioctl user32; 723 struct compat_ncp_privatedata_ioctl user32;
@@ -734,8 +737,6 @@ outrel:
734 case NCP_IOC_SETPRIVATEDATA_32: 737 case NCP_IOC_SETPRIVATEDATA_32:
735#endif 738#endif
736 case NCP_IOC_SETPRIVATEDATA: 739 case NCP_IOC_SETPRIVATEDATA:
737 if (uid != server->m.mounted_uid)
738 return -EACCES;
739 { 740 {
740 struct ncp_privatedata_ioctl user; 741 struct ncp_privatedata_ioctl user;
741 void* new; 742 void* new;
@@ -763,12 +764,12 @@ outrel:
763 } else { 764 } else {
764 new = NULL; 765 new = NULL;
765 } 766 }
766 /* enter critical section */ 767 down_write(&server->auth_rwsem);
767 old = server->priv.data; 768 old = server->priv.data;
768 oldlen = server->priv.len; 769 oldlen = server->priv.len;
769 server->priv.len = user.len; 770 server->priv.len = user.len;
770 server->priv.data = new; 771 server->priv.data = new;
771 /* leave critical section */ 772 up_write(&server->auth_rwsem);
772 kfree(old); 773 kfree(old);
773 return 0; 774 return 0;
774 } 775 }
@@ -776,17 +777,13 @@ outrel:
776#ifdef CONFIG_NCPFS_NLS 777#ifdef CONFIG_NCPFS_NLS
777 case NCP_IOC_SETCHARSETS: 778 case NCP_IOC_SETCHARSETS:
778 return ncp_set_charsets(server, argp); 779 return ncp_set_charsets(server, argp);
779 780
780 case NCP_IOC_GETCHARSETS: 781 case NCP_IOC_GETCHARSETS:
781 return ncp_get_charsets(server, argp); 782 return ncp_get_charsets(server, argp);
782 783
783#endif /* CONFIG_NCPFS_NLS */ 784#endif /* CONFIG_NCPFS_NLS */
784 785
785 case NCP_IOC_SETDENTRYTTL: 786 case NCP_IOC_SETDENTRYTTL:
786 if (file_permission(filp, MAY_WRITE) != 0 &&
787 uid != server->m.mounted_uid)
788 return -EACCES;
789
790 { 787 {
791 u_int32_t user; 788 u_int32_t user;
792 789
@@ -796,13 +793,13 @@ outrel:
796 if (user > 20000) 793 if (user > 20000)
797 return -EINVAL; 794 return -EINVAL;
798 user = (user * HZ) / 1000; 795 user = (user * HZ) / 1000;
799 server->dentry_ttl = user; 796 atomic_set(&server->dentry_ttl, user);
800 return 0; 797 return 0;
801 } 798 }
802 799
803 case NCP_IOC_GETDENTRYTTL: 800 case NCP_IOC_GETDENTRYTTL:
804 { 801 {
805 u_int32_t user = (server->dentry_ttl * 1000) / HZ; 802 u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ;
806 if (copy_to_user(argp, &user, sizeof(user))) 803 if (copy_to_user(argp, &user, sizeof(user)))
807 return -EFAULT; 804 return -EFAULT;
808 return 0; 805 return 0;
@@ -812,59 +809,103 @@ outrel:
812 return -EINVAL; 809 return -EINVAL;
813} 810}
814 811
815static int ncp_ioctl_need_write(unsigned int cmd) 812long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
816{ 813{
814 struct inode *inode = filp->f_dentry->d_inode;
815 struct ncp_server *server = NCP_SERVER(inode);
816 uid_t uid = current_uid();
817 int need_drop_write = 0;
818 long ret;
819
817 switch (cmd) { 820 switch (cmd) {
818 case NCP_IOC_GET_FS_INFO:
819 case NCP_IOC_GET_FS_INFO_V2:
820 case NCP_IOC_NCPREQUEST:
821 case NCP_IOC_SETDENTRYTTL:
822 case NCP_IOC_SIGN_INIT:
823 case NCP_IOC_LOCKUNLOCK:
824 case NCP_IOC_SET_SIGN_WANTED:
825 return 1;
826 case NCP_IOC_GETOBJECTNAME:
827 case NCP_IOC_SETOBJECTNAME:
828 case NCP_IOC_GETPRIVATEDATA:
829 case NCP_IOC_SETPRIVATEDATA:
830 case NCP_IOC_SETCHARSETS: 821 case NCP_IOC_SETCHARSETS:
831 case NCP_IOC_GETCHARSETS:
832 case NCP_IOC_CONN_LOGGED_IN: 822 case NCP_IOC_CONN_LOGGED_IN:
833 case NCP_IOC_GETDENTRYTTL:
834 case NCP_IOC_GETMOUNTUID2:
835 case NCP_IOC_SIGN_WANTED:
836 case NCP_IOC_GETROOT:
837 case NCP_IOC_SETROOT: 823 case NCP_IOC_SETROOT:
838 return 0; 824 if (!capable(CAP_SYS_ADMIN)) {
839 default: 825 ret = -EACCES;
840 /* unknown IOCTL command, assume write */ 826 goto out;
841 return 1; 827 }
828 break;
842 } 829 }
843} 830 if (server->m.mounted_uid != uid) {
844 831 switch (cmd) {
845long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
846{
847 long ret;
848
849 lock_kernel();
850 if (ncp_ioctl_need_write(cmd)) {
851 /* 832 /*
852 * inside the ioctl(), any failures which	833 * Only the mount owner can issue these ioctls. The
853 * are because of file_permission() are	834 * information necessary to authenticate to other
854 * -EACCESS, so it seems consistent to keep	835 * NDS servers is stored here.
855 * that here.
856 */ 836 */
857 if (mnt_want_write(filp->f_path.mnt)) { 837 case NCP_IOC_GETOBJECTNAME:
838 case NCP_IOC_SETOBJECTNAME:
839 case NCP_IOC_GETPRIVATEDATA:
840 case NCP_IOC_SETPRIVATEDATA:
841#ifdef CONFIG_COMPAT
842 case NCP_IOC_GETOBJECTNAME_32:
843 case NCP_IOC_SETOBJECTNAME_32:
844 case NCP_IOC_GETPRIVATEDATA_32:
845 case NCP_IOC_SETPRIVATEDATA_32:
846#endif
858 ret = -EACCES; 847 ret = -EACCES;
859 goto out; 848 goto out;
849 /*
850 * These require write access on the inode if the user id
851 * does not match. Note that they do not write to the
852 * file... But the old code did mnt_want_write, so it is
853 * kept as is. Of course not for the mountpoint owner, as
854 * that would break read-only mounts altogether, since
855 * ncpmount needs working NCP_IOC_NCPREQUEST and
856 * NCP_IOC_GET_FS_INFO. Some of these codes (setdentryttl,
857 * signinit, setsignwanted) should probably be restricted
858 * to the owner only, or even to CAP_SYS_ADMIN.
859 */
860 case NCP_IOC_GET_FS_INFO:
861 case NCP_IOC_GET_FS_INFO_V2:
862 case NCP_IOC_NCPREQUEST:
863 case NCP_IOC_SETDENTRYTTL:
864 case NCP_IOC_SIGN_INIT:
865 case NCP_IOC_LOCKUNLOCK:
866 case NCP_IOC_SET_SIGN_WANTED:
867#ifdef CONFIG_COMPAT
868 case NCP_IOC_GET_FS_INFO_V2_32:
869 case NCP_IOC_NCPREQUEST_32:
870#endif
871 ret = mnt_want_write_file(filp);
872 if (ret)
873 goto out;
874 need_drop_write = 1;
875 ret = inode_permission(inode, MAY_WRITE);
876 if (ret)
877 goto outDropWrite;
878 break;
879 /*
880 * Read access required.
881 */
882 case NCP_IOC_GETMOUNTUID16:
883 case NCP_IOC_GETMOUNTUID32:
884 case NCP_IOC_GETMOUNTUID64:
885 case NCP_IOC_GETROOT:
886 case NCP_IOC_SIGN_WANTED:
887 ret = inode_permission(inode, MAY_READ);
888 if (ret)
889 goto out;
890 break;
891 /*
892 * Anybody can read these.
893 */
894 case NCP_IOC_GETCHARSETS:
895 case NCP_IOC_GETDENTRYTTL:
896 default:
897 /* Three codes below are protected by CAP_SYS_ADMIN above. */
898 case NCP_IOC_SETCHARSETS:
899 case NCP_IOC_CONN_LOGGED_IN:
900 case NCP_IOC_SETROOT:
901 break;
860 } 902 }
861 } 903 }
862 ret = __ncp_ioctl(filp, cmd, arg); 904 ret = __ncp_ioctl(inode, cmd, arg);
863 if (ncp_ioctl_need_write(cmd)) 905outDropWrite:
906 if (need_drop_write)
864 mnt_drop_write(filp->f_path.mnt); 907 mnt_drop_write(filp->f_path.mnt);
865
866out: 908out:
867 unlock_kernel();
868 return ret; 909 return ret;
869} 910}
870 911
@@ -873,10 +914,8 @@ long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
873{ 914{
874 long ret; 915 long ret;
875 916
876 lock_kernel();
877 arg = (unsigned long) compat_ptr(arg); 917 arg = (unsigned long) compat_ptr(arg);
878 ret = ncp_ioctl(file, cmd, arg); 918 ret = ncp_ioctl(file, cmd, arg);
879 unlock_kernel();
880 return ret; 919 return ret;
881} 920}
882#endif 921#endif
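
As a reader's aid (not part of the patch): the new ncp_ioctl above sorts commands into permission tiers. Below is a stripped-down sketch of the same dispatch, assuming the usual kernel headers for the NCP_IOC_* constants, and omitting the compat cases and the mount-owner short-circuit that the real code applies first.

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/ncp_fs.h>

/* Simplified tiering, mirroring the patch: CAP_SYS_ADMIN for commands
 * that alter the connection, MAY_WRITE plus a writable mount for
 * state-changing ones, MAY_READ for queries, nothing for the rest. */
static long ncp_check_ioctl_perm(struct file *filp, unsigned int cmd)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	long ret;

	switch (cmd) {
	case NCP_IOC_SETCHARSETS:
	case NCP_IOC_CONN_LOGGED_IN:
	case NCP_IOC_SETROOT:
		return capable(CAP_SYS_ADMIN) ? 0 : -EACCES;
	case NCP_IOC_SETDENTRYTTL:
	case NCP_IOC_SIGN_INIT:
	case NCP_IOC_SET_SIGN_WANTED:
		ret = mnt_want_write_file(filp);   /* fails on r/o mounts */
		if (ret)
			return ret;
		ret = inode_permission(inode, MAY_WRITE);
		mnt_drop_write(filp->f_path.mnt);  /* real code holds it across the ioctl */
		return ret;
	case NCP_IOC_GETROOT:
	case NCP_IOC_SIGN_WANTED:
		return inode_permission(inode, MAY_READ);
	default:
		return 0;	/* e.g. NCP_IOC_GETDENTRYTTL: anybody */
	}
}
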
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 0ec6237a597..a95615a0b6a 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -107,17 +107,17 @@ ncp_reply_data(struct ncp_server *server, int offset)
107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]); 107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]);
108} 108}
109 109
110static inline u8 BVAL(void *data) 110static inline u8 BVAL(const void *data)
111{ 111{
112 return *(u8 *)data; 112 return *(const u8 *)data;
113} 113}
114 114
115static u8 ncp_reply_byte(struct ncp_server *server, int offset) 115static u8 ncp_reply_byte(struct ncp_server *server, int offset)
116{ 116{
117 return *(u8 *)ncp_reply_data(server, offset); 117 return *(const u8 *)ncp_reply_data(server, offset);
118} 118}
119 119
120static inline u16 WVAL_LH(void *data) 120static inline u16 WVAL_LH(const void *data)
121{ 121{
122 return get_unaligned_le16(data); 122 return get_unaligned_le16(data);
123} 123}
@@ -134,7 +134,7 @@ ncp_reply_be16(struct ncp_server *server, int offset)
134 return get_unaligned_be16(ncp_reply_data(server, offset)); 134 return get_unaligned_be16(ncp_reply_data(server, offset));
135} 135}
136 136
137static inline u32 DVAL_LH(void *data) 137static inline u32 DVAL_LH(const void *data)
138{ 138{
139 return get_unaligned_le32(data); 139 return get_unaligned_le32(data);
140} 140}
@@ -349,9 +349,9 @@ int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) {
349 return result; 349 return result;
350} 350}
351 351
352void ncp_extract_file_info(void *structure, struct nw_info_struct *target) 352void ncp_extract_file_info(const void *structure, struct nw_info_struct *target)
353{ 353{
354 __u8 *name_len; 354 const __u8 *name_len;
355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen); 355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen);
356 356
357 memcpy(target, structure, info_struct_size); 357 memcpy(target, structure, info_struct_size);
@@ -364,7 +364,7 @@ void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
364} 364}
365 365
366#ifdef CONFIG_NCPFS_NFS_NS 366#ifdef CONFIG_NCPFS_NFS_NS
367static inline void ncp_extract_nfs_info(unsigned char *structure, 367static inline void ncp_extract_nfs_info(const unsigned char *structure,
368 struct nw_nfs_info *target) 368 struct nw_nfs_info *target)
369{ 369{
370 target->mode = DVAL_LH(structure); 370 target->mode = DVAL_LH(structure);
@@ -417,7 +417,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
417 * Returns information for a (one-component) name relative to 417 * Returns information for a (one-component) name relative to
418 * the specified directory. 418 * the specified directory.
419 */ 419 */
420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, char *path, 420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path,
421 struct nw_info_struct *target) 421 struct nw_info_struct *target)
422{ 422{
423 __u8 volnum = NCP_FINFO(dir)->volNumber; 423 __u8 volnum = NCP_FINFO(dir)->volNumber;
@@ -452,16 +452,16 @@ out:
452#ifdef CONFIG_NCPFS_NFS_NS 452#ifdef CONFIG_NCPFS_NFS_NS
453static int 453static int
454ncp_obtain_DOS_dir_base(struct ncp_server *server, 454ncp_obtain_DOS_dir_base(struct ncp_server *server,
455 __u8 volnum, __le32 dirent, 455 __u8 ns, __u8 volnum, __le32 dirent,
456 char *path, /* At most 1 component */ 456 const char *path, /* At most 1 component */
457 __le32 *DOS_dir_base) 457 __le32 *DOS_dir_base)
458{ 458{
459 int result; 459 int result;
460 460
461 ncp_init_request(server); 461 ncp_init_request(server);
462 ncp_add_byte(server, 6); /* subfunction */ 462 ncp_add_byte(server, 6); /* subfunction */
463 ncp_add_byte(server, server->name_space[volnum]); 463 ncp_add_byte(server, ns);
464 ncp_add_byte(server, server->name_space[volnum]); 464 ncp_add_byte(server, ns);
465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ 465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
466 ncp_add_dword(server, RIM_DIRECTORY); 466 ncp_add_dword(server, RIM_DIRECTORY);
467 ncp_add_handle_path(server, volnum, dirent, 1, path); 467 ncp_add_handle_path(server, volnum, dirent, 1, path);
@@ -523,10 +523,27 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */ 523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */
524} 524}
525 525
526int
527ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
528{
529 int ns = ncp_get_known_namespace(server, volume);
530
531 if (ret_ns)
532 *ret_ns = ns;
533
534 DPRINTK("lookup_vol: namespace[%d] = %d\n",
535 volume, server->name_space[volume]);
536
537 if (server->name_space[volume] == ns)
538 return 0;
539 server->name_space[volume] = ns;
540 return 1;
541}
542
526static int 543static int
527ncp_ObtainSpecificDirBase(struct ncp_server *server, 544ncp_ObtainSpecificDirBase(struct ncp_server *server,
528 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base, 545 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base,
529 char *path, /* At most 1 component */ 546 const char *path, /* At most 1 component */
530 __le32 *dirEntNum, __le32 *DosDirNum) 547 __le32 *dirEntNum, __le32 *DosDirNum)
531{ 548{
532 int result; 549 int result;
@@ -560,14 +577,13 @@ ncp_mount_subdir(struct ncp_server *server,
560{ 577{
561 int dstNS; 578 int dstNS;
562 int result; 579 int result;
563 580
564 dstNS = ncp_get_known_namespace(server, volNumber); 581 ncp_update_known_namespace(server, volNumber, &dstNS);
565 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, 582 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber,
566 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0) 583 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0)
567 { 584 {
568 return result; 585 return result;
569 } 586 }
570 server->name_space[volNumber] = dstNS;
571 *volume = volNumber; 587 *volume = volNumber;
572 server->m.mounted_vol[1] = 0; 588 server->m.mounted_vol[1] = 0;
573 server->m.mounted_vol[0] = 'X'; 589 server->m.mounted_vol[0] = 'X';
@@ -575,11 +591,10 @@ ncp_mount_subdir(struct ncp_server *server,
575} 591}
576 592
577int 593int
578ncp_get_volume_root(struct ncp_server *server, const char *volname, 594ncp_get_volume_root(struct ncp_server *server,
579 __u32* volume, __le32* dirent, __le32* dosdirent) 595 const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent)
580{ 596{
581 int result; 597 int result;
582 __u8 volnum;
583 598
584 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); 599 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
585 600
@@ -601,21 +616,14 @@ ncp_get_volume_root(struct ncp_server *server, const char *volname,
601 return result; 616 return result;
602 } 617 }
603 *dirent = *dosdirent = ncp_reply_dword(server, 4); 618 *dirent = *dosdirent = ncp_reply_dword(server, 4);
604 volnum = ncp_reply_byte(server, 8); 619 *volume = ncp_reply_byte(server, 8);
605 ncp_unlock_server(server); 620 ncp_unlock_server(server);
606 *volume = volnum;
607
608 server->name_space[volnum] = ncp_get_known_namespace(server, volnum);
609
610 DPRINTK("lookup_vol: namespace[%d] = %d\n",
611 volnum, server->name_space[volnum]);
612
613 return 0; 621 return 0;
614} 622}
615 623
616int 624int
617ncp_lookup_volume(struct ncp_server *server, const char *volname, 625ncp_lookup_volume(struct ncp_server *server,
618 struct nw_info_struct *target) 626 const char *volname, struct nw_info_struct *target)
619{ 627{
620 int result; 628 int result;
621 629
@@ -625,6 +633,7 @@ ncp_lookup_volume(struct ncp_server *server, const char *volname,
625 if (result) { 633 if (result) {
626 return result; 634 return result;
627 } 635 }
636 ncp_update_known_namespace(server, target->volNumber, NULL);
628 target->nameLen = strlen(volname); 637 target->nameLen = strlen(volname);
629 memcpy(target->entryName, volname, target->nameLen+1); 638 memcpy(target->entryName, volname, target->nameLen+1);
630 target->attributes = aDIR; 639 target->attributes = aDIR;
@@ -676,8 +685,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
676{ 685{
677 int result = 0; 686 int result = 0;
678 687
688 ncp_init_request(server);
679 if (server->name_space[volnum] == NW_NS_NFS) { 689 if (server->name_space[volnum] == NW_NS_NFS) {
680 ncp_init_request(server);
681 ncp_add_byte(server, 25); /* subfunction */ 690 ncp_add_byte(server, 25); /* subfunction */
682 ncp_add_byte(server, server->name_space[volnum]); 691 ncp_add_byte(server, server->name_space[volnum]);
683 ncp_add_byte(server, NW_NS_NFS); 692 ncp_add_byte(server, NW_NS_NFS);
@@ -690,8 +699,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
690 ncp_add_dword_lh(server, 1); /* nlinks */ 699 ncp_add_dword_lh(server, 1); /* nlinks */
691 ncp_add_dword_lh(server, rdev); 700 ncp_add_dword_lh(server, rdev);
692 result = ncp_request(server, 87); 701 result = ncp_request(server, 87);
693 ncp_unlock_server(server);
694 } 702 }
703 ncp_unlock_server(server);
695 return result; 704 return result;
696} 705}
697#endif 706#endif
@@ -700,7 +709,7 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
700static int 709static int
701ncp_DeleteNSEntry(struct ncp_server *server, 710ncp_DeleteNSEntry(struct ncp_server *server,
702 __u8 have_dir_base, __u8 volnum, __le32 dirent, 711 __u8 have_dir_base, __u8 volnum, __le32 dirent,
703 char* name, __u8 ns, __le16 attr) 712 const char* name, __u8 ns, __le16 attr)
704{ 713{
705 int result; 714 int result;
706 715
@@ -734,23 +743,25 @@ ncp_del_file_or_subdir2(struct ncp_server *server,
734 743
735int 744int
736ncp_del_file_or_subdir(struct ncp_server *server, 745ncp_del_file_or_subdir(struct ncp_server *server,
737 struct inode *dir, char *name) 746 struct inode *dir, const char *name)
738{ 747{
739 __u8 volnum = NCP_FINFO(dir)->volNumber; 748 __u8 volnum = NCP_FINFO(dir)->volNumber;
740 __le32 dirent = NCP_FINFO(dir)->dirEntNum; 749 __le32 dirent = NCP_FINFO(dir)->dirEntNum;
750 int name_space;
741 751
752 name_space = server->name_space[volnum];
742#ifdef CONFIG_NCPFS_NFS_NS 753#ifdef CONFIG_NCPFS_NFS_NS
743 if (server->name_space[volnum]==NW_NS_NFS) 754 if (name_space == NW_NS_NFS)
744 { 755 {
745 int result; 756 int result;
746 757
747 result=ncp_obtain_DOS_dir_base(server, volnum, dirent, name, &dirent); 758 result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent);
748 if (result) return result; 759 if (result) return result;
749 return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006)); 760 name = NULL;
761 name_space = NW_NS_DOS;
750 } 762 }
751 else
752#endif /* CONFIG_NCPFS_NFS_NS */ 763#endif /* CONFIG_NCPFS_NFS_NS */
753 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, server->name_space[volnum], cpu_to_le16(0x8006)); 764 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006));
754} 765}
755 766
756static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6]) 767static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
@@ -765,7 +776,7 @@ static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
765/* If both dir and name are NULL, then in target there's already a 776/* If both dir and name are NULL, then in target there's already a
766 looked-up entry that wants to be opened. */ 777 looked-up entry that wants to be opened. */
767int ncp_open_create_file_or_subdir(struct ncp_server *server, 778int ncp_open_create_file_or_subdir(struct ncp_server *server,
768 struct inode *dir, char *name, 779 struct inode *dir, const char *name,
769 int open_create_mode, 780 int open_create_mode,
770 __le32 create_attributes, 781 __le32 create_attributes,
771 __le16 desired_acc_rights, 782 __le16 desired_acc_rights,
@@ -890,8 +901,8 @@ int ncp_search_for_fileset(struct ncp_server *server,
890 901
891static int 902static int
892ncp_RenameNSEntry(struct ncp_server *server, 903ncp_RenameNSEntry(struct ncp_server *server,
893 struct inode *old_dir, char *old_name, __le16 old_type, 904 struct inode *old_dir, const char *old_name, __le16 old_type,
894 struct inode *new_dir, char *new_name) 905 struct inode *new_dir, const char *new_name)
895{ 906{
896 int result = -EINVAL; 907 int result = -EINVAL;
897 908
@@ -929,8 +940,8 @@ out:
929} 940}
930 941
931int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 942int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
932 struct inode *old_dir, char *old_name, 943 struct inode *old_dir, const char *old_name,
933 struct inode *new_dir, char *new_name) 944 struct inode *new_dir, const char *new_name)
934{ 945{
935 int result; 946 int result;
936 __le16 old_type = cpu_to_le16(0x06); 947 __le16 old_type = cpu_to_le16(0x06);
@@ -958,7 +969,7 @@ int
958ncp_read_kernel(struct ncp_server *server, const char *file_id, 969ncp_read_kernel(struct ncp_server *server, const char *file_id,
959 __u32 offset, __u16 to_read, char *target, int *bytes_read) 970 __u32 offset, __u16 to_read, char *target, int *bytes_read)
960{ 971{
961 char *source; 972 const char *source;
962 int result; 973 int result;
963 974
964 ncp_init_request(server); 975 ncp_init_request(server);
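
The accessor changes in ncplib_kernel.c above are pure const-propagation: reply buffers are only ever read, so BVAL/WVAL_LH/DVAL_LH can take const pointers while the generic unaligned helpers do the endian work. A minimal stand-alone sketch of the pattern (the function names here are illustrative, not from the patch):

#include <linux/types.h>
#include <asm/unaligned.h>

/* Read little-endian fields from a const, possibly misaligned buffer. */
static inline u16 reply_le16(const void *p)
{
	return get_unaligned_le16(p);	/* safe on any architecture */
}

static inline u32 reply_le32(const void *p)
{
	return get_unaligned_le32(p);
}
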
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 2441d1ab57d..3c57eca634c 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -65,10 +65,11 @@ static inline void ncp_inode_close(struct inode *inode) {
65 atomic_dec(&NCP_FINFO(inode)->opened); 65 atomic_dec(&NCP_FINFO(inode)->opened);
66} 66}
67 67
68void ncp_extract_file_info(void* src, struct nw_info_struct* target); 68void ncp_extract_file_info(const void* src, struct nw_info_struct* target);
69int ncp_obtain_info(struct ncp_server *server, struct inode *, char *, 69int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *,
70 struct nw_info_struct *target); 70 struct nw_info_struct *target);
71int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target); 71int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target);
72int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns);
72int ncp_get_volume_root(struct ncp_server *server, const char *volname, 73int ncp_get_volume_root(struct ncp_server *server, const char *volname,
73 __u32 *volume, __le32 *dirent, __le32 *dosdirent); 74 __u32 *volume, __le32 *dirent, __le32 *dosdirent);
74int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *); 75int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *);
@@ -80,8 +81,8 @@ int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent,
80 __u32 mode, __u32 rdev); 81 __u32 mode, __u32 rdev);
81 82
82int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*); 83int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*);
83int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, char *); 84int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *);
84int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, char *, 85int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *,
85 int, __le32, __le16, struct ncp_entry_info *); 86 int, __le32, __le16, struct ncp_entry_info *);
86 87
87int ncp_initialize_search(struct ncp_server *, struct inode *, 88int ncp_initialize_search(struct ncp_server *, struct inode *,
@@ -93,7 +94,7 @@ int ncp_search_for_fileset(struct ncp_server *server,
93 char** rbuf, size_t* rsize); 94 char** rbuf, size_t* rsize);
94 95
95int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 96int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
96 struct inode *, char *, struct inode *, char *); 97 struct inode *, const char *, struct inode *, const char *);
97 98
98 99
99int 100int
@@ -170,13 +171,13 @@ static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
170#endif /* CONFIG_NCPFS_NLS */ 171#endif /* CONFIG_NCPFS_NLS */
171 172
172#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time) 173#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time)
173#define NCP_MAX_AGE(server) ((server)->dentry_ttl) 174#define NCP_MAX_AGE(server) atomic_read(&(server)->dentry_ttl)
174#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server)) 175#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server))
175 176
176static inline void 177static inline void
177ncp_age_dentry(struct ncp_server* server, struct dentry* dentry) 178ncp_age_dentry(struct ncp_server* server, struct dentry* dentry)
178{ 179{
179 dentry->d_time = jiffies - server->dentry_ttl; 180 dentry->d_time = jiffies - NCP_MAX_AGE(server);
180} 181}
181 182
182static inline void 183static inline void
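
Making dentry_ttl an atomic_t, as the hunk above does, lets NCP_IOC_SETDENTRYTTL publish a new TTL without introducing a lock that every NCP_MAX_AGE() reader would also have to take. The pattern, sketched with hypothetical helper names and assuming the surrounding kernel headers:

/* Writer (ioctl path): publish the new TTL in one atomic store. */
static void set_ttl_ms(struct ncp_server *server, u32 ms)
{
	atomic_set(&server->dentry_ttl, (ms * HZ) / 1000);
}

/* Readers pick up a consistent value with a single atomic load. */
static int dentry_is_fresh(struct ncp_server *server, struct dentry *d)
{
	return jiffies - d->d_time < atomic_read(&server->dentry_ttl);
}
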
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index 7c0b5c21e6c..d8b2d7e6910 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -15,21 +15,21 @@
15 15
16/* i386: 32-bit, little endian, handles mis-alignment */ 16/* i386: 32-bit, little endian, handles mis-alignment */
17#ifdef __i386__ 17#ifdef __i386__
18#define GET_LE32(p) (*(int *)(p)) 18#define GET_LE32(p) (*(const int *)(p))
19#define PUT_LE32(p,v) { *(int *)(p)=v; } 19#define PUT_LE32(p,v) { *(int *)(p)=v; }
20#else 20#else
21/* from include/ncplib.h */ 21/* from include/ncplib.h */
22#define BVAL(buf,pos) (((__u8 *)(buf))[pos]) 22#define BVAL(buf,pos) (((const __u8 *)(buf))[pos])
23#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos)) 23#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos))
24#define BSET(buf,pos,val) (BVAL(buf,pos) = (val)) 24#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val))
25 25
26static inline __u16 26static inline __u16
27WVAL_LH(__u8 * buf, int pos) 27WVAL_LH(const __u8 * buf, int pos)
28{ 28{
29 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8; 29 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
30} 30}
31static inline __u32 31static inline __u32
32DVAL_LH(__u8 * buf, int pos) 32DVAL_LH(const __u8 * buf, int pos)
33{ 33{
34 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16; 34 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
35} 35}
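
ncpsign_kernel.c gets the matching treatment: the read macros become const-clean while BSET/PUT_LE32 keep a mutable pointer. A freestanding sketch of the byte-wise fallback those macros rely on for architectures that cannot dereference misaligned words (names are illustrative):

#include <linux/types.h>

/* Byte-wise little-endian access: alignment-safe on every architecture. */
static inline u32 sketch_get_le32(const u8 *p)
{
	return p[0] | (u32)p[1] << 8 | (u32)p[2] << 16 | (u32)p[3] << 24;
}

static inline void sketch_put_le32(u8 *p, u32 v)
{
	p[0] = v;
	p[1] = v >> 8;
	p[2] = v >> 16;
	p[3] = v >> 24;
}
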
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index c7ff6c700a6..668bd267346 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -746,7 +746,6 @@ static int ncp_do_request(struct ncp_server *server, int size,
746 return -EIO; 746 return -EIO;
747 } 747 }
748 if (!ncp_conn_valid(server)) { 748 if (!ncp_conn_valid(server)) {
749 printk(KERN_ERR "ncpfs: Connection invalid!\n");
750 return -EIO; 749 return -EIO;
751 } 750 }
752 { 751 {
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index a43d07e7b92..ba306658a6d 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -61,9 +61,9 @@ config NFS_V3_ACL
61 If unsure, say N. 61 If unsure, say N.
62 62
63config NFS_V4 63config NFS_V4
64 bool "NFS client support for NFS version 4 (EXPERIMENTAL)" 64 bool "NFS client support for NFS version 4"
65 depends on NFS_FS && EXPERIMENTAL 65 depends on NFS_FS
66 select RPCSEC_GSS_KRB5 66 select SUNRPC_GSS
67 help 67 help
68 This option enables support for version 4 of the NFS protocol 68 This option enables support for version 4 of the NFS protocol
69 (RFC 3530) in the kernel's NFS client. 69 (RFC 3530) in the kernel's NFS client.
@@ -72,16 +72,20 @@ config NFS_V4
72 space programs which can be found in the Linux nfs-utils package, 72 space programs which can be found in the Linux nfs-utils package,
73 available from http://linux-nfs.org/. 73 available from http://linux-nfs.org/.
74 74
75 If unsure, say N. 75 If unsure, say Y.
76 76
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_V4 && EXPERIMENTAL 79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL
80 select PNFS_FILE_LAYOUT
80 help 81 help
81 This option enables support for minor version 1 of the NFSv4 protocol 82 This option enables support for minor version 1 of the NFSv4 protocol
82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. 83 (RFC 5661) in the kernel's NFS client.
84
85 If unsure, say N.
83 86
84 Unless you're an NFS developer, say N. 87config PNFS_FILE_LAYOUT
88 tristate
85 89
86config ROOT_NFS 90config ROOT_NFS
87 bool "Root file system on NFS" 91 bool "Root file system on NFS"
@@ -100,3 +104,31 @@ config NFS_FSCACHE
100 help 104 help
101 Say Y here if you want NFS data to be cached locally on disc through 105 Say Y here if you want NFS data to be cached locally on disc through
102 the general filesystem cache manager 106 the general filesystem cache manager
107
108config NFS_USE_LEGACY_DNS
109 bool "Use the legacy NFS DNS resolver"
110 depends on NFS_V4
111 help
112 The kernel now provides a method for translating a host name into an
113 IP address. Select Y here if you would rather use your own DNS
114 resolver script.
115
116	 If unsure, say N.
117
118config NFS_USE_KERNEL_DNS
119 bool
120 depends on NFS_V4 && !NFS_USE_LEGACY_DNS
121 select DNS_RESOLVER
122 select KEYS
123 default y
124
125config NFS_USE_NEW_IDMAPPER
126 bool "Use the new idmapper upcall routine"
127 depends on NFS_V4 && KEYS
128 help
129 Say Y here if you want NFS to use the new idmapper upcall functions.
130 You will need /sbin/request-key (usually provided by the keyutils
131 package). For details, read
132 <file:Documentation/filesystems/nfs/idmapper.txt>.
133
134 If you are unsure, say N.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639ea..4776ff9e381 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 delegation.o idmap.o \ 15 delegation.o idmap.o \
16 callback.o callback_xdr.o callback_proc.o \ 16 callback.o callback_xdr.o callback_proc.o \
17 nfs4namespace.o 17 nfs4namespace.o
18nfs-$(CONFIG_NFS_V4_1) += pnfs.o
18nfs-$(CONFIG_SYSCTL) += sysctl.o 19nfs-$(CONFIG_SYSCTL) += sysctl.o
19nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
21
22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 36dfdae9512..aeec017fe81 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -45,7 +45,7 @@ unsigned short nfs_callback_tcpport;
45unsigned short nfs_callback_tcpport6; 45unsigned short nfs_callback_tcpport6;
46#define NFS_CALLBACK_MAXPORTNR (65535U) 46#define NFS_CALLBACK_MAXPORTNR (65535U)
47 47
48static int param_set_portnr(const char *val, struct kernel_param *kp) 48static int param_set_portnr(const char *val, const struct kernel_param *kp)
49{ 49{
50 unsigned long num; 50 unsigned long num;
51 int ret; 51 int ret;
@@ -58,11 +58,10 @@ static int param_set_portnr(const char *val, struct kernel_param *kp)
58 *((unsigned int *)kp->arg) = num; 58 *((unsigned int *)kp->arg) = num;
59 return 0; 59 return 0;
60} 60}
61 61static struct kernel_param_ops param_ops_portnr = {
62static int param_get_portnr(char *buffer, struct kernel_param *kp) 62 .set = param_set_portnr,
63{ 63 .get = param_get_uint,
64 return param_get_uint(buffer, kp); 64};
65}
66#define param_check_portnr(name, p) __param_check(name, p, unsigned int); 65#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
67 66
68module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); 67module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
@@ -110,7 +109,7 @@ nfs4_callback_up(struct svc_serv *serv)
110{ 109{
111 int ret; 110 int ret;
112 111
113 ret = svc_create_xprt(serv, "tcp", PF_INET, 112 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
114 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 113 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
115 if (ret <= 0) 114 if (ret <= 0)
116 goto out_err; 115 goto out_err;
@@ -118,7 +117,7 @@ nfs4_callback_up(struct svc_serv *serv)
118 dprintk("NFS: Callback listener port = %u (af %u)\n", 117 dprintk("NFS: Callback listener port = %u (af %u)\n",
119 nfs_callback_tcpport, PF_INET); 118 nfs_callback_tcpport, PF_INET);
120 119
121 ret = svc_create_xprt(serv, "tcp", PF_INET6, 120 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
122 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 121 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
123 if (ret > 0) { 122 if (ret > 0) {
124 nfs_callback_tcpport6 = ret; 123 nfs_callback_tcpport6 = ret;
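
The callback.c hunk above moves the port parameter to the kernel_param_ops interface: one ops table replaces the name-matched set/get function pair. A hedged sketch of the same registration using module_param_cb and an illustrative parameter of my own naming:

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static unsigned int demo_port = 2049;	/* hypothetical parameter */

static int demo_set_portnr(const char *val, const struct kernel_param *kp)
{
	unsigned long num;

	if (!val || strict_strtoul(val, 0, &num) || num > 65535)
		return -EINVAL;
	*(unsigned int *)kp->arg = num;
	return 0;
}

static struct kernel_param_ops demo_param_ops = {
	.set = demo_set_portnr,
	.get = param_get_uint,		/* reuse the stock getter */
};
module_param_cb(demo_port, &demo_param_ops, &demo_port, 0644);

The patch itself uses the equivalent module_param_named() spelling, which resolves param_ops_portnr by naming convention; module_param_cb() is the explicit form of the same mechanism.
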
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a08770a7e85..2950fca0c61 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -37,8 +37,8 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
37 if (inode == NULL) 37 if (inode == NULL)
38 goto out_putclient; 38 goto out_putclient;
39 nfsi = NFS_I(inode); 39 nfsi = NFS_I(inode);
40 down_read(&nfsi->rwsem); 40 rcu_read_lock();
41 delegation = nfsi->delegation; 41 delegation = rcu_dereference(nfsi->delegation);
42 if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) 42 if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
43 goto out_iput; 43 goto out_iput;
44 res->size = i_size_read(inode); 44 res->size = i_size_read(inode);
@@ -53,7 +53,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
53 args->bitmap[1]; 53 args->bitmap[1];
54 res->status = 0; 54 res->status = 0;
55out_iput: 55out_iput:
56 up_read(&nfsi->rwsem); 56 rcu_read_unlock();
57 iput(inode); 57 iput(inode);
58out_putclient: 58out_putclient:
59 nfs_put_client(clp); 59 nfs_put_client(clp);
@@ -62,16 +62,6 @@ out:
62 return res->status; 62 return res->status;
63} 63}
64 64
65static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *)
66{
67#if defined(CONFIG_NFS_V4_1)
68 if (clp->cl_minorversion > 0)
69 return nfs41_validate_delegation_stateid;
70#endif
71 return nfs4_validate_delegation_stateid;
72}
73
74
75__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 65__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
76{ 66{
77 struct nfs_client *clp; 67 struct nfs_client *clp;
@@ -92,8 +82,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
92 inode = nfs_delegation_find_inode(clp, &args->fh); 82 inode = nfs_delegation_find_inode(clp, &args->fh);
93 if (inode != NULL) { 83 if (inode != NULL) {
94 /* Set up a helper thread to actually return the delegation */ 84 /* Set up a helper thread to actually return the delegation */
95 switch (nfs_async_inode_return_delegation(inode, &args->stateid, 85 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
96 nfs_validate_delegation_stateid(clp))) {
97 case 0: 86 case 0:
98 res = 0; 87 res = 0;
99 break; 88 break;
@@ -129,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
129 if (delegation == NULL) 118 if (delegation == NULL)
130 return 0; 119 return 0;
131 120
132 /* seqid is 4-bytes long */ 121 if (stateid->stateid.seqid != 0)
133 if (((u32 *) &stateid->data)[0] != 0)
134 return 0; 122 return 0;
135 if (memcmp(&delegation->stateid.data[4], &stateid->data[4], 123 if (memcmp(&delegation->stateid.stateid.other,
136 sizeof(stateid->data)-4)) 124 &stateid->stateid.other,
125 NFS4_STATEID_OTHER_SIZE))
137 return 0; 126 return 0;
138 127
139 return 1; 128 return 1;
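
nfs4_callback_getattr above now walks the delegation under RCU instead of nfsi->rwsem: the pointer is published elsewhere with rcu_assign_pointer(), so readers only need rcu_read_lock() and rcu_dereference(). The shape of the read side, sketched with the field names visible in this diff and a copy-out step of my own:

#include <linux/rcupdate.h>

static fmode_t deleg_type(struct nfs_inode *nfsi)
{
	struct nfs_delegation *delegation;
	fmode_t type = 0;

	rcu_read_lock();
	delegation = rcu_dereference(nfsi->delegation);
	if (delegation != NULL)
		type = delegation->type;	/* copy out under the lock */
	rcu_read_unlock();
	/* *delegation may be freed once we unlock; use only the copy. */
	return type;
}
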
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d25b5257b7a..0870d0d4efc 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
48#include "iostat.h" 48#include "iostat.h"
49#include "internal.h" 49#include "internal.h"
50#include "fscache.h" 50#include "fscache.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_CLIENT 53#define NFSDBG_FACILITY NFSDBG_CLIENT
53 54
@@ -150,11 +151,14 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
150 clp->cl_boot_time = CURRENT_TIME; 151 clp->cl_boot_time = CURRENT_TIME;
151 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; 152 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
152 clp->cl_minorversion = cl_init->minorversion; 153 clp->cl_minorversion = cl_init->minorversion;
154 clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
153#endif 155#endif
154 cred = rpc_lookup_machine_cred(); 156 cred = rpc_lookup_machine_cred();
155 if (!IS_ERR(cred)) 157 if (!IS_ERR(cred))
156 clp->cl_machine_cred = cred; 158 clp->cl_machine_cred = cred;
157 159#if defined(CONFIG_NFS_V4_1)
160 INIT_LIST_HEAD(&clp->cl_layouts);
161#endif
158 nfs_fscache_get_client_cookie(clp); 162 nfs_fscache_get_client_cookie(clp);
159 163
160 return clp; 164 return clp;
@@ -178,7 +182,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
178 clp->cl_session = NULL; 182 clp->cl_session = NULL;
179 } 183 }
180 184
181 clp->cl_call_sync = _nfs4_call_sync; 185 clp->cl_mvops = nfs_v4_minor_ops[0];
182#endif /* CONFIG_NFS_V4_1 */ 186#endif /* CONFIG_NFS_V4_1 */
183} 187}
184 188
@@ -188,7 +192,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
188static void nfs4_destroy_callback(struct nfs_client *clp) 192static void nfs4_destroy_callback(struct nfs_client *clp)
189{ 193{
190 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) 194 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
191 nfs_callback_down(clp->cl_minorversion); 195 nfs_callback_down(clp->cl_mvops->minor_version);
192} 196}
193 197
194static void nfs4_shutdown_client(struct nfs_client *clp) 198static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -251,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
251 nfs_free_client(clp); 255 nfs_free_client(clp);
252 } 256 }
253} 257}
258EXPORT_SYMBOL_GPL(nfs_put_client);
254 259
255#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 260#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
256/* 261/*
@@ -274,7 +279,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
274 sin1->sin6_scope_id != sin2->sin6_scope_id) 279 sin1->sin6_scope_id != sin2->sin6_scope_id)
275 return 0; 280 return 0;
276 281
277 return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); 282 return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
278} 283}
279#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ 284#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
280static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, 285static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
@@ -600,6 +605,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
600{ 605{
601 struct rpc_clnt *clnt = NULL; 606 struct rpc_clnt *clnt = NULL;
602 struct rpc_create_args args = { 607 struct rpc_create_args args = {
608 .net = &init_net,
603 .protocol = clp->cl_proto, 609 .protocol = clp->cl_proto,
604 .address = (struct sockaddr *)&clp->cl_addr, 610 .address = (struct sockaddr *)&clp->cl_addr,
605 .addrsize = clp->cl_addrlen, 611 .addrsize = clp->cl_addrlen,
@@ -634,7 +640,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
634 */ 640 */
635static void nfs_destroy_server(struct nfs_server *server) 641static void nfs_destroy_server(struct nfs_server *server)
636{ 642{
637 if (!(server->flags & NFS_MOUNT_NONLM)) 643 if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) ||
644 !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
638 nlmclnt_done(server->nlm_host); 645 nlmclnt_done(server->nlm_host);
639} 646}
640 647
@@ -656,7 +663,8 @@ static int nfs_start_lockd(struct nfs_server *server)
656 663
657 if (nlm_init.nfs_version > 3) 664 if (nlm_init.nfs_version > 3)
658 return 0; 665 return 0;
659 if (server->flags & NFS_MOUNT_NONLM) 666 if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
667 (server->flags & NFS_MOUNT_LOCAL_FCNTL))
660 return 0; 668 return 0;
661 669
662 switch (clp->cl_proto) { 670 switch (clp->cl_proto) {
@@ -897,11 +905,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
897 if (server->wsize > NFS_MAX_FILE_IO_SIZE) 905 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
898 server->wsize = NFS_MAX_FILE_IO_SIZE; 906 server->wsize = NFS_MAX_FILE_IO_SIZE;
899 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 907 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
908 set_pnfs_layoutdriver(server, fsinfo->layouttype);
909
900 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); 910 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
901 911
902 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); 912 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
903 if (server->dtsize > PAGE_CACHE_SIZE) 913 if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
904 server->dtsize = PAGE_CACHE_SIZE; 914 server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
905 if (server->dtsize > server->rsize) 915 if (server->dtsize > server->rsize)
906 server->dtsize = server->rsize; 916 server->dtsize = server->rsize;
907 917
@@ -912,6 +922,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
912 922
913 server->maxfilesize = fsinfo->maxfilesize; 923 server->maxfilesize = fsinfo->maxfilesize;
914 924
925 server->time_delta = fsinfo->time_delta;
926
915	 /* We're airborne. Set socket buffersize. */	927	 /* We're airborne. Set socket buffersize. */
916 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); 928 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
917} 929}
@@ -934,6 +946,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
934 } 946 }
935 947
936 fsinfo.fattr = fattr; 948 fsinfo.fattr = fattr;
949 fsinfo.layouttype = 0;
937 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); 950 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
938 if (error < 0) 951 if (error < 0)
939 goto out_error; 952 goto out_error;
@@ -1016,6 +1029,7 @@ void nfs_free_server(struct nfs_server *server)
1016{ 1029{
1017 dprintk("--> nfs_free_server()\n"); 1030 dprintk("--> nfs_free_server()\n");
1018 1031
1032 unset_pnfs_layoutdriver(server);
1019 spin_lock(&nfs_client_lock); 1033 spin_lock(&nfs_client_lock);
1020 list_del(&server->client_link); 1034 list_del(&server->client_link);
1021 list_del(&server->master_link); 1035 list_del(&server->master_link);
@@ -1126,7 +1140,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
1126 return error; 1140 return error;
1127 } 1141 }
1128 1142
1129 error = nfs_callback_up(clp->cl_minorversion, 1143 error = nfs_callback_up(clp->cl_mvops->minor_version,
1130 clp->cl_rpcclient->cl_xprt); 1144 clp->cl_rpcclient->cl_xprt);
1131 if (error < 0) { 1145 if (error < 0) {
1132 dprintk("%s: failed to start callback. Error = %d\n", 1146 dprintk("%s: failed to start callback. Error = %d\n",
@@ -1143,10 +1157,8 @@ static int nfs4_init_callback(struct nfs_client *clp)
1143 */ 1157 */
1144static int nfs4_init_client_minor_version(struct nfs_client *clp) 1158static int nfs4_init_client_minor_version(struct nfs_client *clp)
1145{ 1159{
1146 clp->cl_call_sync = _nfs4_call_sync;
1147
1148#if defined(CONFIG_NFS_V4_1) 1160#if defined(CONFIG_NFS_V4_1)
1149 if (clp->cl_minorversion) { 1161 if (clp->cl_mvops->minor_version) {
1150 struct nfs4_session *session = NULL; 1162 struct nfs4_session *session = NULL;
1151 /* 1163 /*
1152 * Create the session and mark it expired. 1164 * Create the session and mark it expired.
@@ -1158,7 +1170,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
1158 return -ENOMEM; 1170 return -ENOMEM;
1159 1171
1160 clp->cl_session = session; 1172 clp->cl_session = session;
1161 clp->cl_call_sync = _nfs4_call_sync_session; 1173 /*
1174 * The create session reply races with the server back
1175 * channel probe. Mark the client NFS_CS_SESSION_INITING
1176 * so that the client back channel can find the
1177 * nfs_client struct
1178 */
1179 clp->cl_cons_state = NFS_CS_SESSION_INITING;
1162 } 1180 }
1163#endif /* CONFIG_NFS_V4_1 */ 1181#endif /* CONFIG_NFS_V4_1 */
1164 1182
@@ -1351,8 +1369,9 @@ static int nfs4_init_server(struct nfs_server *server,
1351 1369
1352 /* Initialise the client representation from the mount data */ 1370 /* Initialise the client representation from the mount data */
1353 server->flags = data->flags; 1371 server->flags = data->flags;
1354 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR| 1372 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
1355 NFS_CAP_POSIX_LOCK; 1373 if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
1374 server->caps |= NFS_CAP_READDIRPLUS;
1356 server->options = data->options; 1375 server->options = data->options;
1357 1376
1358 /* Get a client record */ 1377 /* Get a client record */
@@ -1454,7 +1473,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1454 data->authflavor, 1473 data->authflavor,
1455 parent_server->client->cl_xprt->prot, 1474 parent_server->client->cl_xprt->prot,
1456 parent_server->client->cl_timeout, 1475 parent_server->client->cl_timeout,
1457 parent_client->cl_minorversion); 1476 parent_client->cl_mvops->minor_version);
1458 if (error < 0) 1477 if (error < 0)
1459 goto error; 1478 goto error;
1460 1479
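
client.c above replaces the cl_call_sync pointer with cl_mvops, a per-minor-version ops table selected once at client init. A sketch of the shape such a table takes; the member signatures are assumptions, and only the names visible in this diff (minor_version, validate_stateid, nfs_v4_minor_ops, cl_mvops) are taken from it:

/* Hypothetical shape of the per-minor-version ops table. */
struct nfs4_minor_version_ops {
	u32 minor_version;
	int (*call_sync)(struct nfs_server *server, struct rpc_message *msg,
			 void *args, void *res);	/* assumed signature */
	int (*validate_stateid)(struct nfs_delegation *delegation,
				const nfs4_stateid *stateid);
};

static const struct nfs4_minor_version_ops *nfs_v4_minor_ops[2];

/* Selected once, then used everywhere instead of version branches:
 *	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
 *	clp->cl_mvops->validate_stateid(delegation, stateid);
 */
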
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 30163454397..232a7eead33 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -71,20 +71,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
71 if (inode->i_flock == NULL) 71 if (inode->i_flock == NULL)
72 goto out; 72 goto out;
73 73
74 /* Protect inode->i_flock using the BKL */ 74 /* Protect inode->i_flock using the file locks lock */
75 lock_kernel(); 75 lock_flocks();
76 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 76 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
77 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 77 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
78 continue; 78 continue;
79 if (nfs_file_open_context(fl->fl_file) != ctx) 79 if (nfs_file_open_context(fl->fl_file) != ctx)
80 continue; 80 continue;
81 unlock_kernel(); 81 unlock_flocks();
82 status = nfs4_lock_delegation_recall(state, fl); 82 status = nfs4_lock_delegation_recall(state, fl);
83 if (status < 0) 83 if (status < 0)
84 goto out; 84 goto out;
85 lock_kernel(); 85 lock_flocks();
86 } 86 }
87 unlock_kernel(); 87 unlock_flocks();
88out: 88out:
89 return status; 89 return status;
90} 90}
@@ -268,14 +268,6 @@ out:
268 return status; 268 return status;
269} 269}
270 270
271/* Sync all data to disk upon delegation return */
272static void nfs_msync_inode(struct inode *inode)
273{
274 filemap_fdatawrite(inode->i_mapping);
275 nfs_wb_all(inode);
276 filemap_fdatawait(inode->i_mapping);
277}
278
279/* 271/*
280 * Basic procedure for returning a delegation to the server 272 * Basic procedure for returning a delegation to the server
281 */ 273 */
@@ -367,7 +359,7 @@ int nfs_inode_return_delegation(struct inode *inode)
367 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); 359 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
368 spin_unlock(&clp->cl_lock); 360 spin_unlock(&clp->cl_lock);
369 if (delegation != NULL) { 361 if (delegation != NULL) {
370 nfs_msync_inode(inode); 362 nfs_wb_all(inode);
371 err = __nfs_inode_return_delegation(inode, delegation, 1); 363 err = __nfs_inode_return_delegation(inode, delegation, 1);
372 } 364 }
373 } 365 }
@@ -471,9 +463,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
471/* 463/*
472 * Asynchronous delegation recall! 464 * Asynchronous delegation recall!
473 */ 465 */
474int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, 466int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
475 int (*validate_stateid)(struct nfs_delegation *delegation,
476 const nfs4_stateid *stateid))
477{ 467{
478 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 468 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
479 struct nfs_delegation *delegation; 469 struct nfs_delegation *delegation;
@@ -481,7 +471,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
481 rcu_read_lock(); 471 rcu_read_lock();
482 delegation = rcu_dereference(NFS_I(inode)->delegation); 472 delegation = rcu_dereference(NFS_I(inode)->delegation);
483 473
484 if (!validate_stateid(delegation, stateid)) { 474 if (!clp->cl_mvops->validate_stateid(delegation, stateid)) {
485 rcu_read_unlock(); 475 rcu_read_unlock();
486 return -ENOENT; 476 return -ENOENT;
487 } 477 }
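
delegation.c above swaps the BKL for lock_flocks() around the i_flock walk; the lock still has to be dropped across the recall RPC because the RPC sleeps. The pattern, sketched with an assumed do_recall() helper:

#include <linux/fs.h>

static int recall_locks(struct inode *inode)
{
	struct file_lock *fl;
	int status = 0;

	lock_flocks();
	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
		if (!(fl->fl_flags & (FL_POSIX | FL_FLOCK)))
			continue;
		unlock_flocks();		/* the recall RPC sleeps */
		status = do_recall(fl);		/* assumed helper */
		if (status < 0)
			return status;
		lock_flocks();			/* note: fl may be stale here */
	}
	unlock_flocks();
	return status;
}
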
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 69e7b814012..2026304bda1 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,9 +34,7 @@ enum {
34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
36int nfs_inode_return_delegation(struct inode *inode); 36int nfs_inode_return_delegation(struct inode *inode);
37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, 37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
38 int (*validate_stateid)(struct nfs_delegation *delegation,
39 const nfs4_stateid *stateid));
40void nfs_inode_return_delegation_noreclaim(struct inode *inode); 38void nfs_inode_return_delegation_noreclaim(struct inode *inode);
41 39
42struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); 40struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e60416d3f81..07ac3847e56 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,11 +33,12 @@
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/vmalloc.h>
36 37
37#include "nfs4_fs.h"
38#include "delegation.h" 38#include "delegation.h"
39#include "iostat.h" 39#include "iostat.h"
40#include "internal.h" 40#include "internal.h"
41#include "fscache.h"
41 42
42/* #define NFS_DEBUG_VERBOSE 1 */ 43/* #define NFS_DEBUG_VERBOSE 1 */
43 44
@@ -55,6 +56,7 @@ static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 56 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, int); 57static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 58static loff_t nfs_llseek_dir(struct file *, loff_t, int);
59static int nfs_readdir_clear_array(struct page*, gfp_t);
58 60
59const struct file_operations nfs_dir_operations = { 61const struct file_operations nfs_dir_operations = {
60 .llseek = nfs_llseek_dir, 62 .llseek = nfs_llseek_dir,
@@ -80,6 +82,10 @@ const struct inode_operations nfs_dir_inode_operations = {
80 .setattr = nfs_setattr, 82 .setattr = nfs_setattr,
81}; 83};
82 84
85const struct address_space_operations nfs_dir_addr_space_ops = {
86 .releasepage = nfs_readdir_clear_array,
87};
88
83#ifdef CONFIG_NFS_V3 89#ifdef CONFIG_NFS_V3
84const struct inode_operations nfs3_dir_inode_operations = { 90const struct inode_operations nfs3_dir_inode_operations = {
85 .create = nfs_create, 91 .create = nfs_create,
@@ -104,8 +110,9 @@ const struct inode_operations nfs3_dir_inode_operations = {
104#ifdef CONFIG_NFS_V4 110#ifdef CONFIG_NFS_V4
105 111
106static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); 112static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
113static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
107const struct inode_operations nfs4_dir_inode_operations = { 114const struct inode_operations nfs4_dir_inode_operations = {
108 .create = nfs_create, 115 .create = nfs_open_create,
109 .lookup = nfs_atomic_lookup, 116 .lookup = nfs_atomic_lookup,
110 .link = nfs_link, 117 .link = nfs_link,
111 .unlink = nfs_unlink, 118 .unlink = nfs_unlink,
@@ -140,54 +147,207 @@ nfs_opendir(struct inode *inode, struct file *filp)
140 147
141 /* Call generic open code in order to cache credentials */ 148 /* Call generic open code in order to cache credentials */
142 res = nfs_open(inode, filp); 149 res = nfs_open(inode, filp);
150 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
151 /* This is a mountpoint, so d_revalidate will never
152 * have been called, so we need to refresh the
153 * inode (for close-open consistency) ourselves.
154 */
155 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
156 }
143 return res; 157 return res;
144} 158}
145 159
146typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int); 160struct nfs_cache_array_entry {
161 u64 cookie;
162 u64 ino;
163 struct qstr string;
164};
165
166struct nfs_cache_array {
167 unsigned int size;
168 int eof_index;
169 u64 last_cookie;
170 struct nfs_cache_array_entry array[0];
171};
172
173#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
174
175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
147typedef struct { 176typedef struct {
148 struct file *file; 177 struct file *file;
149 struct page *page; 178 struct page *page;
150 unsigned long page_index; 179 unsigned long page_index;
151 __be32 *ptr;
152 u64 *dir_cookie; 180 u64 *dir_cookie;
153 loff_t current_index; 181 loff_t current_index;
154 struct nfs_entry *entry;
155 decode_dirent_t decode; 182 decode_dirent_t decode;
156 int plus; 183
157 unsigned long timestamp; 184 unsigned long timestamp;
158 unsigned long gencount; 185 unsigned long gencount;
159 int timestamp_valid; 186 unsigned int cache_entry_index;
187 unsigned int plus:1;
188 unsigned int eof:1;
160} nfs_readdir_descriptor_t; 189} nfs_readdir_descriptor_t;
161 190
162/* Now we cache directories properly, by stuffing the dirent 191/*
163 * data directly in the page cache. 192 * The caller is responsible for calling nfs_readdir_release_array(page)
164 *
165 * Inode invalidation due to refresh etc. takes care of
166 * _everything_, no sloppy entry flushing logic, no extraneous
167 * copying, network direct to page cache, the way it was meant
168 * to be.
169 *
170 * NOTE: Dirent information verification is done always by the
 171 * page-in of the RPC reply, nowhere else, this simplifies
172 * things substantially.
173 */ 193 */
174static 194static
175int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) 195struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
196{
197 if (page == NULL)
198 return ERR_PTR(-EIO);
199 return (struct nfs_cache_array *)kmap(page);
200}
201
202static
203void nfs_readdir_release_array(struct page *page)
204{
205 kunmap(page);
206}
207
208/*
 209 * we are freeing strings created by nfs_readdir_add_to_array()
210 */
211static
212int nfs_readdir_clear_array(struct page *page, gfp_t mask)
213{
214 struct nfs_cache_array *array = nfs_readdir_get_array(page);
215 int i;
216 for (i = 0; i < array->size; i++)
217 kfree(array->array[i].string.name);
218 nfs_readdir_release_array(page);
219 return 0;
220}
221
222/*
223 * the caller is responsible for freeing qstr.name
224 * when called by nfs_readdir_add_to_array, the strings will be freed in
 225 * nfs_readdir_clear_array()
226 */
227static
228int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
229{
230 string->len = len;
231 string->name = kmemdup(name, len, GFP_KERNEL);
232 if (string->name == NULL)
233 return -ENOMEM;
234 string->hash = full_name_hash(name, len);
235 return 0;
236}
237
238static
239int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
240{
241 struct nfs_cache_array *array = nfs_readdir_get_array(page);
242 struct nfs_cache_array_entry *cache_entry;
243 int ret;
244
245 if (IS_ERR(array))
246 return PTR_ERR(array);
247 ret = -EIO;
248 if (array->size >= MAX_READDIR_ARRAY)
249 goto out;
250
251 cache_entry = &array->array[array->size];
252 cache_entry->cookie = entry->prev_cookie;
253 cache_entry->ino = entry->ino;
254 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
255 if (ret)
256 goto out;
257 array->last_cookie = entry->cookie;
258 if (entry->eof == 1)
259 array->eof_index = array->size;
260 array->size++;
261out:
262 nfs_readdir_release_array(page);
263 return ret;
264}
265
266static
267int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
268{
269 loff_t diff = desc->file->f_pos - desc->current_index;
270 unsigned int index;
271
272 if (diff < 0)
273 goto out_eof;
274 if (diff >= array->size) {
275 if (array->eof_index > 0)
276 goto out_eof;
277 desc->current_index += array->size;
278 return -EAGAIN;
279 }
280
281 index = (unsigned int)diff;
282 *desc->dir_cookie = array->array[index].cookie;
283 desc->cache_entry_index = index;
284 if (index == array->eof_index)
285 desc->eof = 1;
286 return 0;
287out_eof:
288 desc->eof = 1;
289 return -EBADCOOKIE;
290}
291
292static
293int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
294{
295 int i;
296 int status = -EAGAIN;
297
298 for (i = 0; i < array->size; i++) {
299 if (i == array->eof_index) {
300 desc->eof = 1;
301 status = -EBADCOOKIE;
302 }
303 if (array->array[i].cookie == *desc->dir_cookie) {
304 desc->cache_entry_index = i;
305 status = 0;
306 break;
307 }
308 }
309
310 return status;
311}
312
313static
314int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
315{
316 struct nfs_cache_array *array;
317 int status = -EBADCOOKIE;
318
319 if (desc->dir_cookie == NULL)
320 goto out;
321
322 array = nfs_readdir_get_array(desc->page);
323 if (IS_ERR(array)) {
324 status = PTR_ERR(array);
325 goto out;
326 }
327
328 if (*desc->dir_cookie == 0)
329 status = nfs_readdir_search_for_pos(array, desc);
330 else
331 status = nfs_readdir_search_for_cookie(array, desc);
332
333 nfs_readdir_release_array(desc->page);
334out:
335 return status;
336}
337
338/* Fill a page with xdr information before transferring to the cache page */
339static
340int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
341 struct nfs_entry *entry, struct file *file, struct inode *inode)
176{ 342{
177 struct file *file = desc->file;
178 struct inode *inode = file->f_path.dentry->d_inode;
179 struct rpc_cred *cred = nfs_file_cred(file); 343 struct rpc_cred *cred = nfs_file_cred(file);
180 unsigned long timestamp, gencount; 344 unsigned long timestamp, gencount;
181 int error; 345 int error;
182 346
183 dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
184 __func__, (long long)desc->entry->cookie,
185 page->index);
186
187 again: 347 again:
188 timestamp = jiffies; 348 timestamp = jiffies;
189 gencount = nfs_inc_attr_generation_counter(); 349 gencount = nfs_inc_attr_generation_counter();
190 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, 350 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
191 NFS_SERVER(inode)->dtsize, desc->plus); 351 NFS_SERVER(inode)->dtsize, desc->plus);
192 if (error < 0) { 352 if (error < 0) {
193 /* We requested READDIRPLUS, but the server doesn't grok it */ 353 /* We requested READDIRPLUS, but the server doesn't grok it */
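The new readdir cache keeps decoded entries in a per-page nfs_cache_array whose trailing array member fills the remainder of the page, which is exactly what the MAX_READDIR_ARRAY arithmetic expresses. A standalone sketch of that arithmetic, using reduced copies of the structures above (assuming 4 KiB pages and an LP64 target; exact counts depend on padding and word size):

#include <stdio.h>

/* Reduced stand-ins for the kernel structures, for sizing only.
 * The flexible array member is equivalent to the kernel's array[0].
 */
struct qstr { unsigned int hash; unsigned int len; const char *name; };
struct nfs_cache_array_entry {
	unsigned long long cookie;
	unsigned long long ino;
	struct qstr string;
};
struct nfs_cache_array {
	unsigned int size;
	int eof_index;
	unsigned long long last_cookie;
	struct nfs_cache_array_entry array[];
};

int main(void)
{
	const unsigned long page_size = 4096;	/* assumed PAGE_SIZE */
	unsigned long n = (page_size - sizeof(struct nfs_cache_array))
				/ sizeof(struct nfs_cache_array_entry);

	printf("header %zu bytes, entry %zu bytes, %lu entries/page\n",
	       sizeof(struct nfs_cache_array),
	       sizeof(struct nfs_cache_array_entry), n);
	return 0;
}

On such a target this prints a 16-byte header, 32-byte entries, and 127 entries per page. The names themselves live in separate kmemdup() allocations, which is why nfs_readdir_clear_array(), wired up as the ->releasepage hook above, must walk the array and kfree() each one before the page goes away.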
@@ -201,190 +361,292 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
201 } 361 }
202 desc->timestamp = timestamp; 362 desc->timestamp = timestamp;
203 desc->gencount = gencount; 363 desc->gencount = gencount;
204 desc->timestamp_valid = 1; 364error:
205 SetPageUptodate(page); 365 return error;
206 /* Ensure consistent page alignment of the data.
207 * Note: assumes we have exclusive access to this mapping either
208 * through inode->i_mutex or some other mechanism.
209 */
210 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
211 /* Should never happen */
212 nfs_zap_mapping(inode, inode->i_mapping);
213 }
214 unlock_page(page);
215 return 0;
216 error:
217 unlock_page(page);
218 return -EIO;
219} 366}
220 367
221static inline 368/* Fill in an entry based on the xdr code stored in desc->page */
222int dir_decode(nfs_readdir_descriptor_t *desc) 369static
370int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
223{ 371{
224 __be32 *p = desc->ptr; 372 __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
225 p = desc->decode(p, desc->entry, desc->plus);
226 if (IS_ERR(p)) 373 if (IS_ERR(p))
227 return PTR_ERR(p); 374 return PTR_ERR(p);
228 desc->ptr = p; 375
229 if (desc->timestamp_valid) { 376 entry->fattr->time_start = desc->timestamp;
230 desc->entry->fattr->time_start = desc->timestamp; 377 entry->fattr->gencount = desc->gencount;
231 desc->entry->fattr->gencount = desc->gencount;
232 } else
233 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
234 return 0; 378 return 0;
235} 379}
236 380
237static inline 381static
238void dir_page_release(nfs_readdir_descriptor_t *desc) 382int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
239{ 383{
240 kunmap(desc->page); 384 struct nfs_inode *node;
241 page_cache_release(desc->page); 385 if (dentry->d_inode == NULL)
242 desc->page = NULL; 386 goto different;
243 desc->ptr = NULL; 387 node = NFS_I(dentry->d_inode);
388 if (node->fh.size != entry->fh->size)
389 goto different;
390 if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
391 goto different;
392 return 1;
393different:
394 return 0;
244} 395}
245 396
246/* 397static
247 * Given a pointer to a buffer that has already been filled by a call 398void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
248 * to readdir, find the next entry with cookie '*desc->dir_cookie'.
249 *
250 * If the end of the buffer has been reached, return -EAGAIN, if not,
251 * return the offset within the buffer of the next entry to be
252 * read.
253 */
254static inline
255int find_dirent(nfs_readdir_descriptor_t *desc)
256{ 399{
257 struct nfs_entry *entry = desc->entry; 400 struct qstr filename = {
258 int loop_count = 0, 401 .len = entry->len,
259 status; 402 .name = entry->name,
403 };
404 struct dentry *dentry;
405 struct dentry *alias;
406 struct inode *dir = parent->d_inode;
407 struct inode *inode;
260 408
261 while((status = dir_decode(desc)) == 0) { 409 if (filename.name[0] == '.') {
262 dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", 410 if (filename.len == 1)
263 __func__, (unsigned long long)entry->cookie); 411 return;
264 if (entry->prev_cookie == *desc->dir_cookie) 412 if (filename.len == 2 && filename.name[1] == '.')
265 break; 413 return;
266 if (loop_count++ > 200) { 414 }
267 loop_count = 0; 415 filename.hash = full_name_hash(filename.name, filename.len);
268 schedule(); 416
417 dentry = d_lookup(parent, &filename);
418 if (dentry != NULL) {
419 if (nfs_same_file(dentry, entry)) {
420 nfs_refresh_inode(dentry->d_inode, entry->fattr);
421 goto out;
422 } else {
423 d_drop(dentry);
424 dput(dentry);
269 } 425 }
270 } 426 }
271 return status; 427
428 dentry = d_alloc(parent, &filename);
429 if (dentry == NULL)
430 return;
431
432 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
433 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
434 if (IS_ERR(inode))
435 goto out;
436
437 alias = d_materialise_unique(dentry, inode);
438 if (IS_ERR(alias))
439 goto out;
440 else if (alias) {
441 nfs_set_verifier(alias, nfs_save_change_attribute(dir));
442 dput(alias);
443 } else
444 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
445
446out:
447 dput(dentry);
448}
449
450/* Perform conversion from xdr to cache array */
451static
452void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
453 void *xdr_page, struct page *page, unsigned int buflen)
454{
455 struct xdr_stream stream;
456 struct xdr_buf buf;
457 __be32 *ptr = xdr_page;
458 int status;
459 struct nfs_cache_array *array;
460
461 buf.head->iov_base = xdr_page;
462 buf.head->iov_len = buflen;
463 buf.tail->iov_len = 0;
464 buf.page_base = 0;
465 buf.page_len = 0;
466 buf.buflen = buf.head->iov_len;
467 buf.len = buf.head->iov_len;
468
469 xdr_init_decode(&stream, &buf, ptr);
470
471
472 do {
473 status = xdr_decode(desc, entry, &stream);
474 if (status != 0)
475 break;
476
477 if (nfs_readdir_add_to_array(entry, page) == -1)
478 break;
479 if (desc->plus == 1)
480 nfs_prime_dcache(desc->file->f_path.dentry, entry);
481 } while (!entry->eof);
482
483 if (status == -EBADCOOKIE && entry->eof) {
484 array = nfs_readdir_get_array(page);
485 array->eof_index = array->size - 1;
486 status = 0;
487 nfs_readdir_release_array(page);
488 }
489}
490
491static
492void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
493{
494 unsigned int i;
495 for (i = 0; i < npages; i++)
496 put_page(pages[i]);
497}
498
499static
500void nfs_readdir_free_large_page(void *ptr, struct page **pages,
501 unsigned int npages)
502{
503 vm_unmap_ram(ptr, npages);
504 nfs_readdir_free_pagearray(pages, npages);
272} 505}
273 506
274/* 507/*
275 * Given a pointer to a buffer that has already been filled by a call 508 * nfs_readdir_large_page will allocate pages that must be freed with a call
276 * to readdir, find the entry at offset 'desc->file->f_pos'. 509 * to nfs_readdir_free_large_page
277 *
278 * If the end of the buffer has been reached, return -EAGAIN, if not,
279 * return the offset within the buffer of the next entry to be
280 * read.
281 */ 510 */
282static inline 511static
283int find_dirent_index(nfs_readdir_descriptor_t *desc) 512void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
284{ 513{
285 struct nfs_entry *entry = desc->entry; 514 void *ptr;
286 int loop_count = 0, 515 unsigned int i;
287 status; 516
517 for (i = 0; i < npages; i++) {
518 struct page *page = alloc_page(GFP_KERNEL);
519 if (page == NULL)
520 goto out_freepages;
521 pages[i] = page;
522 }
288 523
289 for(;;) { 524 ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
290 status = dir_decode(desc); 525 if (!IS_ERR_OR_NULL(ptr))
291 if (status) 526 return ptr;
292 break; 527out_freepages:
528 nfs_readdir_free_pagearray(pages, i);
529 return NULL;
530}
531
532static
533int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
534{
535 struct page *pages[NFS_MAX_READDIR_PAGES];
536 void *pages_ptr = NULL;
537 struct nfs_entry entry;
538 struct file *file = desc->file;
539 struct nfs_cache_array *array;
540 int status = 0;
541 unsigned int array_size = ARRAY_SIZE(pages);
542
543 entry.prev_cookie = 0;
544 entry.cookie = *desc->dir_cookie;
545 entry.eof = 0;
546 entry.fh = nfs_alloc_fhandle();
547 entry.fattr = nfs_alloc_fattr();
548 if (entry.fh == NULL || entry.fattr == NULL)
549 goto out;
550
551 array = nfs_readdir_get_array(page);
552 memset(array, 0, sizeof(struct nfs_cache_array));
553 array->eof_index = -1;
293 554
294 dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n", 555 pages_ptr = nfs_readdir_large_page(pages, array_size);
295 (unsigned long long)entry->cookie, desc->current_index); 556 if (!pages_ptr)
557 goto out_release_array;
558 do {
559 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
296 560
297 if (desc->file->f_pos == desc->current_index) { 561 if (status < 0)
298 *desc->dir_cookie = entry->cookie;
299 break; 562 break;
300 } 563 nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
301 desc->current_index++; 564 } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
302 if (loop_count++ > 200) { 565
303 loop_count = 0; 566 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
304 schedule(); 567out_release_array:
305 } 568 nfs_readdir_release_array(page);
306 } 569out:
570 nfs_free_fattr(entry.fattr);
571 nfs_free_fhandle(entry.fh);
307 return status; 572 return status;
308} 573}
309 574
310/* 575/*
311 * Find the given page, and call find_dirent() or find_dirent_index in 576 * Now we cache directories properly, by converting xdr information
312 * order to try to return the next entry. 577 * to an array that can be used for lookups later. This results in
578 * fewer cache pages, since we can store more information on each page.
579 * We only need to convert from xdr once so future lookups are much simpler
313 */ 580 */
314static inline 581static
315int find_dirent_page(nfs_readdir_descriptor_t *desc) 582int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
316{ 583{
317 struct inode *inode = desc->file->f_path.dentry->d_inode; 584 struct inode *inode = desc->file->f_path.dentry->d_inode;
318 struct page *page;
319 int status;
320 585
321 dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", 586 if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
322 __func__, desc->page_index, 587 goto error;
323 (long long) *desc->dir_cookie); 588 SetPageUptodate(page);
324 589
325 /* If we find the page in the page_cache, we cannot be sure 590 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
326 * how fresh the data is, so we will ignore readdir_plus attributes. 591 /* Should never happen */
327 */ 592 nfs_zap_mapping(inode, inode->i_mapping);
328 desc->timestamp_valid = 0;
329 page = read_cache_page(inode->i_mapping, desc->page_index,
330 (filler_t *)nfs_readdir_filler, desc);
331 if (IS_ERR(page)) {
332 status = PTR_ERR(page);
333 goto out;
334 } 593 }
594 unlock_page(page);
595 return 0;
596 error:
597 unlock_page(page);
598 return -EIO;
599}
335 600
336 /* NOTE: Someone else may have changed the READDIRPLUS flag */ 601static
337 desc->page = page; 602void cache_page_release(nfs_readdir_descriptor_t *desc)
338 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 603{
339 if (*desc->dir_cookie != 0) 604 page_cache_release(desc->page);
340 status = find_dirent(desc); 605 desc->page = NULL;
341 else 606}
342 status = find_dirent_index(desc); 607
343 if (status < 0) 608static
344 dir_page_release(desc); 609struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
345 out: 610{
346 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); 611 struct page *page;
347 return status; 612 page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
613 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
614 if (IS_ERR(page))
615 desc->eof = 1;
616 return page;
348} 617}
349 618
350/* 619/*
351 * Recurse through the page cache pages, and return a 620 * Returns 0 if desc->dir_cookie was found on page desc->page_index
352 * filled nfs_entry structure of the next directory entry if possible.
353 *
354 * The target for the search is '*desc->dir_cookie' if non-0,
355 * 'desc->file->f_pos' otherwise
356 */ 621 */
622static
623int find_cache_page(nfs_readdir_descriptor_t *desc)
624{
625 int res;
626
627 desc->page = get_cache_page(desc);
628 if (IS_ERR(desc->page))
629 return PTR_ERR(desc->page);
630
631 res = nfs_readdir_search_array(desc);
632 if (res == 0)
633 return 0;
634 cache_page_release(desc);
635 return res;
636}
637
638/* Search for desc->dir_cookie from the beginning of the page cache */
357static inline 639static inline
358int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 640int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
359{ 641{
360 int loop_count = 0; 642 int res = -EAGAIN;
361 int res;
362
363 /* Always search-by-index from the beginning of the cache */
364 if (*desc->dir_cookie == 0) {
365 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n",
366 (long long)desc->file->f_pos);
367 desc->page_index = 0;
368 desc->entry->cookie = desc->entry->prev_cookie = 0;
369 desc->entry->eof = 0;
370 desc->current_index = 0;
371 } else
372 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
373 (unsigned long long)*desc->dir_cookie);
374 643
375 for (;;) { 644 while (1) {
376 res = find_dirent_page(desc); 645 res = find_cache_page(desc);
377 if (res != -EAGAIN) 646 if (res != -EAGAIN)
378 break; 647 break;
379 /* Align to beginning of next page */ 648 desc->page_index++;
380 desc->page_index ++;
381 if (loop_count++ > 200) {
382 loop_count = 0;
383 schedule();
384 }
385 } 649 }
386
387 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
388 return res; 650 return res;
389} 651}
390 652
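nfs_readdir_xdr_to_array() wants one virtually contiguous scratch buffer big enough for a full READDIR reply, but only has an array of order-0 pages; vm_map_ram() provides the contiguity, and vm_unmap_ram() must run before the pages are released. A minimal sketch of that discipline (function names here are illustrative, not the kernel's):

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Map npages freshly allocated pages into one contiguous VA range. */
static void *scratch_map(struct page **pages, unsigned int npages)
{
	unsigned int i;
	void *ptr;

	for (i = 0; i < npages; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (pages[i] == NULL)
			goto undo;
	}
	/* node 0 and PAGE_KERNEL, as the hunk above uses */
	ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
	if (ptr == NULL)
		goto undo;	/* i == npages here: frees them all */
	return ptr;
undo:
	while (i--)
		put_page(pages[i]);
	return NULL;
}

static void scratch_unmap(void *ptr, struct page **pages, unsigned int npages)
{
	unsigned int i;

	vm_unmap_ram(ptr, npages);	/* tear down the mapping first... */
	for (i = 0; i < npages; i++)
		put_page(pages[i]);	/* ...then drop the backing pages */
}

Note that the hunk checks the vm_map_ram() result with IS_ERR_OR_NULL() defensively; on failure the partially built page array is unwound the same way as here.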
@@ -393,8 +655,6 @@ static inline unsigned int dt_type(struct inode *inode)
393 return (inode->i_mode >> 12) & 15; 655 return (inode->i_mode >> 12) & 15;
394} 656}
395 657
396static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
397
398/* 658/*
399 * Once we've found the start of the dirent within a page: fill 'er up... 659 * Once we've found the start of the dirent within a page: fill 'er up...
400 */ 660 */
@@ -403,49 +663,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
403 filldir_t filldir) 663 filldir_t filldir)
404{ 664{
405 struct file *file = desc->file; 665 struct file *file = desc->file;
406 struct nfs_entry *entry = desc->entry; 666 int i = 0;
407 struct dentry *dentry = NULL; 667 int res = 0;
408 u64 fileid; 668 struct nfs_cache_array *array = NULL;
409 int loop_count = 0, 669 unsigned int d_type = DT_UNKNOWN;
410 res; 670 struct dentry *dentry = NULL;
411
412 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
413 (unsigned long long)entry->cookie);
414
415 for(;;) {
416 unsigned d_type = DT_UNKNOWN;
417 /* Note: entry->prev_cookie contains the cookie for
418 * retrieving the current dirent on the server */
419 fileid = entry->ino;
420
421 /* Get a dentry if we have one */
422 if (dentry != NULL)
423 dput(dentry);
424 dentry = nfs_readdir_lookup(desc);
425 671
426 /* Use readdirplus info */ 672 array = nfs_readdir_get_array(desc->page);
427 if (dentry != NULL && dentry->d_inode != NULL) {
428 d_type = dt_type(dentry->d_inode);
429 fileid = NFS_FILEID(dentry->d_inode);
430 }
431 673
432 res = filldir(dirent, entry->name, entry->len, 674 for (i = desc->cache_entry_index; i < array->size; i++) {
433 file->f_pos, nfs_compat_user_ino64(fileid), 675 d_type = DT_UNKNOWN;
434 d_type); 676
677 res = filldir(dirent, array->array[i].string.name,
678 array->array[i].string.len, file->f_pos,
679 nfs_compat_user_ino64(array->array[i].ino), d_type);
435 if (res < 0) 680 if (res < 0)
436 break; 681 break;
437 file->f_pos++; 682 file->f_pos++;
438 *desc->dir_cookie = entry->cookie; 683 desc->cache_entry_index = i;
439 if (dir_decode(desc) != 0) { 684 if (i < (array->size-1))
440 desc->page_index ++; 685 *desc->dir_cookie = array->array[i+1].cookie;
686 else
687 *desc->dir_cookie = array->last_cookie;
688 if (i == array->eof_index) {
689 desc->eof = 1;
441 break; 690 break;
442 } 691 }
443 if (loop_count++ > 200) {
444 loop_count = 0;
445 schedule();
446 }
447 } 692 }
448 dir_page_release(desc); 693
694 nfs_readdir_release_array(desc->page);
695 cache_page_release(desc);
449 if (dentry != NULL) 696 if (dentry != NULL)
450 dput(dentry); 697 dput(dentry);
451 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 698 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
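With the entries predecoded, nfs_do_filldir() reduces to a cursor walk: each name emitted bumps f_pos by one slot, and the cookie stored for the next RPC is the following entry's cookie, falling back to the array's last_cookie once the page is spent. The cursor logic in isolation (a sketch; the filldir callback and inode-number handling are stubbed out):

/* Emit entries from 'start'; returns the index we stopped at. */
static unsigned int fill_from_array(struct nfs_cache_array *array,
				    unsigned int start, u64 *dir_cookie,
				    int *eof,
				    int (*emit)(const struct nfs_cache_array_entry *))
{
	unsigned int i;

	for (i = start; i < array->size; i++) {
		if (emit(&array->array[i]) < 0)
			break;
		/* resume token: the cookie of the *next* entry */
		if (i + 1 < array->size)
			*dir_cookie = array->array[i + 1].cookie;
		else
			*dir_cookie = array->last_cookie;
		if ((int)i == array->eof_index) {
			*eof = 1;
			break;
		}
	}
	return i;
}

The one subtlety is that the cookie advances only after a successful emit, so a short user buffer simply resumes at the same entry on the next getdents() call.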
@@ -469,12 +716,9 @@ static inline
469int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 716int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
470 filldir_t filldir) 717 filldir_t filldir)
471{ 718{
472 struct file *file = desc->file;
473 struct inode *inode = file->f_path.dentry->d_inode;
474 struct rpc_cred *cred = nfs_file_cred(file);
475 struct page *page = NULL; 719 struct page *page = NULL;
476 int status; 720 int status;
477 unsigned long timestamp, gencount; 721 struct inode *inode = desc->file->f_path.dentry->d_inode;
478 722
479 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 723 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
480 (unsigned long long)*desc->dir_cookie); 724 (unsigned long long)*desc->dir_cookie);
@@ -484,38 +728,22 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
484 status = -ENOMEM; 728 status = -ENOMEM;
485 goto out; 729 goto out;
486 } 730 }
487 timestamp = jiffies; 731
488 gencount = nfs_inc_attr_generation_counter(); 732 if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
489 status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
490 *desc->dir_cookie, page,
491 NFS_SERVER(inode)->dtsize,
492 desc->plus);
493 desc->page = page;
494 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
495 if (status >= 0) {
496 desc->timestamp = timestamp;
497 desc->gencount = gencount;
498 desc->timestamp_valid = 1;
499 if ((status = dir_decode(desc)) == 0)
500 desc->entry->prev_cookie = *desc->dir_cookie;
501 } else
502 status = -EIO; 733 status = -EIO;
503 if (status < 0)
504 goto out_release; 734 goto out_release;
735 }
505 736
737 desc->page_index = 0;
738 desc->page = page;
506 status = nfs_do_filldir(desc, dirent, filldir); 739 status = nfs_do_filldir(desc, dirent, filldir);
507 740
508 /* Reset read descriptor so it searches the page cache from
509 * the start upon the next call to readdir_search_pagecache() */
510 desc->page_index = 0;
511 desc->entry->cookie = desc->entry->prev_cookie = 0;
512 desc->entry->eof = 0;
513 out: 741 out:
514 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 742 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
515 __func__, status); 743 __func__, status);
516 return status; 744 return status;
517 out_release: 745 out_release:
518 dir_page_release(desc); 746 cache_page_release(desc);
519 goto out; 747 goto out;
520} 748}
521 749
@@ -529,7 +757,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
529 struct inode *inode = dentry->d_inode; 757 struct inode *inode = dentry->d_inode;
530 nfs_readdir_descriptor_t my_desc, 758 nfs_readdir_descriptor_t my_desc,
531 *desc = &my_desc; 759 *desc = &my_desc;
532 struct nfs_entry my_entry;
533 int res = -ENOMEM; 760 int res = -ENOMEM;
534 761
535 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 762 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -550,26 +777,17 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
550 desc->decode = NFS_PROTO(inode)->decode_dirent; 777 desc->decode = NFS_PROTO(inode)->decode_dirent;
551 desc->plus = NFS_USE_READDIRPLUS(inode); 778 desc->plus = NFS_USE_READDIRPLUS(inode);
552 779
553 my_entry.cookie = my_entry.prev_cookie = 0;
554 my_entry.eof = 0;
555 my_entry.fh = nfs_alloc_fhandle();
556 my_entry.fattr = nfs_alloc_fattr();
557 if (my_entry.fh == NULL || my_entry.fattr == NULL)
558 goto out_alloc_failed;
559
560 desc->entry = &my_entry;
561
562 nfs_block_sillyrename(dentry); 780 nfs_block_sillyrename(dentry);
563 res = nfs_revalidate_mapping(inode, filp->f_mapping); 781 res = nfs_revalidate_mapping(inode, filp->f_mapping);
564 if (res < 0) 782 if (res < 0)
565 goto out; 783 goto out;
566 784
567 while(!desc->entry->eof) { 785 while (desc->eof != 1) {
568 res = readdir_search_pagecache(desc); 786 res = readdir_search_pagecache(desc);
569 787
570 if (res == -EBADCOOKIE) { 788 if (res == -EBADCOOKIE) {
571 /* This means either end of directory */ 789 /* This means either end of directory */
572 if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) { 790 if (*desc->dir_cookie && desc->eof == 0) {
573 /* Or that the server has 'lost' a cookie */ 791 /* Or that the server has 'lost' a cookie */
574 res = uncached_readdir(desc, dirent, filldir); 792 res = uncached_readdir(desc, dirent, filldir);
575 if (res >= 0) 793 if (res >= 0)
@@ -581,8 +799,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
581 if (res == -ETOOSMALL && desc->plus) { 799 if (res == -ETOOSMALL && desc->plus) {
582 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 800 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
583 nfs_zap_caches(inode); 801 nfs_zap_caches(inode);
802 desc->page_index = 0;
584 desc->plus = 0; 803 desc->plus = 0;
585 desc->entry->eof = 0; 804 desc->eof = 0;
586 continue; 805 continue;
587 } 806 }
588 if (res < 0) 807 if (res < 0)
@@ -598,9 +817,6 @@ out:
598 nfs_unblock_sillyrename(dentry); 817 nfs_unblock_sillyrename(dentry);
599 if (res > 0) 818 if (res > 0)
600 res = 0; 819 res = 0;
601out_alloc_failed:
602 nfs_free_fattr(my_entry.fattr);
603 nfs_free_fhandle(my_entry.fh);
604 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", 820 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
605 dentry->d_parent->d_name.name, dentry->d_name.name, 821 dentry->d_parent->d_name.name, dentry->d_name.name,
606 res); 822 res);
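Taken together, nfs_readdir() now drives three states off the descriptor's eof bit instead of a dummy nfs_entry: normal paging through the cache, an uncached one-page read when the server forgets a cookie, and a readdirplus downgrade on -ETOOSMALL. In outline (a sketch of the loop in the hunks above; statistics, cache zapping, and the NFS_INO_ADVISE_RDPLUS clear are trimmed):

/* Sketch of the nfs_readdir() driver loop after this patch. */
while (!desc->eof) {
	res = readdir_search_pagecache(desc);
	if (res == -EBADCOOKIE) {
		if (*desc->dir_cookie && !desc->eof) {
			/* server 'lost' the cookie: one uncached page */
			res = uncached_readdir(desc, dirent, filldir);
			if (res >= 0)
				continue;
		}
		res = 0;	/* plain end of directory */
		break;
	}
	if (res == -ETOOSMALL && desc->plus) {
		/* server rejected READDIRPLUS: rewind, retry plain */
		desc->page_index = 0;
		desc->plus = 0;
		desc->eof = 0;
		continue;
	}
	if (res < 0)
		break;
	res = nfs_do_filldir(desc, dirent, filldir);
	if (res < 0)
		break;
}

The page_index reset added in the -ETOOSMALL branch matters: without it the retry would resume at a stale page index after the cache has been zapped.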
@@ -1022,10 +1238,63 @@ static int is_atomic_open(struct nameidata *nd)
1022 return 1; 1238 return 1;
1023} 1239}
1024 1240
1241static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
1242{
1243 struct path path = {
1244 .mnt = nd->path.mnt,
1245 .dentry = dentry,
1246 };
1247 struct nfs_open_context *ctx;
1248 struct rpc_cred *cred;
1249 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
1250
1251 cred = rpc_lookup_cred();
1252 if (IS_ERR(cred))
1253 return ERR_CAST(cred);
1254 ctx = alloc_nfs_open_context(&path, cred, fmode);
1255 put_rpccred(cred);
1256 if (ctx == NULL)
1257 return ERR_PTR(-ENOMEM);
1258 return ctx;
1259}
1260
1261static int do_open(struct inode *inode, struct file *filp)
1262{
1263 nfs_fscache_set_inode_cookie(inode, filp);
1264 return 0;
1265}
1266
1267static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
1268{
1269 struct file *filp;
1270 int ret = 0;
1271
1272 /* If the open_intent is for execute, we have an extra check to make */
1273 if (ctx->mode & FMODE_EXEC) {
1274 ret = nfs_may_open(ctx->path.dentry->d_inode,
1275 ctx->cred,
1276 nd->intent.open.flags);
1277 if (ret < 0)
1278 goto out;
1279 }
1280 filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
1281 if (IS_ERR(filp))
1282 ret = PTR_ERR(filp);
1283 else
1284 nfs_file_set_open_context(filp, ctx);
1285out:
1286 put_nfs_open_context(ctx);
1287 return ret;
1288}
1289
1025static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 1290static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1026{ 1291{
1292 struct nfs_open_context *ctx;
1293 struct iattr attr;
1027 struct dentry *res = NULL; 1294 struct dentry *res = NULL;
1028 int error; 1295 struct inode *inode;
1296 int open_flags;
1297 int err;
1029 1298
1030 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", 1299 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
1031 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1300 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1047,13 +1316,32 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1047 goto out; 1316 goto out;
1048 } 1317 }
1049 1318
1319 ctx = nameidata_to_nfs_open_context(dentry, nd);
1320 res = ERR_CAST(ctx);
1321 if (IS_ERR(ctx))
1322 goto out;
1323
1324 open_flags = nd->intent.open.flags;
1325 if (nd->flags & LOOKUP_CREATE) {
1326 attr.ia_mode = nd->intent.open.create_mode;
1327 attr.ia_valid = ATTR_MODE;
1328 if (!IS_POSIXACL(dir))
1329 attr.ia_mode &= ~current_umask();
1330 } else {
1331 open_flags &= ~(O_EXCL | O_CREAT);
1332 attr.ia_valid = 0;
1333 }
1334
1050 /* Open the file on the server */ 1335 /* Open the file on the server */
1051 res = nfs4_atomic_open(dir, dentry, nd); 1336 nfs_block_sillyrename(dentry->d_parent);
1052 if (IS_ERR(res)) { 1337 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1053 error = PTR_ERR(res); 1338 if (IS_ERR(inode)) {
1054 switch (error) { 1339 nfs_unblock_sillyrename(dentry->d_parent);
1340 put_nfs_open_context(ctx);
1341 switch (PTR_ERR(inode)) {
1055 /* Make a negative dentry */ 1342 /* Make a negative dentry */
1056 case -ENOENT: 1343 case -ENOENT:
1344 d_add(dentry, NULL);
1057 res = NULL; 1345 res = NULL;
1058 goto out; 1346 goto out;
1059 /* This turned out not to be a regular file */ 1347 /* This turned out not to be a regular file */
@@ -1065,11 +1353,25 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1065 goto no_open; 1353 goto no_open;
1066 /* case -EINVAL: */ 1354 /* case -EINVAL: */
1067 default: 1355 default:
1356 res = ERR_CAST(inode);
1068 goto out; 1357 goto out;
1069 } 1358 }
1070 } else if (res != NULL) 1359 }
1360 res = d_add_unique(dentry, inode);
1361 nfs_unblock_sillyrename(dentry->d_parent);
1362 if (res != NULL) {
1363 dput(ctx->path.dentry);
1364 ctx->path.dentry = dget(res);
1071 dentry = res; 1365 dentry = res;
1366 }
1367 err = nfs_intent_set_file(nd, ctx);
1368 if (err < 0) {
1369 if (res != NULL)
1370 dput(res);
1371 return ERR_PTR(err);
1372 }
1072out: 1373out:
1374 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1073 return res; 1375 return res;
1074no_open: 1376no_open:
1075 return nfs_lookup(dir, dentry, nd); 1377 return nfs_lookup(dir, dentry, nd);
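The rewritten nfs_atomic_lookup() above composes the three new helpers: build an open context from the lookup intent, ask the protocol to open on the server, then attach the context to the file the VFS instantiates. Stripped of sillyrename blocking, negative-dentry handling, and the d_add_unique() dance, the intended sequence is (a sketch under those omissions):

static int atomic_open_sketch(struct inode *dir, struct dentry *dentry,
			      struct nameidata *nd, int open_flags,
			      struct iattr *attr)
{
	struct nfs_open_context *ctx;
	struct inode *inode;

	ctx = nameidata_to_nfs_open_context(dentry, nd);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/* server-side OPEN; the context carries cred and mode */
	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, attr);
	if (IS_ERR(inode)) {
		put_nfs_open_context(ctx);
		return PTR_ERR(inode);
	}

	/* instantiate the struct file and hand it the context;
	 * nfs_intent_set_file() consumes the context reference */
	return nfs_intent_set_file(nd, ctx);
}

Note the reference discipline: every exit path either hands the context to the file or drops it with put_nfs_open_context(), which is why the error switch above puts the context before the ENOENT branch does its d_add().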
@@ -1080,12 +1382,15 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1080 struct dentry *parent = NULL; 1382 struct dentry *parent = NULL;
1081 struct inode *inode = dentry->d_inode; 1383 struct inode *inode = dentry->d_inode;
1082 struct inode *dir; 1384 struct inode *dir;
1385 struct nfs_open_context *ctx;
1083 int openflags, ret = 0; 1386 int openflags, ret = 0;
1084 1387
1085 if (!is_atomic_open(nd) || d_mountpoint(dentry)) 1388 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1086 goto no_open; 1389 goto no_open;
1390
1087 parent = dget_parent(dentry); 1391 parent = dget_parent(dentry);
1088 dir = parent->d_inode; 1392 dir = parent->d_inode;
1393
1089 /* We can't create new files in nfs_open_revalidate(), so we 1394 /* We can't create new files in nfs_open_revalidate(), so we
1090 * optimize away revalidation of negative dentries. 1395 * optimize away revalidation of negative dentries.
1091 */ 1396 */
@@ -1103,101 +1408,98 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1103 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) 1408 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1104 goto no_open_dput; 1409 goto no_open_dput;
1105 /* We can't create new files, or truncate existing ones here */ 1410 /* We can't create new files, or truncate existing ones here */
1106 openflags &= ~(O_CREAT|O_TRUNC); 1411 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
1107 1412
1413 ctx = nameidata_to_nfs_open_context(dentry, nd);
1414 ret = PTR_ERR(ctx);
1415 if (IS_ERR(ctx))
1416 goto out;
1108 /* 1417 /*
1109 * Note: we're not holding inode->i_mutex and so may be racing with 1418 * Note: we're not holding inode->i_mutex and so may be racing with
1110 * operations that change the directory. We therefore save the 1419 * operations that change the directory. We therefore save the
1111 * change attribute *before* we do the RPC call. 1420 * change attribute *before* we do the RPC call.
1112 */ 1421 */
1113 ret = nfs4_open_revalidate(dir, dentry, openflags, nd); 1422 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
1423 if (IS_ERR(inode)) {
1424 ret = PTR_ERR(inode);
1425 switch (ret) {
1426 case -EPERM:
1427 case -EACCES:
1428 case -EDQUOT:
1429 case -ENOSPC:
1430 case -EROFS:
1431 goto out_put_ctx;
1432 default:
1433 goto out_drop;
1434 }
1435 }
1436 iput(inode);
1437 if (inode != dentry->d_inode)
1438 goto out_drop;
1439
1440 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1441 ret = nfs_intent_set_file(nd, ctx);
1442 if (ret >= 0)
1443 ret = 1;
1114out: 1444out:
1115 dput(parent); 1445 dput(parent);
1116 if (!ret)
1117 d_drop(dentry);
1118 return ret; 1446 return ret;
1447out_drop:
1448 d_drop(dentry);
1449 ret = 0;
1450out_put_ctx:
1451 put_nfs_open_context(ctx);
1452 goto out;
1453
1119no_open_dput: 1454no_open_dput:
1120 dput(parent); 1455 dput(parent);
1121no_open: 1456no_open:
1122 return nfs_lookup_revalidate(dentry, nd); 1457 return nfs_lookup_revalidate(dentry, nd);
1123} 1458}
1124#endif /* CONFIG_NFS_V4 */
1125 1459
1126static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) 1460static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
1461 struct nameidata *nd)
1127{ 1462{
1128 struct dentry *parent = desc->file->f_path.dentry; 1463 struct nfs_open_context *ctx = NULL;
1129 struct inode *dir = parent->d_inode; 1464 struct iattr attr;
1130 struct nfs_entry *entry = desc->entry; 1465 int error;
1131 struct dentry *dentry, *alias; 1466 int open_flags = 0;
1132 struct qstr name = {
1133 .name = entry->name,
1134 .len = entry->len,
1135 };
1136 struct inode *inode;
1137 unsigned long verf = nfs_save_change_attribute(dir);
1138 1467
1139 switch (name.len) { 1468 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1140 case 2: 1469 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1141 if (name.name[0] == '.' && name.name[1] == '.')
1142 return dget_parent(parent);
1143 break;
1144 case 1:
1145 if (name.name[0] == '.')
1146 return dget(parent);
1147 }
1148 1470
1149 spin_lock(&dir->i_lock); 1471 attr.ia_mode = mode;
1150 if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) { 1472 attr.ia_valid = ATTR_MODE;
1151 spin_unlock(&dir->i_lock);
1152 return NULL;
1153 }
1154 spin_unlock(&dir->i_lock);
1155 1473
1156 name.hash = full_name_hash(name.name, name.len); 1474 if ((nd->flags & LOOKUP_CREATE) != 0) {
1157 dentry = d_lookup(parent, &name); 1475 open_flags = nd->intent.open.flags;
1158 if (dentry != NULL) {
1159 /* Is this a positive dentry that matches the readdir info? */
1160 if (dentry->d_inode != NULL &&
1161 (NFS_FILEID(dentry->d_inode) == entry->ino ||
1162 d_mountpoint(dentry))) {
1163 if (!desc->plus || entry->fh->size == 0)
1164 return dentry;
1165 if (nfs_compare_fh(NFS_FH(dentry->d_inode),
1166 entry->fh) == 0)
1167 goto out_renew;
1168 }
1169 /* No, so d_drop to allow one to be created */
1170 d_drop(dentry);
1171 dput(dentry);
1172 }
1173 if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
1174 return NULL;
1175 if (name.len > NFS_SERVER(dir)->namelen)
1176 return NULL;
1177 /* Note: caller is already holding the dir->i_mutex! */
1178 dentry = d_alloc(parent, &name);
1179 if (dentry == NULL)
1180 return NULL;
1181 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1182 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
1183 if (IS_ERR(inode)) {
1184 dput(dentry);
1185 return NULL;
1186 }
1187 1476
1188 alias = d_materialise_unique(dentry, inode); 1477 ctx = nameidata_to_nfs_open_context(dentry, nd);
1189 if (alias != NULL) { 1478 error = PTR_ERR(ctx);
1190 dput(dentry); 1479 if (IS_ERR(ctx))
1191 if (IS_ERR(alias)) 1480 goto out_err_drop;
1192 return NULL;
1193 dentry = alias;
1194 } 1481 }
1195 1482
1196out_renew: 1483 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
1197 nfs_set_verifier(dentry, verf); 1484 if (error != 0)
1198 return dentry; 1485 goto out_put_ctx;
1486 if (ctx != NULL) {
1487 error = nfs_intent_set_file(nd, ctx);
1488 if (error < 0)
1489 goto out_err;
1490 }
1491 return 0;
1492out_put_ctx:
1493 if (ctx != NULL)
1494 put_nfs_open_context(ctx);
1495out_err_drop:
1496 d_drop(dentry);
1497out_err:
1498 return error;
1199} 1499}
1200 1500
 1501#endif /* CONFIG_NFS_V4 */
1502
1201/* 1503/*
1202 * Code common to create, mkdir, and mknod. 1504 * Code common to create, mkdir, and mknod.
1203 */ 1505 */
@@ -1251,7 +1553,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1251{ 1553{
1252 struct iattr attr; 1554 struct iattr attr;
1253 int error; 1555 int error;
1254 int open_flags = 0;
1255 1556
1256 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1557 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1257 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1558 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1259,10 +1560,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1259 attr.ia_mode = mode; 1560 attr.ia_mode = mode;
1260 attr.ia_valid = ATTR_MODE; 1561 attr.ia_valid = ATTR_MODE;
1261 1562
1262 if ((nd->flags & LOOKUP_CREATE) != 0) 1563 error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
1263 open_flags = nd->intent.open.flags;
1264
1265 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
1266 if (error != 0) 1564 if (error != 0)
1267 goto out_err; 1565 goto out_err;
1268 return 0; 1566 return 0;
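After the split, the generic nfs_create() always passes a NULL open context with zero flags, while the v4-only nfs_open_create() above builds a context whenever the lookup carries a create intent; the protocol ->create method takes the context as its final argument either way. The branch in miniature (a sketch; error unwinds trimmed):

/* Sketch: pick the create flavour from the lookup intent. */
static int create_sketch(struct inode *dir, struct dentry *dentry,
			 struct iattr *attr, struct nameidata *nd)
{
	struct nfs_open_context *ctx = NULL;
	int open_flags = 0;

	if (nd != NULL && (nd->flags & LOOKUP_CREATE) != 0) {
		open_flags = nd->intent.open.flags;
		ctx = nameidata_to_nfs_open_context(dentry, nd);
		if (IS_ERR(ctx))
			return PTR_ERR(ctx);
	}
	/* ctx == NULL: plain create, no open state to transfer */
	return NFS_PROTO(dir)->create(dir, dentry, attr, open_flags, ctx);
}

This is what lets the v2/v3 create path stop caring about nameidata entirely: all intent handling now lives behind the context.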
@@ -1344,76 +1642,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1344 return error; 1642 return error;
1345} 1643}
1346 1644
1347static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
1348{
1349 static unsigned int sillycounter;
1350 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
1351 const int countersize = sizeof(sillycounter)*2;
1352 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
1353 char silly[slen+1];
1354 struct qstr qsilly;
1355 struct dentry *sdentry;
1356 int error = -EIO;
1357
1358 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
1359 dentry->d_parent->d_name.name, dentry->d_name.name,
1360 atomic_read(&dentry->d_count));
1361 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
1362
1363 /*
1364 * We don't allow a dentry to be silly-renamed twice.
1365 */
1366 error = -EBUSY;
1367 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1368 goto out;
1369
1370 sprintf(silly, ".nfs%*.*Lx",
1371 fileidsize, fileidsize,
1372 (unsigned long long)NFS_FILEID(dentry->d_inode));
1373
1374 /* Return delegation in anticipation of the rename */
1375 nfs_inode_return_delegation(dentry->d_inode);
1376
1377 sdentry = NULL;
1378 do {
1379 char *suffix = silly + slen - countersize;
1380
1381 dput(sdentry);
1382 sillycounter++;
1383 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
1384
1385 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
1386 dentry->d_name.name, silly);
1387
1388 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
1389 /*
1390 * N.B. Better to return EBUSY here ... it could be
1391 * dangerous to delete the file while it's in use.
1392 */
1393 if (IS_ERR(sdentry))
1394 goto out;
1395 } while(sdentry->d_inode != NULL); /* need negative lookup */
1396
1397 qsilly.name = silly;
1398 qsilly.len = strlen(silly);
1399 if (dentry->d_inode) {
1400 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1401 dir, &qsilly);
1402 nfs_mark_for_revalidate(dentry->d_inode);
1403 } else
1404 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1405 dir, &qsilly);
1406 if (!error) {
1407 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1408 d_move(dentry, sdentry);
1409 error = nfs_async_unlink(dir, dentry);
1410 /* If we return 0 we don't unlink */
1411 }
1412 dput(sdentry);
1413out:
1414 return error;
1415}
1416
1417/* 1645/*
1418 * Remove a file after making sure there are no pending writes, 1646 * Remove a file after making sure there are no pending writes,
1419 * and after checking that the file has only one user. 1647 * and after checking that the file has only one user.
@@ -1573,7 +1801,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1573 d_drop(dentry); 1801 d_drop(dentry);
1574 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1802 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1575 if (error == 0) { 1803 if (error == 0) {
1576 atomic_inc(&inode->i_count); 1804 ihold(inode);
1577 d_add(dentry, inode); 1805 d_add(dentry, inode);
1578 } 1806 }
1579 return error; 1807 return error;
@@ -1652,16 +1880,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1652 } 1880 }
1653 } 1881 }
1654 1882
1655 /*
1656 * ... prune child dentries and writebacks if needed.
1657 */
1658 if (atomic_read(&old_dentry->d_count) > 1) {
1659 if (S_ISREG(old_inode->i_mode))
1660 nfs_wb_all(old_inode);
1661 shrink_dcache_parent(old_dentry);
1662 }
1663 nfs_inode_return_delegation(old_inode); 1883 nfs_inode_return_delegation(old_inode);
1664
1665 if (new_inode != NULL) 1884 if (new_inode != NULL)
1666 nfs_inode_return_delegation(new_inode); 1885 nfs_inode_return_delegation(new_inode);
1667 1886
@@ -1713,14 +1932,14 @@ static void nfs_access_free_list(struct list_head *head)
1713int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 1932int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
1714{ 1933{
1715 LIST_HEAD(head); 1934 LIST_HEAD(head);
1716 struct nfs_inode *nfsi; 1935 struct nfs_inode *nfsi, *next;
1717 struct nfs_access_entry *cache; 1936 struct nfs_access_entry *cache;
1718 1937
1719 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 1938 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1720 return (nr_to_scan == 0) ? 0 : -1; 1939 return (nr_to_scan == 0) ? 0 : -1;
1721 1940
1722 spin_lock(&nfs_access_lru_lock); 1941 spin_lock(&nfs_access_lru_lock);
1723 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1942 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
1724 struct inode *inode; 1943 struct inode *inode;
1725 1944
1726 if (nr_to_scan-- == 0) 1945 if (nr_to_scan-- == 0)
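The shrinker fix above is the classic iterate-while-unlinking bug: the loop body can remove nfsi from nfs_access_lru_list, so a plain list_for_each_entry() would next dereference a node that is no longer on the list. list_for_each_entry_safe() caches the successor before the body runs. The pattern in isolation:

#include <linux/list.h>

struct item {
	struct list_head lru;
	int busy;
};

/* Prune idle items while walking; 'next' is the lifeline that
 * survives list_del_init() of the current node.
 */
static void prune(struct list_head *head)
{
	struct item *it, *next;

	list_for_each_entry_safe(it, next, head, lru) {
		if (!it->busy)
			list_del_init(&it->lru);
	}
}

The _safe variant only protects against the loop deleting the current entry itself; concurrent removal by other contexts still needs the spinlock that nfs_access_cache_shrinker already holds here.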
@@ -1953,7 +2172,7 @@ int nfs_permission(struct inode *inode, int mask)
1953 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2172 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
1954 goto out; 2173 goto out;
1955 /* Is this sys_access() ? */ 2174 /* Is this sys_access() ? */
1956 if (mask & MAY_ACCESS) 2175 if (mask & (MAY_ACCESS | MAY_CHDIR))
1957 goto force_lookup; 2176 goto force_lookup;
1958 2177
1959 switch (inode->i_mode & S_IFMT) { 2178 switch (inode->i_mode & S_IFMT) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ad4cd31d605..84d3c8b9020 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -69,6 +69,7 @@ struct nfs_direct_req {
69 69
70 /* I/O parameters */ 70 /* I/O parameters */
71 struct nfs_open_context *ctx; /* file open context info */ 71 struct nfs_open_context *ctx; /* file open context info */
72 struct nfs_lock_context *l_ctx; /* Lock context info */
72 struct kiocb * iocb; /* controlling i/o request */ 73 struct kiocb * iocb; /* controlling i/o request */
73 struct inode * inode; /* target file of i/o */ 74 struct inode * inode; /* target file of i/o */
74 75
@@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
160 INIT_LIST_HEAD(&dreq->rewrite_list); 161 INIT_LIST_HEAD(&dreq->rewrite_list);
161 dreq->iocb = NULL; 162 dreq->iocb = NULL;
162 dreq->ctx = NULL; 163 dreq->ctx = NULL;
164 dreq->l_ctx = NULL;
163 spin_lock_init(&dreq->lock); 165 spin_lock_init(&dreq->lock);
164 atomic_set(&dreq->io_count, 0); 166 atomic_set(&dreq->io_count, 0);
165 dreq->count = 0; 167 dreq->count = 0;
@@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref)
173{ 175{
174 struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); 176 struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
175 177
178 if (dreq->l_ctx != NULL)
179 nfs_put_lock_context(dreq->l_ctx);
176 if (dreq->ctx != NULL) 180 if (dreq->ctx != NULL)
177 put_nfs_open_context(dreq->ctx); 181 put_nfs_open_context(dreq->ctx);
178 kmem_cache_free(nfs_direct_cachep, dreq); 182 kmem_cache_free(nfs_direct_cachep, dreq);
@@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
336 data->cred = msg.rpc_cred; 340 data->cred = msg.rpc_cred;
337 data->args.fh = NFS_FH(inode); 341 data->args.fh = NFS_FH(inode);
338 data->args.context = ctx; 342 data->args.context = ctx;
343 data->args.lock_context = dreq->l_ctx;
339 data->args.offset = pos; 344 data->args.offset = pos;
340 data->args.pgbase = pgbase; 345 data->args.pgbase = pgbase;
341 data->args.pages = data->pagevec; 346 data->args.pages = data->pagevec;
@@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
416static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 421static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
417 unsigned long nr_segs, loff_t pos) 422 unsigned long nr_segs, loff_t pos)
418{ 423{
419 ssize_t result = 0; 424 ssize_t result = -ENOMEM;
420 struct inode *inode = iocb->ki_filp->f_mapping->host; 425 struct inode *inode = iocb->ki_filp->f_mapping->host;
421 struct nfs_direct_req *dreq; 426 struct nfs_direct_req *dreq;
422 427
423 dreq = nfs_direct_req_alloc(); 428 dreq = nfs_direct_req_alloc();
424 if (!dreq) 429 if (dreq == NULL)
425 return -ENOMEM; 430 goto out;
426 431
427 dreq->inode = inode; 432 dreq->inode = inode;
428 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 433 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
434 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
435 if (dreq->l_ctx == NULL)
436 goto out_release;
429 if (!is_sync_kiocb(iocb)) 437 if (!is_sync_kiocb(iocb))
430 dreq->iocb = iocb; 438 dreq->iocb = iocb;
431 439
432 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); 440 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
433 if (!result) 441 if (!result)
434 result = nfs_direct_wait(dreq); 442 result = nfs_direct_wait(dreq);
443out_release:
435 nfs_direct_req_release(dreq); 444 nfs_direct_req_release(dreq);
436 445out:
437 return result; 446 return result;
438} 447}
439 448
@@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
574 data->args.offset = 0; 583 data->args.offset = 0;
575 data->args.count = 0; 584 data->args.count = 0;
576 data->args.context = dreq->ctx; 585 data->args.context = dreq->ctx;
586 data->args.lock_context = dreq->l_ctx;
577 data->res.count = 0; 587 data->res.count = 0;
578 data->res.fattr = &data->fattr; 588 data->res.fattr = &data->fattr;
579 data->res.verf = &data->verf; 589 data->res.verf = &data->verf;
@@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
761 data->cred = msg.rpc_cred; 771 data->cred = msg.rpc_cred;
762 data->args.fh = NFS_FH(inode); 772 data->args.fh = NFS_FH(inode);
763 data->args.context = ctx; 773 data->args.context = ctx;
774 data->args.lock_context = dreq->l_ctx;
764 data->args.offset = pos; 775 data->args.offset = pos;
765 data->args.pgbase = pgbase; 776 data->args.pgbase = pgbase;
766 data->args.pages = data->pagevec; 777 data->args.pages = data->pagevec;
@@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
845 unsigned long nr_segs, loff_t pos, 856 unsigned long nr_segs, loff_t pos,
846 size_t count) 857 size_t count)
847{ 858{
848 ssize_t result = 0; 859 ssize_t result = -ENOMEM;
849 struct inode *inode = iocb->ki_filp->f_mapping->host; 860 struct inode *inode = iocb->ki_filp->f_mapping->host;
850 struct nfs_direct_req *dreq; 861 struct nfs_direct_req *dreq;
851 size_t wsize = NFS_SERVER(inode)->wsize; 862 size_t wsize = NFS_SERVER(inode)->wsize;
@@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
853 864
854 dreq = nfs_direct_req_alloc(); 865 dreq = nfs_direct_req_alloc();
855 if (!dreq) 866 if (!dreq)
856 return -ENOMEM; 867 goto out;
857 nfs_alloc_commit_data(dreq); 868 nfs_alloc_commit_data(dreq);
858 869
859 if (dreq->commit_data == NULL || count < wsize) 870 if (dreq->commit_data == NULL || count < wsize)
@@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
861 872
862 dreq->inode = inode; 873 dreq->inode = inode;
863 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 874 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
875 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
876 if (dreq->l_ctx == NULL)
877 goto out_release;
864 if (!is_sync_kiocb(iocb)) 878 if (!is_sync_kiocb(iocb))
865 dreq->iocb = iocb; 879 dreq->iocb = iocb;
866 880
867 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); 881 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
868 if (!result) 882 if (!result)
869 result = nfs_direct_wait(dreq); 883 result = nfs_direct_wait(dreq);
884out_release:
870 nfs_direct_req_release(dreq); 885 nfs_direct_req_release(dreq);
871 886out:
872 return result; 887 return result;
873} 888}
874 889
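The direct-I/O changes all serve one addition: a struct nfs_lock_context pinned for the lifetime of the request and threaded into the read, write, and commit arguments, so the server sees a consistent lock owner across the whole O_DIRECT operation. The lifetime pairing, extracted (a sketch; the helpers are the ones named in the hunks, the function names are not):

/* Acquire: open context first, then its lock context. */
static int dreq_attach_contexts(struct nfs_direct_req *dreq,
				struct file *filp)
{
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(filp));
	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
	if (dreq->l_ctx == NULL)
		return -ENOMEM;	/* caller releases dreq, dropping ctx */
	return 0;
}

/* Release in reverse order, as nfs_direct_req_free() does. */
static void dreq_detach_contexts(struct nfs_direct_req *dreq)
{
	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);
	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
}

Note also why result now starts at -ENOMEM in both entry points: the new failure sits after the dreq allocation, so it must travel through out_release rather than taking an early return that would leak the request.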
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 76fd235d002..a6e711ad130 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -6,6 +6,29 @@
6 * Resolves DNS hostnames into valid ip addresses 6 * Resolves DNS hostnames into valid ip addresses
7 */ 7 */
8 8
9#ifdef CONFIG_NFS_USE_KERNEL_DNS
10
11#include <linux/sunrpc/clnt.h>
12#include <linux/dns_resolver.h>
13
14ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
15 struct sockaddr *sa, size_t salen)
16{
17 ssize_t ret;
18 char *ip_addr = NULL;
19 int ip_len;
20
21 ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
22 if (ip_len > 0)
23 ret = rpc_pton(ip_addr, ip_len, sa, salen);
24 else
25 ret = -ESRCH;
26 kfree(ip_addr);
27 return ret;
28}
29
30#else
31
9#include <linux/hash.h> 32#include <linux/hash.h>
10#include <linux/string.h> 33#include <linux/string.h>
11#include <linux/kmod.h> 34#include <linux/kmod.h>
@@ -144,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
144 return 0; 167 return 0;
145 } 168 }
146 item = container_of(h, struct nfs_dns_ent, h); 169 item = container_of(h, struct nfs_dns_ent, h);
147 ttl = (long)item->h.expiry_time - (long)get_seconds(); 170 ttl = item->h.expiry_time - seconds_since_boot();
148 if (ttl < 0) 171 if (ttl < 0)
149 ttl = 0; 172 ttl = 0;
150 173
@@ -216,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
216 ttl = get_expiry(&buf); 239 ttl = get_expiry(&buf);
217 if (ttl == 0) 240 if (ttl == 0)
218 goto out; 241 goto out;
219 key.h.expiry_time = ttl + get_seconds(); 242 key.h.expiry_time = ttl + seconds_since_boot();
220 243
221 ret = -ENOMEM; 244 ret = -ENOMEM;
222 item = nfs_dns_lookup(cd, &key); 245 item = nfs_dns_lookup(cd, &key);
@@ -278,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd,
278 goto out_err; 301 goto out_err;
279 ret = -ETIMEDOUT; 302 ret = -ETIMEDOUT;
280 if (!test_bit(CACHE_VALID, &(*item)->h.flags) 303 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
281 || (*item)->h.expiry_time < get_seconds() 304 || (*item)->h.expiry_time < seconds_since_boot()
282 || cd->flush_time > (*item)->h.last_refresh) 305 || cd->flush_time > (*item)->h.last_refresh)
283 goto out_put; 306 goto out_put;
284 ret = -ENOENT; 307 ret = -ENOENT;
@@ -346,3 +369,4 @@ void nfs_dns_resolver_destroy(void)
346 nfs_cache_unregister(&nfs_dns_resolve); 369 nfs_cache_unregister(&nfs_dns_resolve);
347} 370}
348 371
372#endif
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index a3f0938babf..199bb5543a9 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -6,8 +6,20 @@
6 6
7#define NFS_DNS_HOSTNAME_MAXLEN (128) 7#define NFS_DNS_HOSTNAME_MAXLEN (128)
8 8
9
10#ifdef CONFIG_NFS_USE_KERNEL_DNS
11static inline int nfs_dns_resolver_init(void)
12{
13 return 0;
14}
15
16static inline void nfs_dns_resolver_destroy(void)
17{}
18#else
9extern int nfs_dns_resolver_init(void); 19extern int nfs_dns_resolver_init(void);
10extern void nfs_dns_resolver_destroy(void); 20extern void nfs_dns_resolver_destroy(void);
21#endif
22
11extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 23extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
12 struct sockaddr *sa, size_t salen); 24 struct sockaddr *sa, size_t salen);
13 25
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f036153d9f5..60677f9f131 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#define NFSDBG_FACILITY NFSDBG_FILE 41#define NFSDBG_FACILITY NFSDBG_FILE
41 42
@@ -203,37 +204,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
203} 204}
204 205
205/* 206/*
206 * Helper for nfs_file_flush() and nfs_file_fsync()
207 *
208 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
209 * disk, but it retrieves and clears ctx->error after synching, despite
210 * the two being set at the same time in nfs_context_set_write_error().
211 * This is because the former is used to notify the _next_ call to
212 * nfs_file_write() that a write error occurred, and hence cause it to
213 * fall back to doing a synchronous write.
214 */
215static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
216{
217 int have_error, status;
218 int ret = 0;
219
220 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
221 status = nfs_wb_all(inode);
222 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
223 if (have_error)
224 ret = xchg(&ctx->error, 0);
225 if (!ret)
226 ret = status;
227 return ret;
228}
229
230/*
231 * Flush all dirty pages, and check for write errors. 207 * Flush all dirty pages, and check for write errors.
232 */ 208 */
233static int 209static int
234nfs_file_flush(struct file *file, fl_owner_t id) 210nfs_file_flush(struct file *file, fl_owner_t id)
235{ 211{
236 struct nfs_open_context *ctx = nfs_file_open_context(file);
237 struct dentry *dentry = file->f_path.dentry; 212 struct dentry *dentry = file->f_path.dentry;
238 struct inode *inode = dentry->d_inode; 213 struct inode *inode = dentry->d_inode;
239 214
@@ -246,7 +221,7 @@ nfs_file_flush(struct file *file, fl_owner_t id)
246 return 0; 221 return 0;
247 222
248 /* Flush writes to the server and return any errors */ 223 /* Flush writes to the server and return any errors */
249 return nfs_do_fsync(ctx, inode); 224 return vfs_fsync(file, 0);
250} 225}
251 226
252static ssize_t 227static ssize_t
@@ -321,6 +296,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
321 * Flush any dirty pages for this process, and check for write errors. 296 * Flush any dirty pages for this process, and check for write errors.
322 * The return status from this call provides a reliable indication of 297 * The return status from this call provides a reliable indication of
323 * whether any write errors occurred for this process. 298 * whether any write errors occurred for this process.
299 *
300 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
301 * disk, but it retrieves and clears ctx->error after synching, despite
302 * the two being set at the same time in nfs_context_set_write_error().
303 * This is because the former is used to notify the _next_ call to
304 * nfs_file_write() that a write error occurred, and hence cause it to
305 * fall back to doing a synchronous write.
324 */ 306 */
325static int 307static int
326nfs_file_fsync(struct file *file, int datasync) 308nfs_file_fsync(struct file *file, int datasync)
@@ -328,13 +310,23 @@ nfs_file_fsync(struct file *file, int datasync)
328 struct dentry *dentry = file->f_path.dentry; 310 struct dentry *dentry = file->f_path.dentry;
329 struct nfs_open_context *ctx = nfs_file_open_context(file); 311 struct nfs_open_context *ctx = nfs_file_open_context(file);
330 struct inode *inode = dentry->d_inode; 312 struct inode *inode = dentry->d_inode;
313 int have_error, status;
314 int ret = 0;
315
331 316
332 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 317 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
333 dentry->d_parent->d_name.name, dentry->d_name.name, 318 dentry->d_parent->d_name.name, dentry->d_name.name,
334 datasync); 319 datasync);
335 320
336 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 321 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
337 return nfs_do_fsync(ctx, inode); 322 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
323 status = nfs_commit_inode(inode, FLUSH_SYNC);
324 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
325 if (have_error)
326 ret = xchg(&ctx->error, 0);
327 if (!ret && status < 0)
328 ret = status;
329 return ret;
338} 330}
339 331
340/* 332/*
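The relocated comment and the inlined body above implement a small but easily-misread pattern: clear the sticky error flag before flushing so that a new failure during the flush re-arms it, re-test afterwards, and only then consume the latched errno with xchg(). Isolated as a sketch with hypothetical names (struct my_ctx, MY_ERROR_FLAG, do_flush):

	static int flush_and_report(struct my_ctx *ctx)
	{
		int have_error, status, ret = 0;

		/* Clear first: an error raised during the flush sets it again. */
		have_error = test_and_clear_bit(MY_ERROR_FLAG, &ctx->flags);
		status = do_flush(ctx);
		have_error |= test_bit(MY_ERROR_FLAG, &ctx->flags);
		if (have_error)
			ret = xchg(&ctx->error, 0);	/* consume latched errno */
		if (!ret && status < 0)
			ret = status;
		return ret;
	}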
@@ -395,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
395 file->f_path.dentry->d_name.name, 387 file->f_path.dentry->d_name.name,
396 mapping->host->i_ino, len, (long long) pos); 388 mapping->host->i_ino, len, (long long) pos);
397 389
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
398start: 394start:
399 /* 395 /*
400 * Prevent starvation issues if someone is doing a consistency 396 * Prevent starvation issues if someone is doing a consistency
@@ -560,7 +556,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
560 struct file *filp = vma->vm_file; 556 struct file *filp = vma->vm_file;
561 struct dentry *dentry = filp->f_path.dentry; 557 struct dentry *dentry = filp->f_path.dentry;
562 unsigned pagelen; 558 unsigned pagelen;
563 int ret = -EINVAL; 559 int ret = VM_FAULT_NOPAGE;
564 struct address_space *mapping; 560 struct address_space *mapping;
565 561
566 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", 562 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
@@ -576,21 +572,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
576 if (mapping != dentry->d_inode->i_mapping) 572 if (mapping != dentry->d_inode->i_mapping)
577 goto out_unlock; 573 goto out_unlock;
578 574
579 ret = 0;
580 pagelen = nfs_page_length(page); 575 pagelen = nfs_page_length(page);
581 if (pagelen == 0) 576 if (pagelen == 0)
582 goto out_unlock; 577 goto out_unlock;
583 578
584 ret = nfs_flush_incompatible(filp, page); 579 ret = VM_FAULT_LOCKED;
585 if (ret != 0) 580 if (nfs_flush_incompatible(filp, page) == 0 &&
586 goto out_unlock; 581 nfs_updatepage(filp, page, 0, pagelen) == 0)
582 goto out;
587 583
588 ret = nfs_updatepage(filp, page, 0, pagelen); 584 ret = VM_FAULT_SIGBUS;
589out_unlock: 585out_unlock:
590 if (!ret)
591 return VM_FAULT_LOCKED;
592 unlock_page(page); 586 unlock_page(page);
593 return VM_FAULT_SIGBUS; 587out:
588 return ret;
594} 589}
595 590
596static const struct vm_operations_struct nfs_file_vm_ops = { 591static const struct vm_operations_struct nfs_file_vm_ops = {
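Beyond the restructuring, the point of this hunk is the return convention: a ->page_mkwrite() handler must return VM_FAULT_* codes, not errnos, which the old -EINVAL/0 mixture violated. The contract, sketched with a hypothetical prepare helper (my_prepare_write):

	static int example_page_mkwrite(struct vm_area_struct *vma,
					struct vm_fault *vmf)
	{
		struct page *page = vmf->page;

		lock_page(page);
		if (page->mapping != vma->vm_file->f_mapping) {
			/* raced with truncate: ask the VM to refault */
			unlock_page(page);
			return VM_FAULT_NOPAGE;
		}
		if (my_prepare_write(vma->vm_file, page) == 0)
			return VM_FAULT_LOCKED;	/* success, page left locked */
		unlock_page(page);
		return VM_FAULT_SIGBUS;		/* cannot make page writable */
	}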
@@ -648,7 +643,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
648 643
649 /* Return error values for O_DSYNC and IS_SYNC() */ 644 /* Return error values for O_DSYNC and IS_SYNC() */
650 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 645 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
651 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); 646 int err = vfs_fsync(iocb->ki_filp, 0);
652 if (err < 0) 647 if (err < 0)
653 result = err; 648 result = err;
654 } 649 }
@@ -684,7 +679,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
684 written = ret; 679 written = ret;
685 680
686 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 681 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
687 int err = nfs_do_fsync(nfs_file_open_context(filp), inode); 682 int err = vfs_fsync(filp, 0);
688 if (err < 0) 683 if (err < 0)
689 ret = err; 684 ret = err;
690 } 685 }
@@ -693,7 +688,8 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
693 return ret; 688 return ret;
694} 689}
695 690
696static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) 691static int
692do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
697{ 693{
698 struct inode *inode = filp->f_mapping->host; 694 struct inode *inode = filp->f_mapping->host;
699 int status = 0; 695 int status = 0;
@@ -708,7 +704,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
708 if (nfs_have_delegation(inode, FMODE_READ)) 704 if (nfs_have_delegation(inode, FMODE_READ))
709 goto out_noconflict; 705 goto out_noconflict;
710 706
711 if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) 707 if (is_local)
712 goto out_noconflict; 708 goto out_noconflict;
713 709
714 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 710 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
@@ -732,14 +728,11 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
732 default: 728 default:
733 BUG(); 729 BUG();
734 } 730 }
735 if (res < 0)
736 dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager"
737 " - error %d!\n",
738 __func__, res);
739 return res; 731 return res;
740} 732}
741 733
742static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) 734static int
735do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
743{ 736{
744 struct inode *inode = filp->f_mapping->host; 737 struct inode *inode = filp->f_mapping->host;
745 int status; 738 int status;
@@ -754,15 +747,24 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
754 * If we're signalled while cleaning up locks on process exit, we 747 * If we're signalled while cleaning up locks on process exit, we
755 * still need to complete the unlock. 748 * still need to complete the unlock.
756 */ 749 */
757 /* Use local locking if mounted with "-onolock" */ 750 /*
758 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 751 * Use local locking if mounted with "-onolock" or with appropriate
752 * "-olocal_lock="
753 */
754 if (!is_local)
759 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 755 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
760 else 756 else
761 status = do_vfs_lock(filp, fl); 757 status = do_vfs_lock(filp, fl);
762 return status; 758 return status;
763} 759}
764 760
765static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) 761static int
762is_time_granular(struct timespec *ts) {
763 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
764}
765
766static int
767do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
766{ 768{
767 struct inode *inode = filp->f_mapping->host; 769 struct inode *inode = filp->f_mapping->host;
768 int status; 770 int status;
@@ -775,20 +777,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
775 if (status != 0) 777 if (status != 0)
776 goto out; 778 goto out;
777 779
778 /* Use local locking if mounted with "-onolock" */ 780 /*
779 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 781 * Use local locking if mounted with "-onolock" or with appropriate
782 * "-olocal_lock="
783 */
784 if (!is_local)
780 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 785 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
781 else 786 else
782 status = do_vfs_lock(filp, fl); 787 status = do_vfs_lock(filp, fl);
783 if (status < 0) 788 if (status < 0)
784 goto out; 789 goto out;
790
785 /* 791 /*
786 * Make sure we clear the cache whenever we try to get the lock. 792 * Revalidate the cache if the server has time stamps granular
793 * enough to detect subsecond changes. Otherwise, clear the
794 * cache to prevent missing any changes.
795 *
787 * This makes locking act as a cache coherency point. 796 * This makes locking act as a cache coherency point.
788 */ 797 */
789 nfs_sync_mapping(filp->f_mapping); 798 nfs_sync_mapping(filp->f_mapping);
790 if (!nfs_have_delegation(inode, FMODE_READ)) 799 if (!nfs_have_delegation(inode, FMODE_READ)) {
791 nfs_zap_caches(inode); 800 if (is_time_granular(&NFS_SERVER(inode)->time_delta))
801 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
802 else
803 nfs_zap_caches(inode);
804 }
792out: 805out:
793 return status; 806 return status;
794} 807}
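The revalidate-versus-zap choice turns entirely on the server's advertised timestamp resolution: is_time_granular() accepts only deltas of one microsecond or finer, since a coarser clock cannot expose sub-second changes and the whole cache must be dropped. Worked examples of the predicate:

	struct timespec one_us  = { .tv_sec = 0, .tv_nsec = 1000 };	/* granular: revalidate */
	struct timespec one_ms  = { .tv_sec = 0, .tv_nsec = 1000000 };	/* coarse: zap caches  */
	struct timespec one_sec = { .tv_sec = 1, .tv_nsec = 0 };	/* coarse: zap caches  */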
@@ -800,6 +813,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
800{ 813{
801 struct inode *inode = filp->f_mapping->host; 814 struct inode *inode = filp->f_mapping->host;
802 int ret = -ENOLCK; 815 int ret = -ENOLCK;
816 int is_local = 0;
803 817
804 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", 818 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
805 filp->f_path.dentry->d_parent->d_name.name, 819 filp->f_path.dentry->d_parent->d_name.name,
@@ -813,6 +827,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
813 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) 827 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
814 goto out_err; 828 goto out_err;
815 829
830 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
831 is_local = 1;
832
816 if (NFS_PROTO(inode)->lock_check_bounds != NULL) { 833 if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
817 ret = NFS_PROTO(inode)->lock_check_bounds(fl); 834 ret = NFS_PROTO(inode)->lock_check_bounds(fl);
818 if (ret < 0) 835 if (ret < 0)
@@ -820,11 +837,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
820 } 837 }
821 838
822 if (IS_GETLK(cmd)) 839 if (IS_GETLK(cmd))
823 ret = do_getlk(filp, cmd, fl); 840 ret = do_getlk(filp, cmd, fl, is_local);
824 else if (fl->fl_type == F_UNLCK) 841 else if (fl->fl_type == F_UNLCK)
825 ret = do_unlk(filp, cmd, fl); 842 ret = do_unlk(filp, cmd, fl, is_local);
826 else 843 else
827 ret = do_setlk(filp, cmd, fl); 844 ret = do_setlk(filp, cmd, fl, is_local);
828out_err: 845out_err:
829 return ret; 846 return ret;
830} 847}
@@ -834,6 +851,9 @@ out_err:
834 */ 851 */
835static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) 852static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
836{ 853{
854 struct inode *inode = filp->f_mapping->host;
855 int is_local = 0;
856
837 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", 857 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
838 filp->f_path.dentry->d_parent->d_name.name, 858 filp->f_path.dentry->d_parent->d_name.name,
839 filp->f_path.dentry->d_name.name, 859 filp->f_path.dentry->d_name.name,
@@ -842,14 +862,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
842 if (!(fl->fl_flags & FL_FLOCK)) 862 if (!(fl->fl_flags & FL_FLOCK))
843 return -ENOLCK; 863 return -ENOLCK;
844 864
865 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
866 is_local = 1;
867
845 /* We're simulating flock() locks using posix locks on the server */ 868 /* We're simulating flock() locks using posix locks on the server */
846 fl->fl_owner = (fl_owner_t)filp; 869 fl->fl_owner = (fl_owner_t)filp;
847 fl->fl_start = 0; 870 fl->fl_start = 0;
848 fl->fl_end = OFFSET_MAX; 871 fl->fl_end = OFFSET_MAX;
849 872
850 if (fl->fl_type == F_UNLCK) 873 if (fl->fl_type == F_UNLCK)
851 return do_unlk(filp, cmd, fl); 874 return do_unlk(filp, cmd, fl, is_local);
852 return do_setlk(filp, cmd, fl); 875 return do_setlk(filp, cmd, fl, is_local);
853} 876}
854 877
855/* 878/*
@@ -861,6 +884,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
861 dprintk("NFS: setlease(%s/%s, arg=%ld)\n", 884 dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
862 file->f_path.dentry->d_parent->d_name.name, 885 file->f_path.dentry->d_parent->d_name.name,
863 file->f_path.dentry->d_name.name, arg); 886 file->f_path.dentry->d_name.name, arg);
864
865 return -EINVAL; 887 return -EINVAL;
866} 888}
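Taken together, the locking hunks split the old all-or-nothing NFS_MOUNT_NONLM behaviour into two independent flags: nfs_lock() (POSIX/fcntl locks) consults NFS_MOUNT_LOCAL_FCNTL while nfs_flock() (BSD locks) consults NFS_MOUNT_LOCAL_FLOCK. A sketch collapsing the two call sites into one hypothetical predicate:

	static int nfs_lock_is_local(struct inode *inode, bool is_flock)
	{
		unsigned int flags = NFS_SERVER(inode)->flags;

		if (is_flock)
			return !!(flags & NFS_MOUNT_LOCAL_FLOCK);
		return !!(flags & NFS_MOUNT_LOCAL_FCNTL);
	}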
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e160..ac7b814ce16 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
54 iput(inode); 54 iput(inode);
55 return -ENOMEM; 55 return -ENOMEM;
56 } 56 }
57 /* Circumvent igrab(): we know the inode is not being freed */ 57 ihold(inode);
58 atomic_inc(&inode->i_count);
59 /* 58 /*
60 * Ensure that this dentry is invisible to d_find_alias(). 59 * Ensure that this dentry is invisible to d_find_alias().
61 * Otherwise, it may be spliced into the tree by 60 * Otherwise, it may be spliced into the tree by
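ihold() is the VFS helper for exactly this case: the caller already knows the inode is live, so igrab()'s freeing checks are unnecessary, and the reference-count representation stays private to the VFS:

	/* before: open-coded, assumes i_count stays an atomic_t */
	atomic_inc(&inode->i_count);
	/* after: same semantics behind a stable interface */
	ihold(inode);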
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 21a84d45916..4e2d9b6b138 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,6 +34,212 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36 36
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38
39#include <linux/slab.h>
40#include <linux/cred.h>
41#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h>
43#include <linux/key-type.h>
44#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h>
47
48#include <keys/user-type.h>
49
50#define NFS_UINT_MAXLEN 11
51
52const struct cred *id_resolver_cache;
53
54struct key_type key_type_id_resolver = {
55 .name = "id_resolver",
56 .instantiate = user_instantiate,
57 .match = user_match,
58 .revoke = user_revoke,
59 .destroy = user_destroy,
60 .describe = user_describe,
61 .read = user_read,
62};
63
64int nfs_idmap_init(void)
65{
66 struct cred *cred;
67 struct key *keyring;
68 int ret = 0;
69
70 printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
71
72 cred = prepare_kernel_cred(NULL);
73 if (!cred)
74 return -ENOMEM;
75
76 keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
77 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
78 KEY_USR_VIEW | KEY_USR_READ,
79 KEY_ALLOC_NOT_IN_QUOTA);
80 if (IS_ERR(keyring)) {
81 ret = PTR_ERR(keyring);
82 goto failed_put_cred;
83 }
84
85 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
86 if (ret < 0)
87 goto failed_put_key;
88
89 ret = register_key_type(&key_type_id_resolver);
90 if (ret < 0)
91 goto failed_put_key;
92
93 cred->thread_keyring = keyring;
94 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
95 id_resolver_cache = cred;
96 return 0;
97
98failed_put_key:
99 key_put(keyring);
100failed_put_cred:
101 put_cred(cred);
102 return ret;
103}
104
105void nfs_idmap_quit(void)
106{
107 key_revoke(id_resolver_cache->thread_keyring);
108 unregister_key_type(&key_type_id_resolver);
109 put_cred(id_resolver_cache);
110}
111
112/*
113 * Assemble the description to pass to request_key()
114 * This function will allocate a new string and update dest to point
115 * at it. The caller is responsible for freeing dest.
116 *
117 * On error 0 is returned. Otherwise, the length of dest is returned.
118 */
119static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
120 const char *type, size_t typelen, char **desc)
121{
122 char *cp;
123 size_t desclen = typelen + namelen + 2;
124
125 *desc = kmalloc(desclen, GFP_KERNEL);
126 if (!*desc)
127 return -ENOMEM;
128
129 cp = *desc;
130 memcpy(cp, type, typelen);
131 cp += typelen;
132 *cp++ = ':';
133
134 memcpy(cp, name, namelen);
135 cp += namelen;
136 *cp = '\0';
137 return desclen;
138}
139
140static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
141 const char *type, void *data, size_t data_size)
142{
143 const struct cred *saved_cred;
144 struct key *rkey;
145 char *desc;
146 struct user_key_payload *payload;
147 ssize_t ret;
148
149 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
150 if (ret <= 0)
151 goto out;
152
153 saved_cred = override_creds(id_resolver_cache);
154 rkey = request_key(&key_type_id_resolver, desc, "");
155 revert_creds(saved_cred);
156 kfree(desc);
157 if (IS_ERR(rkey)) {
158 ret = PTR_ERR(rkey);
159 goto out;
160 }
161
162 rcu_read_lock();
163 rkey->perm |= KEY_USR_VIEW;
164
165 ret = key_validate(rkey);
166 if (ret < 0)
167 goto out_up;
168
169 payload = rcu_dereference(rkey->payload.data);
170 if (IS_ERR_OR_NULL(payload)) {
171 ret = PTR_ERR(payload);
172 goto out_up;
173 }
174
175 ret = payload->datalen;
176 if (ret > 0 && ret <= data_size)
177 memcpy(data, payload->data, ret);
178 else
179 ret = -EINVAL;
180
181out_up:
182 rcu_read_unlock();
183 key_put(rkey);
184out:
185 return ret;
186}
187
188
189/* ID -> Name */
190static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
191{
192 char id_str[NFS_UINT_MAXLEN];
193 int id_len;
194 ssize_t ret;
195
196 id_len = snprintf(id_str, sizeof(id_str), "%u", id);
197 ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen);
198 if (ret < 0)
199 return -EINVAL;
200 return ret;
201}
202
203/* Name -> ID */
204static int nfs_idmap_lookup_id(const char *name, size_t namelen,
205 const char *type, __u32 *id)
206{
207 char id_str[NFS_UINT_MAXLEN];
208 long id_long;
209 ssize_t data_size;
210 int ret = 0;
211
212 data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
213 if (data_size <= 0) {
214 ret = -EINVAL;
215 } else {
216 ret = strict_strtol(id_str, 10, &id_long);
217 *id = (__u32)id_long;
218 }
219 return ret;
220}
221
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
223{
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225}
226
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
228{
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230}
231
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
233{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen);
235}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
237{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen);
239}
240
241#else /* CONFIG_NFS_USE_IDMAPPER not defined */
242
37#include <linux/module.h> 243#include <linux/module.h>
38#include <linux/mutex.h> 244#include <linux/mutex.h>
39#include <linux/init.h> 245#include <linux/init.h>
@@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
503 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
504} 710}
505 711
506int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) 712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
507{ 713{
508 struct idmap *idmap = clp->cl_idmap; 714 struct idmap *idmap = clp->cl_idmap;
509 715
510 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
511} 717}
512int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) 718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
513{ 719{
514 struct idmap *idmap = clp->cl_idmap; 720 struct idmap *idmap = clp->cl_idmap;
515 721
516 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
517} 723}
518 724
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
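In the keyring-based idmapper above, nfs_idmap_get_desc() assembles the request_key() description as "<type>:<name>", so reading the four exported mappers against it, the resolver sees descriptions like these (illustrative values; the userspace side is typically wired up through the request-key mechanism, which is an assumption here, not shown in this diff):

	/*
	 * name -> id:  nfs_map_name_to_uid("bob", ...)    requests "uid:bob"
	 *              nfs_map_group_to_gid("staff", ...) requests "gid:staff"
	 * id -> name:  nfs_map_uid_to_name(1000, ...)     requests "user:1000"
	 *              nfs_map_gid_to_group(1000, ...)    requests "group:1000"
	 * all against key type "id_resolver"; a userspace helper keyed on
	 * that type instantiates the key with the answer.
	 */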
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 099b3518fee..314f5716460 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
48#include "internal.h" 48#include "internal.h"
49#include "fscache.h" 49#include "fscache.h"
50#include "dns_resolve.h" 50#include "dns_resolve.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_VFS 53#define NFSDBG_FACILITY NFSDBG_VFS
53 54
@@ -98,7 +99,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
98 return ino; 99 return ino;
99} 100}
100 101
101void nfs_clear_inode(struct inode *inode) 102static void nfs_clear_inode(struct inode *inode)
102{ 103{
103 /* 104 /*
104 * The following should never happen... 105 * The following should never happen...
@@ -110,6 +111,13 @@ void nfs_clear_inode(struct inode *inode)
110 nfs_fscache_release_inode_cookie(inode); 111 nfs_fscache_release_inode_cookie(inode);
111} 112}
112 113
114void nfs_evict_inode(struct inode *inode)
115{
116 truncate_inode_pages(&inode->i_data, 0);
117 end_writeback(inode);
118 nfs_clear_inode(inode);
119}
120
113/** 121/**
114 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk 122 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
115 */ 123 */
@@ -227,9 +235,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
227 return 0; 235 return 0;
228} 236}
229 237
230/* Don't use READDIRPLUS on directories that we believe are too large */
231#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
232
233/* 238/*
234 * This is our front-end to iget that looks up inodes by file handle 239 * This is our front-end to iget that looks up inodes by file handle
235 * instead of inode number. 240 * instead of inode number.
@@ -284,8 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
284 } else if (S_ISDIR(inode->i_mode)) { 289 } else if (S_ISDIR(inode->i_mode)) {
285 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; 290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
286 inode->i_fop = &nfs_dir_operations; 291 inode->i_fop = &nfs_dir_operations;
287 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) 292 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
288 && fattr->size <= NFS_LIMIT_READDIRPLUS)
289 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 293 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
290 /* Deal with crossing mountpoints */ 294 /* Deal with crossing mountpoints */
291 if ((fattr->valid & NFS_ATTR_FATTR_FSID) 295 if ((fattr->valid & NFS_ATTR_FATTR_FSID)
@@ -413,10 +417,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
413 return 0; 417 return 0;
414 418
415 /* Write all dirty data */ 419 /* Write all dirty data */
416 if (S_ISREG(inode->i_mode)) { 420 if (S_ISREG(inode->i_mode))
417 filemap_write_and_wait(inode->i_mapping);
418 nfs_wb_all(inode); 421 nfs_wb_all(inode);
419 }
420 422
421 fattr = nfs_alloc_fattr(); 423 fattr = nfs_alloc_fattr();
422 if (fattr == NULL) 424 if (fattr == NULL)
@@ -530,6 +532,68 @@ out:
530 return err; 532 return err;
531} 533}
532 534
535static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
536{
537 atomic_set(&l_ctx->count, 1);
538 l_ctx->lockowner = current->files;
539 l_ctx->pid = current->tgid;
540 INIT_LIST_HEAD(&l_ctx->list);
541}
542
543static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
544{
545 struct nfs_lock_context *pos;
546
547 list_for_each_entry(pos, &ctx->lock_context.list, list) {
548 if (pos->lockowner != current->files)
549 continue;
550 if (pos->pid != current->tgid)
551 continue;
552 atomic_inc(&pos->count);
553 return pos;
554 }
555 return NULL;
556}
557
558struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
559{
560 struct nfs_lock_context *res, *new = NULL;
561 struct inode *inode = ctx->path.dentry->d_inode;
562
563 spin_lock(&inode->i_lock);
564 res = __nfs_find_lock_context(ctx);
565 if (res == NULL) {
566 spin_unlock(&inode->i_lock);
567 new = kmalloc(sizeof(*new), GFP_KERNEL);
568 if (new == NULL)
569 return NULL;
570 nfs_init_lock_context(new);
571 spin_lock(&inode->i_lock);
572 res = __nfs_find_lock_context(ctx);
573 if (res == NULL) {
574 list_add_tail(&new->list, &ctx->lock_context.list);
575 new->open_context = ctx;
576 res = new;
577 new = NULL;
578 }
579 }
580 spin_unlock(&inode->i_lock);
581 kfree(new);
582 return res;
583}
584
585void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
586{
587 struct nfs_open_context *ctx = l_ctx->open_context;
588 struct inode *inode = ctx->path.dentry->d_inode;
589
590 if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
591 return;
592 list_del(&l_ctx->list);
593 spin_unlock(&inode->i_lock);
594 kfree(l_ctx);
595}
596
533/** 597/**
534 * nfs_close_context - Common close_context() routine NFSv2/v3 598 * nfs_close_context - Common close_context() routine NFSv2/v3
535 * @ctx: pointer to context 599 * @ctx: pointer to context
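nfs_get_lock_context() above uses the standard optimistic-allocation idiom: search under the spinlock, and on a miss drop the lock to kmalloc() (which may sleep), retake it, and search again before inserting, because another thread may have won the race in the gap. The skeleton, with hypothetical names (__obj_find, __obj_insert):

	static struct obj *get_obj(struct table *t, u64 key)
	{
		struct obj *res, *new = NULL;

		spin_lock(&t->lock);
		res = __obj_find(t, key);
		if (res == NULL) {
			spin_unlock(&t->lock);
			new = kmalloc(sizeof(*new), GFP_KERNEL); /* may sleep */
			if (new == NULL)
				return NULL;
			spin_lock(&t->lock);
			res = __obj_find(t, key);	/* re-check after the gap */
			if (res == NULL) {
				__obj_insert(t, new);
				res = new;
				new = NULL;
			}
		}
		spin_unlock(&t->lock);
		kfree(new);	/* frees only if another thread won the race */
		return res;
	}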
@@ -556,7 +620,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
556 nfs_revalidate_inode(server, inode); 620 nfs_revalidate_inode(server, inode);
557} 621}
558 622
559static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred) 623struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
560{ 624{
561 struct nfs_open_context *ctx; 625 struct nfs_open_context *ctx;
562 626
@@ -566,11 +630,13 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
566 path_get(&ctx->path); 630 path_get(&ctx->path);
567 ctx->cred = get_rpccred(cred); 631 ctx->cred = get_rpccred(cred);
568 ctx->state = NULL; 632 ctx->state = NULL;
569 ctx->lockowner = current->files; 633 ctx->mode = f_mode;
570 ctx->flags = 0; 634 ctx->flags = 0;
571 ctx->error = 0; 635 ctx->error = 0;
572 ctx->dir_cookie = 0; 636 ctx->dir_cookie = 0;
573 atomic_set(&ctx->count, 1); 637 nfs_init_lock_context(&ctx->lock_context);
638 ctx->lock_context.open_context = ctx;
639 INIT_LIST_HEAD(&ctx->list);
574 } 640 }
575 return ctx; 641 return ctx;
576} 642}
@@ -578,7 +644,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
578struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) 644struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
579{ 645{
580 if (ctx != NULL) 646 if (ctx != NULL)
581 atomic_inc(&ctx->count); 647 atomic_inc(&ctx->lock_context.count);
582 return ctx; 648 return ctx;
583} 649}
584 650
@@ -586,11 +652,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
586{ 652{
587 struct inode *inode = ctx->path.dentry->d_inode; 653 struct inode *inode = ctx->path.dentry->d_inode;
588 654
589 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) 655 if (!list_empty(&ctx->list)) {
656 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
657 return;
658 list_del(&ctx->list);
659 spin_unlock(&inode->i_lock);
660 } else if (!atomic_dec_and_test(&ctx->lock_context.count))
590 return; 661 return;
591 list_del(&ctx->list); 662 if (inode != NULL)
592 spin_unlock(&inode->i_lock); 663 NFS_PROTO(inode)->close_context(ctx, is_sync);
593 NFS_PROTO(inode)->close_context(ctx, is_sync);
594 if (ctx->cred != NULL) 664 if (ctx->cred != NULL)
595 put_rpccred(ctx->cred); 665 put_rpccred(ctx->cred);
596 path_put(&ctx->path); 666 path_put(&ctx->path);
@@ -606,7 +676,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
606 * Ensure that mmap has a recent RPC credential for use when writing out 676 * Ensure that mmap has a recent RPC credential for use when writing out
607 * shared pages 677 * shared pages
608 */ 678 */
609static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) 679void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
610{ 680{
611 struct inode *inode = filp->f_path.dentry->d_inode; 681 struct inode *inode = filp->f_path.dentry->d_inode;
612 struct nfs_inode *nfsi = NFS_I(inode); 682 struct nfs_inode *nfsi = NFS_I(inode);
@@ -663,11 +733,10 @@ int nfs_open(struct inode *inode, struct file *filp)
663 cred = rpc_lookup_cred(); 733 cred = rpc_lookup_cred();
664 if (IS_ERR(cred)) 734 if (IS_ERR(cred))
665 return PTR_ERR(cred); 735 return PTR_ERR(cred);
666 ctx = alloc_nfs_open_context(&filp->f_path, cred); 736 ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
667 put_rpccred(cred); 737 put_rpccred(cred);
668 if (ctx == NULL) 738 if (ctx == NULL)
669 return -ENOMEM; 739 return -ENOMEM;
670 ctx->mode = filp->f_mode;
671 nfs_file_set_open_context(filp, ctx); 740 nfs_file_set_open_context(filp, ctx);
672 put_nfs_open_context(ctx); 741 put_nfs_open_context(ctx);
673 nfs_fscache_set_inode_cookie(inode, filp); 742 nfs_fscache_set_inode_cookie(inode, filp);
@@ -1338,8 +1407,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1338 * to open() calls that passed nfs_atomic_lookup, but failed to call 1407 * to open() calls that passed nfs_atomic_lookup, but failed to call
1339 * nfs_open(). 1408 * nfs_open().
1340 */ 1409 */
1341void nfs4_clear_inode(struct inode *inode) 1410void nfs4_evict_inode(struct inode *inode)
1342{ 1411{
1412 truncate_inode_pages(&inode->i_data, 0);
1413 end_writeback(inode);
1414 pnfs_destroy_layout(NFS_I(inode));
1343 /* If we are holding a delegation, return it! */ 1415 /* If we are holding a delegation, return it! */
1344 nfs_inode_return_delegation_noreclaim(inode); 1416 nfs_inode_return_delegation_noreclaim(inode);
1345 /* First call standard NFS clear_inode() code */ 1417 /* First call standard NFS clear_inode() code */
@@ -1377,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
1377 nfsi->delegation = NULL; 1449 nfsi->delegation = NULL;
1378 nfsi->delegation_state = 0; 1450 nfsi->delegation_state = 0;
1379 init_rwsem(&nfsi->rwsem); 1451 init_rwsem(&nfsi->rwsem);
1452 nfsi->layout = NULL;
1380#endif 1453#endif
1381} 1454}
1382 1455
@@ -1424,7 +1497,7 @@ static int nfsiod_start(void)
1424{ 1497{
1425 struct workqueue_struct *wq; 1498 struct workqueue_struct *wq;
1426 dprintk("RPC: creating workqueue nfsiod\n"); 1499 dprintk("RPC: creating workqueue nfsiod\n");
1427 wq = create_singlethread_workqueue("nfsiod"); 1500 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
1428 if (wq == NULL) 1501 if (wq == NULL)
1429 return -ENOMEM; 1502 return -ENOMEM;
1430 nfsiod_workqueue = wq; 1503 nfsiod_workqueue = wq;
@@ -1452,6 +1525,10 @@ static int __init init_nfs_fs(void)
1452{ 1525{
1453 int err; 1526 int err;
1454 1527
1528 err = nfs_idmap_init();
1529 if (err < 0)
1530 goto out9;
1531
1455 err = nfs_dns_resolver_init(); 1532 err = nfs_dns_resolver_init();
1456 if (err < 0) 1533 if (err < 0)
1457 goto out8; 1534 goto out8;
@@ -1516,6 +1593,8 @@ out6:
1516out7: 1593out7:
1517 nfs_dns_resolver_destroy(); 1594 nfs_dns_resolver_destroy();
1518out8: 1595out8:
1596 nfs_idmap_quit();
1597out9:
1519 return err; 1598 return err;
1520} 1599}
1521 1600
@@ -1528,6 +1607,7 @@ static void __exit exit_nfs_fs(void)
1528 nfs_destroy_nfspagecache(); 1607 nfs_destroy_nfspagecache();
1529 nfs_fscache_unregister(); 1608 nfs_fscache_unregister();
1530 nfs_dns_resolver_destroy(); 1609 nfs_dns_resolver_destroy();
1610 nfs_idmap_quit();
1531#ifdef CONFIG_PROC_FS 1611#ifdef CONFIG_PROC_FS
1532 rpc_proc_unregister("nfs"); 1612 rpc_proc_unregister("nfs");
1533#endif 1613#endif
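The init_nfs_fs() changes extend the usual goto-unwind initialization ladder: the new subsystem (the idmapper) gets the first init call and the last unwind label, so teardown runs in exact reverse order of setup. The idiom in miniature, with hypothetical subsystems:

	static int __init init_example(void)
	{
		int err;

		err = subsys_a_init();
		if (err < 0)
			goto out_a;
		err = subsys_b_init();
		if (err < 0)
			goto out_b;
		return 0;

	out_b:
		subsys_a_exit();	/* unwind in reverse order of init */
	out_a:
		return err;
	}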
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e70f44b9b3f..db08ff3ff45 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,12 @@ struct nfs_clone_mount {
63#define NFS_UNSPEC_PORT (-1) 63#define NFS_UNSPEC_PORT (-1)
64 64
65/* 65/*
66 * Maximum number of pages that readdir can use for creating
67 * a vmapped array of pages.
68 */
69#define NFS_MAX_READDIR_PAGES 8
70
71/*
66 * In-kernel mount arguments 72 * In-kernel mount arguments
67 */ 73 */
68struct nfs_parsed_mount_data { 74struct nfs_parsed_mount_data {
@@ -181,15 +187,15 @@ extern void nfs_destroy_directcache(void);
181/* nfs2xdr.c */ 187/* nfs2xdr.c */
182extern int nfs_stat_to_errno(int); 188extern int nfs_stat_to_errno(int);
183extern struct rpc_procinfo nfs_procedures[]; 189extern struct rpc_procinfo nfs_procedures[];
184extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); 190extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
185 191
186/* nfs3xdr.c */ 192/* nfs3xdr.c */
187extern struct rpc_procinfo nfs3_procedures[]; 193extern struct rpc_procinfo nfs3_procedures[];
188extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); 194extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
189 195
190/* nfs4xdr.c */ 196/* nfs4xdr.c */
191#ifdef CONFIG_NFS_V4 197#ifdef CONFIG_NFS_V4
192extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 198extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
193#endif 199#endif
194#ifdef CONFIG_NFS_V4_1 200#ifdef CONFIG_NFS_V4_1
195extern const u32 nfs41_maxread_overhead; 201extern const u32 nfs41_maxread_overhead;
@@ -213,9 +219,9 @@ extern struct workqueue_struct *nfsiod_workqueue;
213extern struct inode *nfs_alloc_inode(struct super_block *sb); 219extern struct inode *nfs_alloc_inode(struct super_block *sb);
214extern void nfs_destroy_inode(struct inode *); 220extern void nfs_destroy_inode(struct inode *);
215extern int nfs_write_inode(struct inode *, struct writeback_control *); 221extern int nfs_write_inode(struct inode *, struct writeback_control *);
216extern void nfs_clear_inode(struct inode *); 222extern void nfs_evict_inode(struct inode *);
217#ifdef CONFIG_NFS_V4 223#ifdef CONFIG_NFS_V4
218extern void nfs4_clear_inode(struct inode *); 224extern void nfs4_evict_inode(struct inode *);
219#endif 225#endif
220void nfs_zap_acl_cache(struct inode *inode); 226void nfs_zap_acl_cache(struct inode *inode);
221extern int nfs_wait_bit_killable(void *word); 227extern int nfs_wait_bit_killable(void *word);
@@ -370,10 +376,9 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
370 * Helper for restarting RPC calls in the possible presence of NFSv4.1 376 * Helper for restarting RPC calls in the possible presence of NFSv4.1
371 * sessions. 377 * sessions.
372 */ 378 */
373static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) 379static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
374{ 380{
375 if (nfs4_has_session(clp)) 381 if (nfs4_has_session(clp))
376 rpc_restart_call_prepare(task); 382 return rpc_restart_call_prepare(task);
377 else 383 return rpc_restart_call(task);
378 rpc_restart_call(task);
379} 384}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d7..eceafe74f47 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
153 .rpc_resp = &result, 153 .rpc_resp = &result,
154 }; 154 };
155 struct rpc_create_args args = { 155 struct rpc_create_args args = {
156 .net = &init_net,
156 .protocol = info->protocol, 157 .protocol = info->protocol,
157 .address = info->sap, 158 .address = info->sap,
158 .addrsize = info->salen, 159 .addrsize = info->salen,
@@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
224 .to_retries = 2, 225 .to_retries = 2,
225 }; 226 };
226 struct rpc_create_args args = { 227 struct rpc_create_args args = {
228 .net = &init_net,
227 .protocol = IPPROTO_UDP, 229 .protocol = IPPROTO_UDP,
228 .address = info->sap, 230 .address = info->sap,
229 .addrsize = info->salen, 231 .addrsize = info->salen,
@@ -436,7 +438,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
436 438
437 for (i = 0; i < entries; i++) { 439 for (i = 0; i < entries; i++) {
438 flavors[i] = ntohl(*p++); 440 flavors[i] = ntohl(*p++);
439 dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]); 441 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
440 } 442 }
441 *count = i; 443 *count = i;
442 444
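Both RPC client creations gain an explicit network namespace: rpc_create() callers now say which netns the transport socket belongs to, and mount requests pin the initial namespace. The shape of the call, trimmed to the relevant fields of the hunks above:

	struct rpc_create_args args = {
		.net		= &init_net,	/* netns owning the socket */
		.protocol	= IPPROTO_UDP,
		.address	= info->sap,
		.addrsize	= info->salen,
		/* ... remaining fields as in the hunks above ... */
	};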
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 81cf1425791..e6bf45710cc 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs
233static int 233static int
234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
235{ 235{
236 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 236 struct rpc_auth *auth = req->rq_cred->cr_auth;
237 unsigned int replen; 237 unsigned int replen;
238 u32 offset = (u32)args->offset; 238 u32 offset = (u32)args->offset;
239 u32 count = args->count; 239 u32 count = args->count;
@@ -337,10 +337,10 @@ nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
337static int 337static int
338nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) 338nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
339{ 339{
340 p = xdr_encode_fhandle(p, args->fromfh); 340 p = xdr_encode_fhandle(p, args->old_dir);
341 p = xdr_encode_array(p, args->fromname, args->fromlen); 341 p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
342 p = xdr_encode_fhandle(p, args->tofh); 342 p = xdr_encode_fhandle(p, args->new_dir);
343 p = xdr_encode_array(p, args->toname, args->tolen); 343 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
344 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 344 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
345 return 0; 345 return 0;
346} 346}
@@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg
393static int 393static int
394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) 394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
395{ 395{
396 struct rpc_task *task = req->rq_task; 396 struct rpc_auth *auth = req->rq_cred->cr_auth;
397 struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth;
398 unsigned int replen; 397 unsigned int replen;
399 u32 count = args->count; 398 u32 count = args->count;
400 399
@@ -424,9 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
424 struct page **page; 423 struct page **page;
425 size_t hdrlen; 424 size_t hdrlen;
426 unsigned int pglen, recvd; 425 unsigned int pglen, recvd;
427 u32 len;
428 int status, nr = 0; 426 int status, nr = 0;
429 __be32 *end, *entry, *kaddr;
430 427
431 if ((status = ntohl(*p++))) 428 if ((status = ntohl(*p++)))
432 return nfs_stat_to_errno(status); 429 return nfs_stat_to_errno(status);
@@ -446,80 +443,59 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
446 if (pglen > recvd) 443 if (pglen > recvd)
447 pglen = recvd; 444 pglen = recvd;
448 page = rcvbuf->pages; 445 page = rcvbuf->pages;
449 kaddr = p = kmap_atomic(*page, KM_USER0);
450 end = (__be32 *)((char *)p + pglen);
451 entry = p;
452
453 /* Make sure the packet actually has a value_follows and EOF entry */
454 if ((entry + 1) > end)
455 goto short_pkt;
456
457 for (; *p++; nr++) {
458 if (p + 2 > end)
459 goto short_pkt;
460 p++; /* fileid */
461 len = ntohl(*p++);
462 p += XDR_QUADLEN(len) + 1; /* name plus cookie */
463 if (len > NFS2_MAXNAMLEN) {
464 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
465 len);
466 goto err_unmap;
467 }
468 if (p + 2 > end)
469 goto short_pkt;
470 entry = p;
471 }
472
473 /*
474 * Apparently some server sends responses that are a valid size, but
475 * contain no entries, and have value_follows==0 and EOF==0. For
476 * those, just set the EOF marker.
477 */
478 if (!nr && entry[1] == 0) {
479 dprintk("NFS: readdir reply truncated!\n");
480 entry[1] = 1;
481 }
482 out:
483 kunmap_atomic(kaddr, KM_USER0);
484 return nr; 446 return nr;
485 short_pkt: 447}
486 /* 448
487 * When we get a short packet there are 2 possibilities. We can 449static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
488 * return an error, or fix up the response to look like a valid 450{
489 * response and return what we have so far. If there are no 451 dprintk("nfs: %s: prematurely hit end of receive buffer. "
490 * entries and the packet was short, then return -EIO. If there 452 "Remaining buffer length is %tu words.\n",
491 * are valid entries in the response, return them and pretend that 453 func, xdr->end - xdr->p);
492 * the call was successful, but incomplete. The caller can retry the
493 * readdir starting at the last cookie.
494 */
495 entry[0] = entry[1] = 0;
496 if (!nr)
497 nr = -errno_NFSERR_IO;
498 goto out;
499err_unmap:
500 nr = -errno_NFSERR_IO;
501 goto out;
502} 454}
503 455
504__be32 * 456__be32 *
505nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 457nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
506{ 458{
507 if (!*p++) { 459 __be32 *p;
508 if (!*p) 460 p = xdr_inline_decode(xdr, 4);
461 if (unlikely(!p))
462 goto out_overflow;
463 if (!ntohl(*p++)) {
464 p = xdr_inline_decode(xdr, 4);
465 if (unlikely(!p))
466 goto out_overflow;
467 if (!ntohl(*p++))
509 return ERR_PTR(-EAGAIN); 468 return ERR_PTR(-EAGAIN);
510 entry->eof = 1; 469 entry->eof = 1;
511 return ERR_PTR(-EBADCOOKIE); 470 return ERR_PTR(-EBADCOOKIE);
512 } 471 }
513 472
473 p = xdr_inline_decode(xdr, 8);
474 if (unlikely(!p))
475 goto out_overflow;
476
514 entry->ino = ntohl(*p++); 477 entry->ino = ntohl(*p++);
515 entry->len = ntohl(*p++); 478 entry->len = ntohl(*p++);
479
480 p = xdr_inline_decode(xdr, entry->len + 4);
481 if (unlikely(!p))
482 goto out_overflow;
516 entry->name = (const char *) p; 483 entry->name = (const char *) p;
517 p += XDR_QUADLEN(entry->len); 484 p += XDR_QUADLEN(entry->len);
518 entry->prev_cookie = entry->cookie; 485 entry->prev_cookie = entry->cookie;
519 entry->cookie = ntohl(*p++); 486 entry->cookie = ntohl(*p++);
520 entry->eof = !p[0] && p[1]; 487
488 p = xdr_inline_peek(xdr, 8);
489 if (p != NULL)
490 entry->eof = !p[0] && p[1];
491 else
492 entry->eof = 0;
521 493
522 return p; 494 return p;
495
496out_overflow:
497 print_overflow_msg(__func__, xdr);
498 return ERR_PTR(-EIO);
523} 499}
524 500
525/* 501/*
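The rewritten decoder shows the xdr_stream discipline that replaces the old open-coded pointer walk: every read is preceded by xdr_inline_decode(xdr, nbytes), which returns NULL when the read would run past the receive buffer, so a truncated reply can no longer walk off the end of the mapped pages. The pattern, sketched generically:

	static int decode_u32_pair(struct xdr_stream *xdr, u32 *a, u32 *b)
	{
		__be32 *p;

		/* Reserve 8 bytes; NULL means the buffer is exhausted. */
		p = xdr_inline_decode(xdr, 8);
		if (unlikely(!p))
			return -EIO;	/* truncated reply, bail out safely */
		*a = ntohl(*p++);
		*b = ntohl(*p);
		return 0;
	}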
@@ -575,7 +551,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
575static int 551static int
576nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) 552nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
577{ 553{
578 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 554 struct rpc_auth *auth = req->rq_cred->cr_auth;
579 unsigned int replen; 555 unsigned int replen;
580 556
581 p = xdr_encode_fhandle(p, args->fh); 557 p = xdr_encode_fhandle(p, args->fh);
@@ -597,7 +573,6 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
597 struct kvec *iov = rcvbuf->head; 573 struct kvec *iov = rcvbuf->head;
598 size_t hdrlen; 574 size_t hdrlen;
599 u32 len, recvd; 575 u32 len, recvd;
600 char *kaddr;
601 int status; 576 int status;
602 577
603 if ((status = ntohl(*p++))) 578 if ((status = ntohl(*p++)))
@@ -624,10 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
624 return -EIO; 599 return -EIO;
625 } 600 }
626 601
627 /* NULL terminate the string we got */ 602 xdr_terminate_string(rcvbuf, len);
628 kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
629 kaddr[len+rcvbuf->page_base] = '\0';
630 kunmap_atomic(kaddr, KM_USER0);
631 return 0; 603 return 0;
632} 604}
633 605
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fabb4f2849a..ce939c062a5 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
313 */ 313 */
314static int 314static int
315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
316 int flags, struct nameidata *nd) 316 int flags, struct nfs_open_context *ctx)
317{ 317{
318 struct nfs3_createdata *data; 318 struct nfs3_createdata *data;
319 mode_t mode = sattr->ia_mode; 319 mode_t mode = sattr->ia_mode;
@@ -438,19 +438,38 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
438 return 1; 438 return 1;
439} 439}
440 440
441static void
442nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
443{
444 msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
445}
446
447static int
448nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
449 struct inode *new_dir)
450{
451 struct nfs_renameres *res;
452
453 if (nfs3_async_handle_jukebox(task, old_dir))
454 return 0;
455 res = task->tk_msg.rpc_resp;
456
457 nfs_post_op_update_inode(old_dir, res->old_fattr);
458 nfs_post_op_update_inode(new_dir, res->new_fattr);
459 return 1;
460}
461
441static int 462static int
442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 463nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
443 struct inode *new_dir, struct qstr *new_name) 464 struct inode *new_dir, struct qstr *new_name)
444{ 465{
445 struct nfs3_renameargs arg = { 466 struct nfs_renameargs arg = {
446 .fromfh = NFS_FH(old_dir), 467 .old_dir = NFS_FH(old_dir),
447 .fromname = old_name->name, 468 .old_name = old_name,
448 .fromlen = old_name->len, 469 .new_dir = NFS_FH(new_dir),
449 .tofh = NFS_FH(new_dir), 470 .new_name = new_name,
450 .toname = new_name->name,
451 .tolen = new_name->len
452 }; 471 };
453 struct nfs3_renameres res; 472 struct nfs_renameres res;
454 struct rpc_message msg = { 473 struct rpc_message msg = {
455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 474 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
456 .rpc_argp = &arg, 475 .rpc_argp = &arg,
@@ -460,17 +479,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
460 479
461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 480 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
462 481
463 res.fromattr = nfs_alloc_fattr(); 482 res.old_fattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr(); 483 res.new_fattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL) 484 if (res.old_fattr == NULL || res.new_fattr == NULL)
466 goto out; 485 goto out;
467 486
468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 487 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
469 nfs_post_op_update_inode(old_dir, res.fromattr); 488 nfs_post_op_update_inode(old_dir, res.old_fattr);
470 nfs_post_op_update_inode(new_dir, res.toattr); 489 nfs_post_op_update_inode(new_dir, res.new_fattr);
471out: 490out:
472 nfs_free_fattr(res.toattr); 491 nfs_free_fattr(res.old_fattr);
473 nfs_free_fattr(res.fromattr); 492 nfs_free_fattr(res.new_fattr);
474 dprintk("NFS reply rename: %d\n", status); 493 dprintk("NFS reply rename: %d\n", status);
475 return status; 494 return status;
476} 495}
@@ -611,7 +630,7 @@ out:
611 */ 630 */
612static int 631static int
613nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 632nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
614 u64 cookie, struct page *page, unsigned int count, int plus) 633 u64 cookie, struct page **pages, unsigned int count, int plus)
615{ 634{
616 struct inode *dir = dentry->d_inode; 635 struct inode *dir = dentry->d_inode;
617 __be32 *verf = NFS_COOKIEVERF(dir); 636 __be32 *verf = NFS_COOKIEVERF(dir);
@@ -621,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
621 .verf = {verf[0], verf[1]}, 640 .verf = {verf[0], verf[1]},
622 .plus = plus, 641 .plus = plus,
623 .count = count, 642 .count = count,
624 .pages = &page 643 .pages = pages
625 }; 644 };
626 struct nfs3_readdirres res = { 645 struct nfs3_readdirres res = {
627 .verf = verf, 646 .verf = verf,
@@ -652,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
652 671
653 nfs_free_fattr(res.dir_attr); 672 nfs_free_fattr(res.dir_attr);
654out: 673out:
655 dprintk("NFS reply readdir: %d\n", status); 674 dprintk("NFS reply readdir%s: %d\n",
675 plus? "plus" : "", status);
656 return status; 676 return status;
657} 677}
658 678
@@ -722,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
722 dprintk("NFS call fsstat\n"); 742 dprintk("NFS call fsstat\n");
723 nfs_fattr_init(stat->fattr); 743 nfs_fattr_init(stat->fattr);
724 status = rpc_call_sync(server->client, &msg, 0); 744 status = rpc_call_sync(server->client, &msg, 0);
725 dprintk("NFS reply statfs: %d\n", status); 745 dprintk("NFS reply fsstat: %d\n", status);
726 return status; 746 return status;
727} 747}
728 748
@@ -844,6 +864,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
844 .unlink_setup = nfs3_proc_unlink_setup, 864 .unlink_setup = nfs3_proc_unlink_setup,
845 .unlink_done = nfs3_proc_unlink_done, 865 .unlink_done = nfs3_proc_unlink_done,
846 .rename = nfs3_proc_rename, 866 .rename = nfs3_proc_rename,
867 .rename_setup = nfs3_proc_rename_setup,
868 .rename_done = nfs3_proc_rename_done,
847 .link = nfs3_proc_link, 869 .link = nfs3_proc_link,
848 .symlink = nfs3_proc_symlink, 870 .symlink = nfs3_proc_symlink,
849 .mkdir = nfs3_proc_mkdir, 871 .mkdir = nfs3_proc_mkdir,
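The new rename_setup/rename_done rpc_ops mirror the existing unlink pair, letting a generic caller issue RENAME asynchronously without knowing the protocol version. A sketch of the assumed calling convention, modelled on the unlink path (the exact generic caller is not part of this hunk): rename_done returning 0 means the call should be retried, e.g. after a v3 jukebox delay.

	/* sketch: hypothetical async caller */
	NFS_PROTO(old_dir)->rename_setup(&msg, old_dir);   /* picks the proc */
	/* ... rpc_run_task() ... then, in the completion callback: */
	if (NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir) == 0)
		rpc_restart_call_prepare(task);	/* retry, e.g. NFS3ERR_JUKEBOX */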
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 75dcfc7da36..d9a5e832c25 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -100,6 +100,13 @@ static const umode_t nfs_type2fmt[] = {
100 [NF3FIFO] = S_IFIFO, 100 [NF3FIFO] = S_IFIFO,
101}; 101};
102 102
103static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
104{
105 dprintk("nfs: %s: prematurely hit end of receive buffer. "
106 "Remaining buffer length is %tu words.\n",
107 func, xdr->end - xdr->p);
108}
109
103/* 110/*
104 * Common NFS XDR functions as inlines 111 * Common NFS XDR functions as inlines
105 */ 112 */
@@ -119,6 +126,29 @@ xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
119 return NULL; 126 return NULL;
120} 127}
121 128
129static inline __be32 *
130xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
131{
132 __be32 *p;
133 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(!p))
135 goto out_overflow;
136 fh->size = ntohl(*p++);
137
138 if (fh->size <= NFS3_FHSIZE) {
139 p = xdr_inline_decode(xdr, fh->size);
140 if (unlikely(!p))
141 goto out_overflow;
142 memcpy(fh->data, p, fh->size);
143 return p + XDR_QUADLEN(fh->size);
144 }
145 return NULL;
146
147out_overflow:
148 print_overflow_msg(__func__, xdr);
149 return ERR_PTR(-EIO);
150}
151
122/* 152/*
123 * Encode/decode time. 153 * Encode/decode time.
124 */ 154 */
@@ -241,6 +271,26 @@ xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
241} 271}
242 272
243static inline __be32 * 273static inline __be32 *
274xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
275{
276 __be32 *p;
277
278 p = xdr_inline_decode(xdr, 4);
279 if (unlikely(!p))
280 goto out_overflow;
281 if (ntohl(*p++)) {
282 p = xdr_inline_decode(xdr, 84);
283 if (unlikely(!p))
284 goto out_overflow;
285 p = xdr_decode_fattr(p, fattr);
286 }
287 return p;
288out_overflow:
289 print_overflow_msg(__func__, xdr);
290 return ERR_PTR(-EIO);
291}
292
293static inline __be32 *
244xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr) 294xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
245{ 295{
246 if (*p++) 296 if (*p++)
@@ -330,7 +380,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg
330static int 380static int
331nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 381nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
332{ 382{
333 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 383 struct rpc_auth *auth = req->rq_cred->cr_auth;
334 unsigned int replen; 384 unsigned int replen;
335 u32 count = args->count; 385 u32 count = args->count;
336 386
@@ -442,12 +492,12 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
442 * Encode RENAME arguments 492 * Encode RENAME arguments
443 */ 493 */
444static int 494static int
445nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args) 495nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
446{ 496{
447 p = xdr_encode_fhandle(p, args->fromfh); 497 p = xdr_encode_fhandle(p, args->old_dir);
448 p = xdr_encode_array(p, args->fromname, args->fromlen); 498 p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
449 p = xdr_encode_fhandle(p, args->tofh); 499 p = xdr_encode_fhandle(p, args->new_dir);
450 p = xdr_encode_array(p, args->toname, args->tolen); 500 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
451 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 501 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
452 return 0; 502 return 0;
453} 503}
@@ -471,7 +521,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
471static int 521static int
472nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) 522nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
473{ 523{
474 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 524 struct rpc_auth *auth = req->rq_cred->cr_auth;
475 unsigned int replen; 525 unsigned int replen;
476 u32 count = args->count; 526 u32 count = args->count;
477 527
@@ -504,9 +554,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
504 struct kvec *iov = rcvbuf->head; 554 struct kvec *iov = rcvbuf->head;
505 struct page **page; 555 struct page **page;
506 size_t hdrlen; 556 size_t hdrlen;
507 u32 len, recvd, pglen; 557 u32 recvd, pglen;
508 int status, nr = 0; 558 int status, nr = 0;
509 __be32 *entry, *end, *kaddr;
510 559
511 status = ntohl(*p++); 560 status = ntohl(*p++);
512 /* Decode post_op_attrs */ 561 /* Decode post_op_attrs */
@@ -536,99 +585,38 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
536 if (pglen > recvd) 585 if (pglen > recvd)
537 pglen = recvd; 586 pglen = recvd;
538 page = rcvbuf->pages; 587 page = rcvbuf->pages;
539 kaddr = p = kmap_atomic(*page, KM_USER0);
540 end = (__be32 *)((char *)p + pglen);
541 entry = p;
542
543 /* Make sure the packet actually has a value_follows and EOF entry */
544 if ((entry + 1) > end)
545 goto short_pkt;
546
547 for (; *p++; nr++) {
548 if (p + 3 > end)
549 goto short_pkt;
550 p += 2; /* inode # */
551 len = ntohl(*p++); /* string length */
552 p += XDR_QUADLEN(len) + 2; /* name + cookie */
553 if (len > NFS3_MAXNAMLEN) {
554 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
555 len);
556 goto err_unmap;
557 }
558 588
559 if (res->plus) {
560 /* post_op_attr */
561 if (p + 2 > end)
562 goto short_pkt;
563 if (*p++) {
564 p += 21;
565 if (p + 1 > end)
566 goto short_pkt;
567 }
568 /* post_op_fh3 */
569 if (*p++) {
570 if (p + 1 > end)
571 goto short_pkt;
572 len = ntohl(*p++);
573 if (len > NFS3_FHSIZE) {
574 dprintk("NFS: giant filehandle in "
575 "readdir (len 0x%x)!\n", len);
576 goto err_unmap;
577 }
578 p += XDR_QUADLEN(len);
579 }
580 }
581
582 if (p + 2 > end)
583 goto short_pkt;
584 entry = p;
585 }
586
587 /*
588 * Apparently some server sends responses that are a valid size, but
589 * contain no entries, and have value_follows==0 and EOF==0. For
590 * those, just set the EOF marker.
591 */
592 if (!nr && entry[1] == 0) {
593 dprintk("NFS: readdir reply truncated!\n");
594 entry[1] = 1;
595 }
596 out:
597 kunmap_atomic(kaddr, KM_USER0);
598 return nr; 589 return nr;
599 short_pkt:
600 /*
601 * When we get a short packet there are 2 possibilities. We can
602 * return an error, or fix up the response to look like a valid
603 * response and return what we have so far. If there are no
604 * entries and the packet was short, then return -EIO. If there
605 * are valid entries in the response, return them and pretend that
606 * the call was successful, but incomplete. The caller can retry the
607 * readdir starting at the last cookie.
608 */
609 entry[0] = entry[1] = 0;
610 if (!nr)
611 nr = -errno_NFSERR_IO;
612 goto out;
613err_unmap:
614 nr = -errno_NFSERR_IO;
615 goto out;
616} 590}
617 591
618__be32 * 592__be32 *
619nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 593nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
620{ 594{
595 __be32 *p;
621 struct nfs_entry old = *entry; 596 struct nfs_entry old = *entry;
622 597
623 if (!*p++) { 598 p = xdr_inline_decode(xdr, 4);
624 if (!*p) 599 if (unlikely(!p))
600 goto out_overflow;
601 if (!ntohl(*p++)) {
602 p = xdr_inline_decode(xdr, 4);
603 if (unlikely(!p))
604 goto out_overflow;
605 if (!ntohl(*p++))
625 return ERR_PTR(-EAGAIN); 606 return ERR_PTR(-EAGAIN);
626 entry->eof = 1; 607 entry->eof = 1;
627 return ERR_PTR(-EBADCOOKIE); 608 return ERR_PTR(-EBADCOOKIE);
628 } 609 }
629 610
611 p = xdr_inline_decode(xdr, 12);
612 if (unlikely(!p))
613 goto out_overflow;
630 p = xdr_decode_hyper(p, &entry->ino); 614 p = xdr_decode_hyper(p, &entry->ino);
631 entry->len = ntohl(*p++); 615 entry->len = ntohl(*p++);
616
617 p = xdr_inline_decode(xdr, entry->len + 8);
618 if (unlikely(!p))
619 goto out_overflow;
632 entry->name = (const char *) p; 620 entry->name = (const char *) p;
633 p += XDR_QUADLEN(entry->len); 621 p += XDR_QUADLEN(entry->len);
634 entry->prev_cookie = entry->cookie; 622 entry->prev_cookie = entry->cookie;
@@ -636,10 +624,17 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
636 624
637 if (plus) { 625 if (plus) {
638 entry->fattr->valid = 0; 626 entry->fattr->valid = 0;
639 p = xdr_decode_post_op_attr(p, entry->fattr); 627 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
628 if (IS_ERR(p))
629 goto out_overflow_exit;
640 /* In fact, a post_op_fh3: */ 630 /* In fact, a post_op_fh3: */
631 p = xdr_inline_decode(xdr, 4);
632 if (unlikely(!p))
633 goto out_overflow;
641 if (*p++) { 634 if (*p++) {
642 p = xdr_decode_fhandle(p, entry->fh); 635 p = xdr_decode_fhandle_stream(xdr, entry->fh);
636 if (IS_ERR(p))
637 goto out_overflow_exit;
643 /* Ugh -- server reply was truncated */ 638 /* Ugh -- server reply was truncated */
644 if (p == NULL) { 639 if (p == NULL) {
645 dprintk("NFS: FH truncated\n"); 640 dprintk("NFS: FH truncated\n");
@@ -650,8 +645,18 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
650 memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); 645 memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
651 } 646 }
652 647
653 entry->eof = !p[0] && p[1]; 648 p = xdr_inline_peek(xdr, 8);
649 if (p != NULL)
650 entry->eof = !p[0] && p[1];
651 else
652 entry->eof = 0;
653
654 return p; 654 return p;
655
656out_overflow:
657 print_overflow_msg(__func__, xdr);
658out_overflow_exit:
659 return ERR_PTR(-EIO);
655} 660}
656 661
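The rewrite above replaces raw pointer arithmetic with xdr_stream accessors: xdr_inline_decode(xdr, n) hands back a pointer to n contiguous bytes, or NULL once the receive buffer is exhausted, so every variable-length field is bounds-checked before it is touched. A minimal sketch of that pattern for a hypothetical fixed-size record (names invented):

	static __be32 *example_decode_record(struct xdr_stream *xdr,
					     u32 *a, u32 *b, u64 *c)
	{
		__be32 *p;

		/* two 4-byte words plus one 8-byte hyper */
		p = xdr_inline_decode(xdr, 4 + 4 + 8);
		if (unlikely(!p))
			return ERR_PTR(-EIO);	/* caller logs the overflow */
		*a = ntohl(*p++);
		*b = ntohl(*p++);
		p = xdr_decode_hyper(p, c);
		return p;
	}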
657/* 662/*
@@ -675,7 +680,7 @@ static int
675nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, 680nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
676 struct nfs3_getaclargs *args) 681 struct nfs3_getaclargs *args)
677{ 682{
678 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 683 struct rpc_auth *auth = req->rq_cred->cr_auth;
679 unsigned int replen; 684 unsigned int replen;
680 685
681 p = xdr_encode_fhandle(p, args->fh); 686 p = xdr_encode_fhandle(p, args->fh);
@@ -802,7 +807,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
802static int 807static int
803nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 808nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
804{ 809{
805 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 810 struct rpc_auth *auth = req->rq_cred->cr_auth;
806 unsigned int replen; 811 unsigned int replen;
807 812
808 p = xdr_encode_fhandle(p, args->fh); 813 p = xdr_encode_fhandle(p, args->fh);
@@ -824,7 +829,6 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
824 struct kvec *iov = rcvbuf->head; 829 struct kvec *iov = rcvbuf->head;
825 size_t hdrlen; 830 size_t hdrlen;
826 u32 len, recvd; 831 u32 len, recvd;
827 char *kaddr;
828 int status; 832 int status;
829 833
830 status = ntohl(*p++); 834 status = ntohl(*p++);
@@ -857,10 +861,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
857 return -EIO; 861 return -EIO;
858 } 862 }
859 863
860 /* NULL terminate the string we got */ 864 xdr_terminate_string(rcvbuf, len);
861 kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0);
862 kaddr[len+rcvbuf->page_base] = '\0';
863 kunmap_atomic(kaddr, KM_USER0);
864 return 0; 865 return 0;
865} 866}
866 867
@@ -970,14 +971,14 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
970 * Decode RENAME reply 971 * Decode RENAME reply
971 */ 972 */
972static int 973static int
973nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) 974nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
974{ 975{
975 int status; 976 int status;
976 977
977 if ((status = ntohl(*p++)) != 0) 978 if ((status = ntohl(*p++)) != 0)
978 status = nfs_stat_to_errno(status); 979 status = nfs_stat_to_errno(status);
979 p = xdr_decode_wcc_data(p, res->fromattr); 980 p = xdr_decode_wcc_data(p, res->old_fattr);
980 p = xdr_decode_wcc_data(p, res->toattr); 981 p = xdr_decode_wcc_data(p, res->new_fattr);
981 return status; 982 return status;
982} 983}
983 984
@@ -1043,8 +1044,9 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
1043 res->wtmult = ntohl(*p++); 1044 res->wtmult = ntohl(*p++);
1044 res->dtpref = ntohl(*p++); 1045 res->dtpref = ntohl(*p++);
1045 p = xdr_decode_hyper(p, &res->maxfilesize); 1046 p = xdr_decode_hyper(p, &res->maxfilesize);
1047 p = xdr_decode_time3(p, &res->time_delta);
1046 1048
1047 /* ignore time_delta and properties */ 1049 /* ignore properties */
1048 res->lease_time = 0; 1050 res->lease_time = 0;
1049 return 0; 1051 return 0;
1050} 1052}
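FSINFO now consumes the time_delta triple instead of skipping it, feeding the server's timestamp-precision hint back to the client. A sketch of the time3 decode assumed above (nfstime3 is a seconds/nanoseconds pair per RFC 1813; the helper name is illustrative):

	static inline __be32 *example_decode_time3(__be32 *p, struct timespec *ts)
	{
		ts->tv_sec  = ntohl(*p++);
		ts->tv_nsec = ntohl(*p++);
		return p;
	}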
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c538c6106e1..9fa496387fd 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -45,10 +45,29 @@ enum nfs4_client_state {
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_RESET, 47 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_SESSION_DRAINING,
49 NFS4CLNT_RECALL_SLOT, 48 NFS4CLNT_RECALL_SLOT,
50}; 49};
51 50
51enum nfs4_session_state {
52 NFS4_SESSION_INITING,
53 NFS4_SESSION_DRAINING,
54};
55
56struct nfs4_minor_version_ops {
57 u32 minor_version;
58
59 int (*call_sync)(struct nfs_server *server,
60 struct rpc_message *msg,
61 struct nfs4_sequence_args *args,
62 struct nfs4_sequence_res *res,
63 int cache_reply);
64 int (*validate_stateid)(struct nfs_delegation *,
65 const nfs4_stateid *);
66 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
67 const struct nfs4_state_recovery_ops *nograce_recovery_ops;
68 const struct nfs4_state_maintenance_ops *state_renewal_ops;
69};
70
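With this table, minor-version-specific behaviour hangs off one pointer instead of per-call conditionals. A representative sketch of a v4.0 instance; the handler names here are assumed, not taken from this hunk:

	static const struct nfs4_minor_version_ops example_v4_0_minor_ops = {
		.minor_version = 0,
		.call_sync = _nfs4_call_sync,
		.validate_stateid = nfs4_validate_delegation_stateid,
		.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
		.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
		.state_renewal_ops = &nfs40_state_renewal_ops,
	};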
52/* 71/*
53 * struct rpc_sequence ensures that RPC calls are sent in the exact 72 * struct rpc_sequence ensures that RPC calls are sent in the exact
54 * order that they appear on the list. 73 * order that they appear on the list.
@@ -89,7 +108,6 @@ struct nfs_unique_id {
89 */ 108 */
90struct nfs4_state_owner { 109struct nfs4_state_owner {
91 struct nfs_unique_id so_owner_id; 110 struct nfs_unique_id so_owner_id;
92 struct nfs_client *so_client;
93 struct nfs_server *so_server; 111 struct nfs_server *so_server;
94 struct rb_node so_client_node; 112 struct rb_node so_client_node;
95 113
@@ -99,7 +117,6 @@ struct nfs4_state_owner {
99 atomic_t so_count; 117 atomic_t so_count;
100 unsigned long so_flags; 118 unsigned long so_flags;
101 struct list_head so_states; 119 struct list_head so_states;
102 struct list_head so_delegations;
103 struct nfs_seqid_counter so_seqid; 120 struct nfs_seqid_counter so_seqid;
104 struct rpc_sequence so_sequence; 121 struct rpc_sequence so_sequence;
105}; 122};
@@ -125,10 +142,20 @@ enum {
125 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) 142 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
126 */ 143 */
127 144
145struct nfs4_lock_owner {
146 unsigned int lo_type;
147#define NFS4_ANY_LOCK_TYPE (0U)
148#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
149#define NFS4_POSIX_LOCK_TYPE (1U << 1)
150 union {
151 fl_owner_t posix_owner;
152 pid_t flock_owner;
153 } lo_u;
154};
155
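The lo_type flag is what makes the union safe to read back. A hedged sketch (the helper is invented here) of how a lock-state lookup would discriminate the two owner kinds:

	static bool example_lock_owner_matches(const struct nfs4_lock_owner *lo,
					       fl_owner_t owner, pid_t pid)
	{
		switch (lo->lo_type) {
		case NFS4_POSIX_LOCK_TYPE:
			return lo->lo_u.posix_owner == owner;
		case NFS4_FLOCK_LOCK_TYPE:
			return lo->lo_u.flock_owner == pid;
		}
		return false;	/* NFS4_ANY_LOCK_TYPE needs wildcard handling */
	}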
128struct nfs4_lock_state { 156struct nfs4_lock_state {
129 struct list_head ls_locks; /* Other lock stateids */ 157 struct list_head ls_locks; /* Other lock stateids */
130 struct nfs4_state * ls_state; /* Pointer to open state */ 158 struct nfs4_state * ls_state; /* Pointer to open state */
131 fl_owner_t ls_owner; /* POSIX lock owner */
132#define NFS_LOCK_INITIALIZED 1 159#define NFS_LOCK_INITIALIZED 1
133 int ls_flags; 160 int ls_flags;
134 struct nfs_seqid_counter ls_seqid; 161 struct nfs_seqid_counter ls_seqid;
@@ -136,6 +163,7 @@ struct nfs4_lock_state {
136 struct nfs_unique_id ls_id; 163 struct nfs_unique_id ls_id;
137 nfs4_stateid ls_stateid; 164 nfs4_stateid ls_stateid;
138 atomic_t ls_count; 165 atomic_t ls_count;
166 struct nfs4_lock_owner ls_owner;
139}; 167};
140 168
141/* bits for nfs4_state->flags */ 169/* bits for nfs4_state->flags */
@@ -214,16 +242,18 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); 244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 245extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
220extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 246extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
221 struct nfs4_fs_locations *fs_locations, struct page *page); 247 struct nfs4_fs_locations *fs_locations, struct page *page);
248extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
222 249
223extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[];
224extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[];
225#if defined(CONFIG_NFS_V4_1) 250#if defined(CONFIG_NFS_V4_1)
226extern int nfs4_setup_sequence(struct nfs_client *clp, 251static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
252{
253 return server->nfs_client->cl_session;
254}
255
256extern int nfs4_setup_sequence(const struct nfs_server *server,
227 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 257 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
228 int cache_reply, struct rpc_task *task); 258 int cache_reply, struct rpc_task *task);
229extern void nfs4_destroy_session(struct nfs4_session *session); 259extern void nfs4_destroy_session(struct nfs4_session *session);
@@ -234,7 +264,12 @@ extern int nfs4_init_session(struct nfs_server *server);
234extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 264extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
235 struct nfs_fsinfo *fsinfo); 265 struct nfs_fsinfo *fsinfo);
236#else /* CONFIG_NFS_v4_1 */ 266#else /* CONFIG_NFS_v4_1 */
237static inline int nfs4_setup_sequence(struct nfs_client *clp, 267static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
268{
269 return NULL;
270}
271
272static inline int nfs4_setup_sequence(const struct nfs_server *server,
238 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 273 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
239 int cache_reply, struct rpc_task *task) 274 int cache_reply, struct rpc_task *task)
240{ 275{
@@ -247,7 +282,7 @@ static inline int nfs4_init_session(struct nfs_server *server)
247} 282}
248#endif /* CONFIG_NFS_V4_1 */ 283#endif /* CONFIG_NFS_V4_1 */
249 284
250extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; 285extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
251 286
252extern const u32 nfs4_fattr_bitmap[2]; 287extern const u32 nfs4_fattr_bitmap[2];
253extern const u32 nfs4_statfs_bitmap[2]; 288extern const u32 nfs4_statfs_bitmap[2];
@@ -284,7 +319,7 @@ extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
284extern void nfs41_handle_recall_slot(struct nfs_client *clp); 319extern void nfs41_handle_recall_slot(struct nfs_client *clp);
285extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 320extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 321extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 322extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
288 323
289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); 324extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 325extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -296,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
296extern const nfs4_stateid zero_stateid; 331extern const nfs4_stateid zero_stateid;
297 332
298/* nfs4xdr.c */ 333/* nfs4xdr.c */
299extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 334extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
300extern struct rpc_procinfo nfs4_procedures[]; 335extern struct rpc_procinfo nfs4_procedures[];
301 336
302struct nfs4_mount_data; 337struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 00000000000..2e92f0d8d65
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,280 @@
1/*
2 * Module for the pnfs nfs4 file layout driver.
3 * Defines all I/O and Policy interface operations, plus code
4 * to register itself with the pNFS client.
5 *
6 * Copyright (c) 2002
7 * The Regents of the University of Michigan
8 * All Rights Reserved
9 *
10 * Dean Hildebrand <dhildebz@umich.edu>
11 *
12 * Permission is granted to use, copy, create derivative works, and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the University of Michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. If
17 * the above copyright notice or any other identification of the
18 * University of Michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * This software is provided as is, without representation or warranty
22 * of any kind either express or implied, including without limitation
23 * the implied warranties of merchantability, fitness for a particular
24 * purpose, or noninfringement. The Regents of the University of
25 * Michigan shall not be liable for any damages, including special,
26 * indirect, incidental, or consequential damages, with respect to any
27 * claim arising out of or in connection with the use of the software,
28 * even if it has been or is hereafter advised of the possibility of
29 * such damages.
30 */
31
32#include <linux/nfs_fs.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42
43static int
44filelayout_set_layoutdriver(struct nfs_server *nfss)
45{
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
47 nfs4_fl_free_deviceid_callback);
48 if (status) {
49 printk(KERN_WARNING "%s: deviceid cache could not be "
50 "initialized\n", __func__);
51 return status;
52 }
53 dprintk("%s: deviceid cache has been initialized successfully\n",
54 __func__);
55 return 0;
56}
57
58/* Clear out the layout by destroying its device list */
59static int
60filelayout_clear_layoutdriver(struct nfs_server *nfss)
61{
62 dprintk("--> %s\n", __func__);
63
64 if (nfss->nfs_client->cl_devid_cache)
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0;
67}
68
69/*
70 * filelayout_check_layout()
71 *
72 * Make sure layout segment parameters are sane WRT the device.
73 * At this point no generic layer initialization of the lseg has occurred,
74 * and nothing has been added to the layout_hdr cache.
75 *
76 */
77static int
78filelayout_check_layout(struct pnfs_layout_hdr *lo,
79 struct nfs4_filelayout_segment *fl,
80 struct nfs4_layoutget_res *lgr,
81 struct nfs4_deviceid *id)
82{
83 struct nfs4_file_layout_dsaddr *dsaddr;
84 int status = -EINVAL;
85 struct nfs_server *nfss = NFS_SERVER(lo->inode);
86
87 dprintk("--> %s\n", __func__);
88
89 if (fl->pattern_offset > lgr->range.offset) {
 90 dprintk("%s pattern_offset %lld too large\n",
91 __func__, fl->pattern_offset);
92 goto out;
93 }
94
95 if (fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n",
97 __func__, fl->stripe_unit);
98 goto out;
99 }
100
101 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
103 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->inode, id);
105 if (dsaddr == NULL)
106 goto out;
107 }
108 fl->dsaddr = dsaddr;
109
110 if (fl->first_stripe_index < 0 ||
111 fl->first_stripe_index >= dsaddr->stripe_count) {
112 dprintk("%s Bad first_stripe_index %d\n",
113 __func__, fl->first_stripe_index);
114 goto out_put;
115 }
116
117 if ((fl->stripe_type == STRIPE_SPARSE &&
118 fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
119 (fl->stripe_type == STRIPE_DENSE &&
120 fl->num_fh != dsaddr->stripe_count)) {
121 dprintk("%s num_fh %u not valid for given packing\n",
122 __func__, fl->num_fh);
123 goto out_put;
124 }
125
126 if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
127 dprintk("%s Stripe unit (%u) not aligned with rsize %u "
128 "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
129 nfss->wsize);
130 }
131
132 status = 0;
133out:
 134 dprintk("<-- %s returns %d\n", __func__, status);
135 return status;
136out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
138 goto out;
139}
140
141static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
142{
143 int i;
144
145 for (i = 0; i < fl->num_fh; i++) {
146 if (!fl->fh_array[i])
147 break;
148 kfree(fl->fh_array[i]);
149 }
150 kfree(fl->fh_array);
151 fl->fh_array = NULL;
152}
153
154static void
155_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
156{
157 filelayout_free_fh_array(fl);
158 kfree(fl);
159}
160
161static int
162filelayout_decode_layout(struct pnfs_layout_hdr *flo,
163 struct nfs4_filelayout_segment *fl,
164 struct nfs4_layoutget_res *lgr,
165 struct nfs4_deviceid *id)
166{
167 uint32_t *p = (uint32_t *)lgr->layout.buf;
168 uint32_t nfl_util;
169 int i;
170
171 dprintk("%s: set_layout_map Begin\n", __func__);
172
173 memcpy(id, p, sizeof(*id));
174 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
175 print_deviceid(id);
176
177 nfl_util = be32_to_cpup(p++);
178 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
179 fl->commit_through_mds = 1;
180 if (nfl_util & NFL4_UFLG_DENSE)
181 fl->stripe_type = STRIPE_DENSE;
182 else
183 fl->stripe_type = STRIPE_SPARSE;
184 fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
185
186 fl->first_stripe_index = be32_to_cpup(p++);
187 p = xdr_decode_hyper(p, &fl->pattern_offset);
188 fl->num_fh = be32_to_cpup(p++);
189
190 dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
191 __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
192 fl->pattern_offset);
193
194 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
195 GFP_KERNEL);
196 if (!fl->fh_array)
197 return -ENOMEM;
198
199 for (i = 0; i < fl->num_fh; i++) {
200 /* Do we want to use a mempool here? */
201 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
202 if (!fl->fh_array[i]) {
203 filelayout_free_fh_array(fl);
204 return -ENOMEM;
205 }
206 fl->fh_array[i]->size = be32_to_cpup(p++);
207 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
 208 printk(KERN_ERR "fh %d too big: received %d bytes\n",
209 i, fl->fh_array[i]->size);
210 filelayout_free_fh_array(fl);
211 return -EIO;
212 }
213 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
214 p += XDR_QUADLEN(fl->fh_array[i]->size);
215 dprintk("DEBUG: %s: fh len %d\n", __func__,
216 fl->fh_array[i]->size);
217 }
218
219 return 0;
220}
221
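For reference, nfl_util packs the layout flags and the stripe unit into one word (RFC 5661 nfl_util4): the bits under NFL4_UFLG_MASK are flags, the remainder is the stripe unit in bytes. A worked example with an invented value:

	/*
	 * nfl_util = 0x00010001
	 *   NFL4_UFLG_DENSE (0x1) set        -> STRIPE_DENSE packing
	 *   NFL4_UFLG_COMMIT_THRU_MDS clear  -> commits go to the DSes
	 *   0x00010001 & ~NFL4_UFLG_MASK     -> stripe_unit = 64 KiB
	 */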
222static struct pnfs_layout_segment *
223filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
224 struct nfs4_layoutget_res *lgr)
225{
226 struct nfs4_filelayout_segment *fl;
227 int rc;
228 struct nfs4_deviceid id;
229
230 dprintk("--> %s\n", __func__);
231 fl = kzalloc(sizeof(*fl), GFP_KERNEL);
232 if (!fl)
233 return NULL;
234
235 rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
236 if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
237 _filelayout_free_lseg(fl);
238 return NULL;
239 }
240 return &fl->generic_hdr;
241}
242
243static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{
246 struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248
249 dprintk("--> %s\n", __func__);
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
251 &fl->dsaddr->deviceid);
252 _filelayout_free_lseg(fl);
253}
254
255static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver,
260 .clear_layoutdriver = filelayout_clear_layoutdriver,
261 .alloc_lseg = filelayout_alloc_lseg,
262 .free_lseg = filelayout_free_lseg,
263};
264
265static int __init nfs4filelayout_init(void)
266{
267 printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
268 __func__);
269 return pnfs_register_layoutdriver(&filelayout_type);
270}
271
272static void __exit nfs4filelayout_exit(void)
273{
274 printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
275 __func__);
276 pnfs_unregister_layoutdriver(&filelayout_type);
277}
278
279module_init(nfs4filelayout_init);
280module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 00000000000..bbf60dd2ab9
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
1/*
2 * NFSv4 file layout driver data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_NFS4FILELAYOUT_H
31#define FS_NFS_NFS4FILELAYOUT_H
32
33#include "pnfs.h"
34
35/*
 36 * Field testing shows we need to support up to 4096 stripe indices.
37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
38 * reasonable. This in turn means we support a maximum of 256
39 * RFC 5661 multipath_list4 structures.
40 */
41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
43
44enum stripetype4 {
45 STRIPE_SPARSE = 1,
46 STRIPE_DENSE = 2
47};
48
49/* Individual ip address */
50struct nfs4_pnfs_ds {
51 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
52 u32 ds_ip_addr;
53 u32 ds_port;
54 struct nfs_client *ds_clp;
55 atomic_t ds_count;
56};
57
58struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid;
60 u32 stripe_count;
61 u8 *stripe_indices;
62 u32 ds_num;
63 struct nfs4_pnfs_ds *ds_list[1];
64};
65
66struct nfs4_filelayout_segment {
67 struct pnfs_layout_segment generic_hdr;
68 u32 stripe_type;
69 u32 commit_through_mds;
70 u32 stripe_unit;
71 u32 first_stripe_index;
72 u64 pattern_offset;
 73 struct nfs4_file_layout_dsaddr *dsaddr; /* Points to GETDEVICEINFO data */
74 unsigned int num_fh;
75 struct nfs_fh **fh_array;
76};
77
78static inline struct nfs4_filelayout_segment *
79FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_filelayout_segment,
83 generic_hdr);
84}
85
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
87extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id);
89extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
91struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93
94#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 00000000000..51fe64ace55
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,448 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39/*
40 * Data server cache
41 *
42 * Data servers can be mapped to different device ids.
43 * nfs4_pnfs_ds reference counting
44 * - set to 1 on allocation
45 * - incremented when a device id maps a data server already in the cache.
 46 * - decremented when a deviceid is removed from the cache.
47 */
48DEFINE_SPINLOCK(nfs4_ds_cache_lock);
49static LIST_HEAD(nfs4_data_server_cache);
50
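Taken together, the counting rules above give each cached data server this lifecycle (a summary sketch, not code from this patch):

	/*
	 *   nfs4_pnfs_ds_add()       -> ds_count = 1 for a new entry, or
	 *                               atomic_inc() on a cache hit
	 *   nfs4_fl_free_deviceid()  -> atomic_dec_and_lock(); the final
	 *                               put unlinks the entry from
	 *                               nfs4_data_server_cache and calls
	 *                               destroy_ds()
	 */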
51/* Debug routines */
52void
53print_ds(struct nfs4_pnfs_ds *ds)
54{
55 if (ds == NULL) {
56 printk("%s NULL device\n", __func__);
57 return;
58 }
59 printk(" ip_addr %x port %hu\n"
60 " ref count %d\n"
61 " client %p\n"
62 " cl_exchange_flags %x\n",
63 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
64 atomic_read(&ds->ds_count), ds->ds_clp,
65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
66}
67
68void
69print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
70{
71 int i;
72
73 ifdebug(FACILITY) {
74 printk("%s dsaddr->ds_num %d\n", __func__,
75 dsaddr->ds_num);
76 for (i = 0; i < dsaddr->ds_num; i++)
77 print_ds(dsaddr->ds_list[i]);
78 }
79}
80
81void print_deviceid(struct nfs4_deviceid *id)
82{
83 u32 *p = (u32 *)id;
84
85 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
86 p[0], p[1], p[2], p[3]);
87}
88
89/* nfs4_ds_cache_lock is held */
90static struct nfs4_pnfs_ds *
91_data_server_lookup_locked(u32 ip_addr, u32 port)
92{
93 struct nfs4_pnfs_ds *ds;
94
 95 dprintk("%s: ip_addr=%x port=%hu\n", __func__,
96 ntohl(ip_addr), ntohs(port));
97
98 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
99 if (ds->ds_ip_addr == ip_addr &&
100 ds->ds_port == port) {
101 return ds;
102 }
103 }
104 return NULL;
105}
106
107static void
108destroy_ds(struct nfs4_pnfs_ds *ds)
109{
110 dprintk("--> %s\n", __func__);
111 ifdebug(FACILITY)
112 print_ds(ds);
113
114 if (ds->ds_clp)
115 nfs_put_client(ds->ds_clp);
116 kfree(ds);
117}
118
119static void
120nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
121{
122 struct nfs4_pnfs_ds *ds;
123 int i;
124
125 print_deviceid(&dsaddr->deviceid.de_id);
126
127 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i];
129 if (ds != NULL) {
130 if (atomic_dec_and_lock(&ds->ds_count,
131 &nfs4_ds_cache_lock)) {
132 list_del_init(&ds->ds_node);
133 spin_unlock(&nfs4_ds_cache_lock);
134 destroy_ds(ds);
135 }
136 }
137 }
138 kfree(dsaddr->stripe_indices);
139 kfree(dsaddr);
140}
141
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{
154 struct nfs4_pnfs_ds *tmp_ds, *ds;
155
156 ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
157 if (!ds)
158 goto out;
159
160 spin_lock(&nfs4_ds_cache_lock);
161 tmp_ds = _data_server_lookup_locked(ip_addr, port);
162 if (tmp_ds == NULL) {
163 ds->ds_ip_addr = ip_addr;
164 ds->ds_port = port;
165 atomic_set(&ds->ds_count, 1);
166 INIT_LIST_HEAD(&ds->ds_node);
167 ds->ds_clp = NULL;
168 list_add(&ds->ds_node, &nfs4_data_server_cache);
169 dprintk("%s add new data server ip 0x%x\n", __func__,
170 ds->ds_ip_addr);
171 } else {
172 kfree(ds);
173 atomic_inc(&tmp_ds->ds_count);
174 dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
175 __func__, tmp_ds->ds_ip_addr,
176 atomic_read(&tmp_ds->ds_count));
177 ds = tmp_ds;
178 }
179 spin_unlock(&nfs4_ds_cache_lock);
180out:
181 return ds;
182}
183
184/*
185 * Currently only support ipv4, and one multi-path address.
186 */
187static struct nfs4_pnfs_ds *
188decode_and_add_ds(__be32 **pp, struct inode *inode)
189{
190 struct nfs4_pnfs_ds *ds = NULL;
191 char *buf;
192 const char *ipend, *pstr;
193 u32 ip_addr, port;
194 int nlen, rlen, i;
195 int tmp[2];
196 __be32 *r_netid, *r_addr, *p = *pp;
197
198 /* r_netid */
199 nlen = be32_to_cpup(p++);
200 r_netid = p;
201 p += XDR_QUADLEN(nlen);
202
203 /* r_addr */
204 rlen = be32_to_cpup(p++);
205 r_addr = p;
206 p += XDR_QUADLEN(rlen);
207 *pp = p;
208
209 /* Check that netid is "tcp" */
210 if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) {
211 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
212 goto out_err;
213 }
214
 215 /* an address up to ipv6 length plus a port is legal */
216 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s Invalid address, length %d\n", __func__,
218 rlen);
219 goto out_err;
220 }
 221 buf = kmalloc(rlen + 1, GFP_KERNEL);
 if (!buf)
 goto out_err;
 222 buf[rlen] = '\0';
 223 memcpy(buf, r_addr, rlen);
224
 225 /* replace the port dots with dashes for the in4_pton() delimiter */
226 for (i = 0; i < 2; i++) {
227 char *res = strrchr(buf, '.');
228 *res = '-';
229 }
230
231 /* Currently only support ipv4 address */
232 if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
233 dprintk("%s: Only ipv4 addresses supported\n", __func__);
234 goto out_free;
235 }
236
237 /* port */
238 pstr = ipend;
239 sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
240 port = htons((tmp[0] << 8) | (tmp[1]));
241
242 ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
243 dprintk("%s Decoded address and port %s\n", __func__, buf);
244out_free:
245 kfree(buf);
246out_err:
247 return ds;
248}
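The r_addr handled above is the universal-address form "h1.h2.h3.h4.p1.p2", where the two trailing fields carry the port's high and low bytes. A worked example with invented values:

	/*
	 * r_addr = "192.168.1.5.8.1"
	 * after the dot-to-dash rewrite:  "192.168.1.5-8-1"
	 * in4_pton() consumes "192.168.1.5"; ipend points at "-8-1"
	 * sscanf() yields tmp[0] = 8, tmp[1] = 1
	 * port = (8 << 8) | 1 = 2049, stored network-order via htons()
	 */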
249
250/* Decode opaque device data and return the result */
251static struct nfs4_file_layout_dsaddr*
252decode_device(struct inode *ino, struct pnfs_device *pdev)
253{
254 int i, dummy;
255 u32 cnt, num;
256 u8 *indexp;
257 __be32 *p = (__be32 *)pdev->area, *indicesp;
258 struct nfs4_file_layout_dsaddr *dsaddr;
259
260 /* Get the stripe count (number of stripe index) */
261 cnt = be32_to_cpup(p++);
262 dprintk("%s stripe count %d\n", __func__, cnt);
263 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
264 printk(KERN_WARNING "%s: stripe count %d greater than "
265 "supported maximum %d\n", __func__,
266 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
267 goto out_err;
268 }
269
270 /* Check the multipath list count */
271 indicesp = p;
272 p += XDR_QUADLEN(cnt << 2);
273 num = be32_to_cpup(p++);
274 dprintk("%s ds_num %u\n", __func__, num);
275 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
276 printk(KERN_WARNING "%s: multipath count %d greater than "
277 "supported maximum %d\n", __func__,
278 num, NFS4_PNFS_MAX_MULTI_CNT);
279 goto out_err;
280 }
281 dsaddr = kzalloc(sizeof(*dsaddr) +
282 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
283 GFP_KERNEL);
284 if (!dsaddr)
285 goto out_err;
286
287 dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
288 if (!dsaddr->stripe_indices)
289 goto out_err_free;
290
291 dsaddr->stripe_count = cnt;
292 dsaddr->ds_num = num;
293
294 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
295
 296 /* Go back and read stripe indices */
297 p = indicesp;
298 indexp = &dsaddr->stripe_indices[0];
299 for (i = 0; i < dsaddr->stripe_count; i++) {
300 *indexp = be32_to_cpup(p++);
301 if (*indexp >= num)
302 goto out_err_free;
303 indexp++;
304 }
305 /* Skip already read multipath list count */
306 p++;
307
308 for (i = 0; i < dsaddr->ds_num; i++) {
309 int j;
310
311 dummy = be32_to_cpup(p++); /* multipath count */
312 if (dummy > 1) {
313 printk(KERN_WARNING
314 "%s: Multipath count %d not supported, "
315 "skipping all greater than 1\n", __func__,
316 dummy);
317 }
318 for (j = 0; j < dummy; j++) {
319 if (j == 0) {
320 dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
321 if (dsaddr->ds_list[i] == NULL)
322 goto out_err_free;
323 } else {
324 u32 len;
325 /* skip extra multipath */
326 len = be32_to_cpup(p++);
327 p += XDR_QUADLEN(len);
328 len = be32_to_cpup(p++);
329 p += XDR_QUADLEN(len);
330 continue;
331 }
332 }
333 }
334 return dsaddr;
335
336out_err_free:
337 nfs4_fl_free_deviceid(dsaddr);
338out_err:
339 dprintk("%s ERROR: returning NULL\n", __func__);
340 return NULL;
341}
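For orientation, the opaque GETDEVICEINFO body consumed above has this shape; the values in this sketch are invented:

	/*
	 * stripe index count = 4:  indices 0 1 0 1
	 * multipath count    = 2:  two multipath_list4 entries, each with
	 *                          one netaddr4 (netid "tcp",
	 *                          r_addr "a.b.c.d.p1.p2")
	 * Stripe units then alternate between ds_list[0] and ds_list[1].
	 */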
342
343/*
344 * Decode the opaque device specified in 'dev'
345 * and add it to the list of available devices.
 346 * If the deviceid is already cached, pnfs_add_deviceid will return
 347 * a pointer to the cached struct and throw away the new one.
348 */
349static struct nfs4_file_layout_dsaddr*
350decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
351{
352 struct nfs4_file_layout_dsaddr *dsaddr;
353 struct pnfs_deviceid_node *d;
354
355 dsaddr = decode_device(inode, dev);
356 if (!dsaddr) {
357 printk(KERN_WARNING "%s: Could not decode or add device\n",
358 __func__);
359 return NULL;
360 }
361
362 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
363 &dsaddr->deviceid);
364
365 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
366}
367
368/*
369 * Retrieve the information for dev_id, add it to the list
370 * of available devices, and return it.
371 */
372struct nfs4_file_layout_dsaddr *
373get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
374{
375 struct pnfs_device *pdev = NULL;
376 u32 max_resp_sz;
377 int max_pages;
378 struct page **pages = NULL;
379 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
380 int rc, i;
381 struct nfs_server *server = NFS_SERVER(inode);
382
383 /*
384 * Use the session max response size as the basis for setting
385 * GETDEVICEINFO's maxcount
386 */
387 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
388 max_pages = max_resp_sz >> PAGE_SHIFT;
389 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
390 __func__, inode, max_resp_sz, max_pages);
391
392 pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
393 if (pdev == NULL)
394 return NULL;
395
396 pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
397 if (pages == NULL) {
398 kfree(pdev);
399 return NULL;
400 }
401 for (i = 0; i < max_pages; i++) {
402 pages[i] = alloc_page(GFP_KERNEL);
403 if (!pages[i])
404 goto out_free;
405 }
406
407 /* set pdev->area */
408 pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
409 if (!pdev->area)
410 goto out_free;
411
412 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
413 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
414 pdev->pages = pages;
415 pdev->pgbase = 0;
416 pdev->pglen = PAGE_SIZE * max_pages;
417 pdev->mincount = 0;
418
419 rc = nfs4_proc_getdeviceinfo(server, pdev);
420 dprintk("%s getdevice info returns %d\n", __func__, rc);
421 if (rc)
422 goto out_free;
423
424 /*
425 * Found new device, need to decode it and then add it to the
426 * list of known devices for this mountpoint.
427 */
428 dsaddr = decode_and_add_device(inode, pdev);
429out_free:
430 if (pdev->area != NULL)
431 vunmap(pdev->area);
432 for (i = 0; i < max_pages; i++)
433 __free_page(pages[i]);
434 kfree(pages);
435 kfree(pdev);
436 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
437 return dsaddr;
438}
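The vmap() above is what lets decode_device() treat the reply as flat memory: the RPC layer scatters the GETDEVICEINFO response across individual pages, and mapping them into one contiguous virtual range spares the decoder from handling page boundaries mid-field.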
439
440struct nfs4_file_layout_dsaddr *
441nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
442{
443 struct pnfs_deviceid_node *d;
444
445 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
446 return (d == NULL) ? NULL :
447 container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
448}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 70015dd60a9..0f24cdf2cb1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -55,6 +55,7 @@
55#include "internal.h" 55#include "internal.h"
56#include "iostat.h" 56#include "iostat.h"
57#include "callback.h" 57#include "callback.h"
58#include "pnfs.h"
58 59
59#define NFSDBG_FACILITY NFSDBG_PROC 60#define NFSDBG_FACILITY NFSDBG_PROC
60 61
@@ -129,7 +130,8 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
129 | FATTR4_WORD0_MAXREAD 130 | FATTR4_WORD0_MAXREAD
130 | FATTR4_WORD0_MAXWRITE 131 | FATTR4_WORD0_MAXWRITE
131 | FATTR4_WORD0_LEASE_TIME, 132 | FATTR4_WORD0_LEASE_TIME,
132 0 133 FATTR4_WORD1_TIME_DELTA
134 | FATTR4_WORD1_FS_LAYOUT_TYPES
133}; 135};
134 136
135const u32 nfs4_fs_locations_bitmap[2] = { 137const u32 nfs4_fs_locations_bitmap[2] = {
@@ -255,9 +257,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
255 nfs4_state_mark_reclaim_nograce(clp, state); 257 nfs4_state_mark_reclaim_nograce(clp, state);
256 goto do_state_recovery; 258 goto do_state_recovery;
257 case -NFS4ERR_STALE_STATEID: 259 case -NFS4ERR_STALE_STATEID:
258 if (state == NULL)
259 break;
260 nfs4_state_mark_reclaim_reboot(clp, state);
261 case -NFS4ERR_STALE_CLIENTID: 260 case -NFS4ERR_STALE_CLIENTID:
262 case -NFS4ERR_EXPIRED: 261 case -NFS4ERR_EXPIRED:
263 goto do_state_recovery; 262 goto do_state_recovery;
@@ -303,15 +302,19 @@ do_state_recovery:
303} 302}
304 303
305 304
306static void renew_lease(const struct nfs_server *server, unsigned long timestamp) 305static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
307{ 306{
308 struct nfs_client *clp = server->nfs_client;
309 spin_lock(&clp->cl_lock); 307 spin_lock(&clp->cl_lock);
310 if (time_before(clp->cl_last_renewal,timestamp)) 308 if (time_before(clp->cl_last_renewal,timestamp))
311 clp->cl_last_renewal = timestamp; 309 clp->cl_last_renewal = timestamp;
312 spin_unlock(&clp->cl_lock); 310 spin_unlock(&clp->cl_lock);
313} 311}
314 312
313static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
314{
315 do_renew_lease(server->nfs_client, timestamp);
316}
317
315#if defined(CONFIG_NFS_V4_1) 318#if defined(CONFIG_NFS_V4_1)
316 319
317/* 320/*
@@ -330,10 +333,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
330 * Must be called while holding tbl->slot_tbl_lock 333 * Must be called while holding tbl->slot_tbl_lock
331 */ 334 */
332static void 335static void
333nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) 336nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
334{ 337{
338 int free_slotid = free_slot - tbl->slots;
335 int slotid = free_slotid; 339 int slotid = free_slotid;
336 340
341 BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
337 /* clear used bit in bitmap */ 342 /* clear used bit in bitmap */
338 __clear_bit(slotid, tbl->used_slots); 343 __clear_bit(slotid, tbl->used_slots);
339 344
@@ -356,7 +361,7 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
356{ 361{
357 struct rpc_task *task; 362 struct rpc_task *task;
358 363
359 if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { 364 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
360 task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); 365 task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
361 if (task) 366 if (task)
362 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); 367 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
@@ -370,13 +375,12 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
370 complete(&ses->complete); 375 complete(&ses->complete);
371} 376}
372 377
373static void nfs41_sequence_free_slot(const struct nfs_client *clp, 378static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
374 struct nfs4_sequence_res *res)
375{ 379{
376 struct nfs4_slot_table *tbl; 380 struct nfs4_slot_table *tbl;
377 381
378 tbl = &clp->cl_session->fc_slot_table; 382 tbl = &res->sr_session->fc_slot_table;
379 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { 383 if (!res->sr_slot) {
380 /* just wake up the next guy waiting since 384 /* just wake up the next guy waiting since
381 * we may have not consumed a slot after all */ 385 * we may have not consumed a slot after all */
382 dprintk("%s: No slot\n", __func__); 386 dprintk("%s: No slot\n", __func__);
@@ -384,19 +388,16 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp,
384 } 388 }
385 389
386 spin_lock(&tbl->slot_tbl_lock); 390 spin_lock(&tbl->slot_tbl_lock);
387 nfs4_free_slot(tbl, res->sr_slotid); 391 nfs4_free_slot(tbl, res->sr_slot);
388 nfs41_check_drain_session_complete(clp->cl_session); 392 nfs41_check_drain_session_complete(res->sr_session);
389 spin_unlock(&tbl->slot_tbl_lock); 393 spin_unlock(&tbl->slot_tbl_lock);
390 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 394 res->sr_slot = NULL;
391} 395}
392 396
393static void nfs41_sequence_done(struct nfs_client *clp, 397static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
394 struct nfs4_sequence_res *res,
395 int rpc_status)
396{ 398{
397 unsigned long timestamp; 399 unsigned long timestamp;
398 struct nfs4_slot_table *tbl; 400 struct nfs_client *clp;
399 struct nfs4_slot *slot;
400 401
401 /* 402 /*
402 * sr_status remains 1 if an RPC level error occurred. The server 403 * sr_status remains 1 if an RPC level error occurred. The server
@@ -408,28 +409,53 @@ static void nfs41_sequence_done(struct nfs_client *clp,
408 res->sr_status = NFS_OK; 409 res->sr_status = NFS_OK;
409 410
410 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */ 411 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
411 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) 412 if (!res->sr_slot)
412 goto out; 413 goto out;
413 414
414 /* Check the SEQUENCE operation status */ 415 /* Check the SEQUENCE operation status */
415 if (res->sr_status == 0) { 416 switch (res->sr_status) {
416 tbl = &clp->cl_session->fc_slot_table; 417 case 0:
417 slot = tbl->slots + res->sr_slotid;
418 /* Update the slot's sequence and clientid lease timer */ 418 /* Update the slot's sequence and clientid lease timer */
419 ++slot->seq_nr; 419 ++res->sr_slot->seq_nr;
420 timestamp = res->sr_renewal_time; 420 timestamp = res->sr_renewal_time;
421 spin_lock(&clp->cl_lock); 421 clp = res->sr_session->clp;
422 if (time_before(clp->cl_last_renewal, timestamp)) 422 do_renew_lease(clp, timestamp);
423 clp->cl_last_renewal = timestamp;
424 spin_unlock(&clp->cl_lock);
425 /* Check sequence flags */ 423 /* Check sequence flags */
426 if (atomic_read(&clp->cl_count) > 1) 424 if (atomic_read(&clp->cl_count) > 1)
427 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 425 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
426 break;
427 case -NFS4ERR_DELAY:
428 /* The server detected a resend of the RPC call and
429 * returned NFS4ERR_DELAY as per Section 2.10.6.2
430 * of RFC5661.
431 */
432 dprintk("%s: slot=%td seq=%d: Operation in progress\n",
433 __func__,
434 res->sr_slot - res->sr_session->fc_slot_table.slots,
435 res->sr_slot->seq_nr);
436 goto out_retry;
437 default:
438 /* Just update the slot sequence no. */
439 ++res->sr_slot->seq_nr;
428 } 440 }
429out: 441out:
430 /* The session may be reset by one of the error handlers. */ 442 /* The session may be reset by one of the error handlers. */
431 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); 443 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
432 nfs41_sequence_free_slot(clp, res); 444 nfs41_sequence_free_slot(res);
445 return 1;
446out_retry:
447 if (!rpc_restart_call(task))
448 goto out;
449 rpc_delay(task, NFS4_POLL_RETRY_MAX);
450 return 0;
451}
452
453static int nfs4_sequence_done(struct rpc_task *task,
454 struct nfs4_sequence_res *res)
455{
456 if (res->sr_session == NULL)
457 return 1;
458 return nfs41_sequence_done(task, res);
433} 459}
434 460
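Returning an int turns the sequence helper into a gate for RPC completion callbacks: a 0 return means the task has been restarted (the NFS4ERR_DELAY path above) and the caller must not touch the reply. A hedged sketch of the expected caller shape, with names invented:

	struct example_res {
		struct nfs4_sequence_res seq_res;
		/* ...operation-specific results... */
	};

	static void example_call_done(struct rpc_task *task, void *calldata)
	{
		struct example_res *res = calldata;

		if (!nfs4_sequence_done(task, &res->seq_res))
			return;	/* restarted via rpc_delay(); retry later */
		/* ...decode results; the slot has already been freed... */
	}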
435/* 461/*
@@ -477,15 +503,13 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
477 503
478 dprintk("--> %s\n", __func__); 504 dprintk("--> %s\n", __func__);
479 /* slot already allocated? */ 505 /* slot already allocated? */
480 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) 506 if (res->sr_slot != NULL)
481 return 0; 507 return 0;
482 508
483 memset(res, 0, sizeof(*res));
484 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
485 tbl = &session->fc_slot_table; 509 tbl = &session->fc_slot_table;
486 510
487 spin_lock(&tbl->slot_tbl_lock); 511 spin_lock(&tbl->slot_tbl_lock);
488 if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && 512 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
489 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { 513 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
490 /* 514 /*
491 * The state manager will wait until the slot table is empty. 515 * The state manager will wait until the slot table is empty.
@@ -523,8 +547,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
523 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); 547 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
524 548
525 res->sr_session = session; 549 res->sr_session = session;
526 res->sr_slotid = slotid; 550 res->sr_slot = slot;
527 res->sr_renewal_time = jiffies; 551 res->sr_renewal_time = jiffies;
552 res->sr_status_flags = 0;
528 /* 553 /*
529 * sr_status is only set in decode_sequence, and so will remain 554 * sr_status is only set in decode_sequence, and so will remain
530 * set to 1 if an rpc level failure occurs. 555 * set to 1 if an rpc level failure occurs.
@@ -533,33 +558,34 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
533 return 0; 558 return 0;
534} 559}
535 560
536int nfs4_setup_sequence(struct nfs_client *clp, 561int nfs4_setup_sequence(const struct nfs_server *server,
537 struct nfs4_sequence_args *args, 562 struct nfs4_sequence_args *args,
538 struct nfs4_sequence_res *res, 563 struct nfs4_sequence_res *res,
539 int cache_reply, 564 int cache_reply,
540 struct rpc_task *task) 565 struct rpc_task *task)
541{ 566{
567 struct nfs4_session *session = nfs4_get_session(server);
542 int ret = 0; 568 int ret = 0;
543 569
544 dprintk("--> %s clp %p session %p sr_slotid %d\n", 570 if (session == NULL) {
545 __func__, clp, clp->cl_session, res->sr_slotid); 571 args->sa_session = NULL;
546 572 res->sr_session = NULL;
547 if (!nfs4_has_session(clp))
548 goto out; 573 goto out;
549 ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
550 task);
551 if (ret && ret != -EAGAIN) {
552 /* terminate rpc task */
553 task->tk_status = ret;
554 task->tk_action = NULL;
555 } 574 }
575
576 dprintk("--> %s clp %p session %p sr_slot %td\n",
577 __func__, session->clp, session, res->sr_slot ?
578 res->sr_slot - session->fc_slot_table.slots : -1);
579
580 ret = nfs41_setup_sequence(session, args, res, cache_reply,
581 task);
556out: 582out:
557 dprintk("<-- %s status=%d\n", __func__, ret); 583 dprintk("<-- %s status=%d\n", __func__, ret);
558 return ret; 584 return ret;
559} 585}
560 586
561struct nfs41_call_sync_data { 587struct nfs41_call_sync_data {
562 struct nfs_client *clp; 588 const struct nfs_server *seq_server;
563 struct nfs4_sequence_args *seq_args; 589 struct nfs4_sequence_args *seq_args;
564 struct nfs4_sequence_res *seq_res; 590 struct nfs4_sequence_res *seq_res;
565 int cache_reply; 591 int cache_reply;
@@ -569,9 +595,9 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
569{ 595{
570 struct nfs41_call_sync_data *data = calldata; 596 struct nfs41_call_sync_data *data = calldata;
571 597
572 dprintk("--> %s data->clp->cl_session %p\n", __func__, 598 dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
573 data->clp->cl_session); 599
574 if (nfs4_setup_sequence(data->clp, data->seq_args, 600 if (nfs4_setup_sequence(data->seq_server, data->seq_args,
575 data->seq_res, data->cache_reply, task)) 601 data->seq_res, data->cache_reply, task))
576 return; 602 return;
577 rpc_call_start(task); 603 rpc_call_start(task);
@@ -587,7 +613,7 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
587{ 613{
588 struct nfs41_call_sync_data *data = calldata; 614 struct nfs41_call_sync_data *data = calldata;
589 615
590 nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); 616 nfs41_sequence_done(task, data->seq_res);
591} 617}
592 618
593struct rpc_call_ops nfs41_call_sync_ops = { 619struct rpc_call_ops nfs41_call_sync_ops = {
@@ -600,8 +626,7 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = {
600 .rpc_call_done = nfs41_call_sync_done, 626 .rpc_call_done = nfs41_call_sync_done,
601}; 627};
602 628
603static int nfs4_call_sync_sequence(struct nfs_client *clp, 629static int nfs4_call_sync_sequence(struct nfs_server *server,
604 struct rpc_clnt *clnt,
605 struct rpc_message *msg, 630 struct rpc_message *msg,
606 struct nfs4_sequence_args *args, 631 struct nfs4_sequence_args *args,
607 struct nfs4_sequence_res *res, 632 struct nfs4_sequence_res *res,
@@ -611,19 +636,19 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp,
611 int ret; 636 int ret;
612 struct rpc_task *task; 637 struct rpc_task *task;
613 struct nfs41_call_sync_data data = { 638 struct nfs41_call_sync_data data = {
614 .clp = clp, 639 .seq_server = server,
615 .seq_args = args, 640 .seq_args = args,
616 .seq_res = res, 641 .seq_res = res,
617 .cache_reply = cache_reply, 642 .cache_reply = cache_reply,
618 }; 643 };
619 struct rpc_task_setup task_setup = { 644 struct rpc_task_setup task_setup = {
620 .rpc_client = clnt, 645 .rpc_client = server->client,
621 .rpc_message = msg, 646 .rpc_message = msg,
622 .callback_ops = &nfs41_call_sync_ops, 647 .callback_ops = &nfs41_call_sync_ops,
623 .callback_data = &data 648 .callback_data = &data
624 }; 649 };
625 650
626 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 651 res->sr_slot = NULL;
627 if (privileged) 652 if (privileged)
628 task_setup.callback_ops = &nfs41_call_priv_sync_ops; 653 task_setup.callback_ops = &nfs41_call_priv_sync_ops;
629 task = rpc_run_task(&task_setup); 654 task = rpc_run_task(&task_setup);
@@ -642,10 +667,15 @@ int _nfs4_call_sync_session(struct nfs_server *server,
642 struct nfs4_sequence_res *res, 667 struct nfs4_sequence_res *res,
643 int cache_reply) 668 int cache_reply)
644{ 669{
645 return nfs4_call_sync_sequence(server->nfs_client, server->client, 670 return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0);
646 msg, args, res, cache_reply, 0);
647} 671}
648 672
673#else
674static int nfs4_sequence_done(struct rpc_task *task,
675 struct nfs4_sequence_res *res)
676{
677 return 1;
678}
649#endif /* CONFIG_NFS_V4_1 */ 679#endif /* CONFIG_NFS_V4_1 */
650 680
651int _nfs4_call_sync(struct nfs_server *server, 681int _nfs4_call_sync(struct nfs_server *server,
@@ -659,18 +689,9 @@ int _nfs4_call_sync(struct nfs_server *server,
659} 689}
660 690
661#define nfs4_call_sync(server, msg, args, res, cache_reply) \ 691#define nfs4_call_sync(server, msg, args, res, cache_reply) \
662 (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ 692 (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \
663 &(res)->seq_res, (cache_reply)) 693 &(res)->seq_res, (cache_reply))
664 694
665static void nfs4_sequence_done(const struct nfs_server *server,
666 struct nfs4_sequence_res *res, int rpc_status)
667{
668#ifdef CONFIG_NFS_V4_1
669 if (nfs4_has_session(server->nfs_client))
670 nfs41_sequence_done(server->nfs_client, res, rpc_status);
671#endif /* CONFIG_NFS_V4_1 */
672}
673
674static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 695static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
675{ 696{
676 struct nfs_inode *nfsi = NFS_I(dir); 697 struct nfs_inode *nfsi = NFS_I(dir);
@@ -712,7 +733,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
712 p->o_res.server = p->o_arg.server; 733 p->o_res.server = p->o_arg.server;
713 nfs_fattr_init(&p->f_attr); 734 nfs_fattr_init(&p->f_attr);
714 nfs_fattr_init(&p->dir_attr); 735 nfs_fattr_init(&p->dir_attr);
715 p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
716} 736}
717 737
718static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 738static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -745,19 +765,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
745 p->o_arg.server = server; 765 p->o_arg.server = server;
746 p->o_arg.bitmask = server->attr_bitmask; 766 p->o_arg.bitmask = server->attr_bitmask;
747 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; 767 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
748 if (flags & O_EXCL) { 768 if (flags & O_CREAT) {
749 if (nfs4_has_persistent_session(server->nfs_client)) { 769 u32 *s;
750 /* GUARDED */ 770
751 p->o_arg.u.attrs = &p->attrs;
752 memcpy(&p->attrs, attrs, sizeof(p->attrs));
753 } else { /* EXCLUSIVE4_1 */
754 u32 *s = (u32 *) p->o_arg.u.verifier.data;
755 s[0] = jiffies;
756 s[1] = current->pid;
757 }
758 } else if (flags & O_CREAT) {
759 p->o_arg.u.attrs = &p->attrs; 771 p->o_arg.u.attrs = &p->attrs;
760 memcpy(&p->attrs, attrs, sizeof(p->attrs)); 772 memcpy(&p->attrs, attrs, sizeof(p->attrs));
773 s = (u32 *) p->o_arg.u.verifier.data;
774 s[0] = jiffies;
775 s[1] = current->pid;
761 } 776 }
762 p->c_arg.fh = &p->o_res.fh; 777 p->c_arg.fh = &p->o_res.fh;
763 p->c_arg.stateid = &p->o_res.stateid; 778 p->c_arg.stateid = &p->o_res.stateid;
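After this hunk every O_CREAT open carries both the requested attributes and a create verifier; the separate GUARDED vs. EXCLUSIVE4_1 branches for O_EXCL are gone from this allocation path. The verifier is the standard NFS exclusive-create cookie: an opaque value the server stores with the new file so a retransmitted CREATE can be told apart from a genuine conflict. A sketch of the packing used here, assuming (as the cast implies) the verifier data is at least two u32s wide:

	u32 *s = (u32 *) p->o_arg.u.verifier.data;
	s[0] = jiffies;		/* low bits of the tick counter */
	s[1] = current->pid;	/* creating process */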
@@ -1102,6 +1117,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1102 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1117 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1103 smp_rmb(); 1118 smp_rmb();
1104 if (state->n_rdwr != 0) { 1119 if (state->n_rdwr != 0) {
1120 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1105 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); 1121 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
1106 if (ret != 0) 1122 if (ret != 0)
1107 return ret; 1123 return ret;
@@ -1109,6 +1125,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1109 return -ESTALE; 1125 return -ESTALE;
1110 } 1126 }
1111 if (state->n_wronly != 0) { 1127 if (state->n_wronly != 0) {
1128 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1112 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); 1129 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
1113 if (ret != 0) 1130 if (ret != 0)
1114 return ret; 1131 return ret;
@@ -1116,6 +1133,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1116 return -ESTALE; 1133 return -ESTALE;
1117 } 1134 }
1118 if (state->n_rdonly != 0) { 1135 if (state->n_rdonly != 0) {
1136 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1119 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); 1137 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
1120 if (ret != 0) 1138 if (ret != 0)
1121 return ret; 1139 return ret;
@@ -1170,7 +1188,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
1170 int err; 1188 int err;
1171 do { 1189 do {
1172 err = _nfs4_do_open_reclaim(ctx, state); 1190 err = _nfs4_do_open_reclaim(ctx, state);
1173 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 1191 if (err != -NFS4ERR_DELAY)
1174 break; 1192 break;
1175 nfs4_handle_exception(server, err, &exception); 1193 nfs4_handle_exception(server, err, &exception);
1176 } while (exception.retry); 1194 } while (exception.retry);
@@ -1240,6 +1258,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1240 case -NFS4ERR_ADMIN_REVOKED: 1258 case -NFS4ERR_ADMIN_REVOKED:
1241 case -NFS4ERR_BAD_STATEID: 1259 case -NFS4ERR_BAD_STATEID:
1242 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1260 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
1261 case -EKEYEXPIRED:
1262 /*
1263 * User RPCSEC_GSS context has expired.
1264 * We cannot recover this stateid now, so
1265 * skip it and allow the recovery thread to

1266 * proceed.
1267 */
1243 case -ENOMEM: 1268 case -ENOMEM:
1244 err = 0; 1269 err = 0;
1245 goto out; 1270 goto out;
@@ -1255,8 +1280,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
1255 struct nfs4_opendata *data = calldata; 1280 struct nfs4_opendata *data = calldata;
1256 1281
1257 data->rpc_status = task->tk_status; 1282 data->rpc_status = task->tk_status;
1258 if (RPC_ASSASSINATED(task))
1259 return;
1260 if (data->rpc_status == 0) { 1283 if (data->rpc_status == 0) {
1261 memcpy(data->o_res.stateid.data, data->c_res.stateid.data, 1284 memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
1262 sizeof(data->o_res.stateid.data)); 1285 sizeof(data->o_res.stateid.data));
@@ -1356,13 +1379,13 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
1356 } 1379 }
1357 /* Update sequence id. */ 1380 /* Update sequence id. */
1358 data->o_arg.id = sp->so_owner_id.id; 1381 data->o_arg.id = sp->so_owner_id.id;
1359 data->o_arg.clientid = sp->so_client->cl_clientid; 1382 data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
1360 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { 1383 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
1361 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; 1384 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
1362 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); 1385 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
1363 } 1386 }
1364 data->timestamp = jiffies; 1387 data->timestamp = jiffies;
1365 if (nfs4_setup_sequence(data->o_arg.server->nfs_client, 1388 if (nfs4_setup_sequence(data->o_arg.server,
1366 &data->o_arg.seq_args, 1389 &data->o_arg.seq_args,
1367 &data->o_res.seq_res, 1, task)) 1390 &data->o_res.seq_res, 1, task))
1368 return; 1391 return;
@@ -1385,11 +1408,9 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
1385 1408
1386 data->rpc_status = task->tk_status; 1409 data->rpc_status = task->tk_status;
1387 1410
1388 nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, 1411 if (!nfs4_sequence_done(task, &data->o_res.seq_res))
1389 task->tk_status);
1390
1391 if (RPC_ASSASSINATED(task))
1392 return; 1412 return;
1413
1393 if (task->tk_status == 0) { 1414 if (task->tk_status == 0) {
1394 switch (data->o_res.f_attr->mode & S_IFMT) { 1415 switch (data->o_res.f_attr->mode & S_IFMT) {
1395 case S_IFREG: 1416 case S_IFREG:
@@ -1591,7 +1612,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
1591 goto out; 1612 goto out;
1592 case -NFS4ERR_GRACE: 1613 case -NFS4ERR_GRACE:
1593 case -NFS4ERR_DELAY: 1614 case -NFS4ERR_DELAY:
1594 case -EKEYEXPIRED:
1595 nfs4_handle_exception(server, err, &exception); 1615 nfs4_handle_exception(server, err, &exception);
1596 err = 0; 1616 err = 0;
1597 } 1617 }
@@ -1773,7 +1793,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1773 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { 1793 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
1774 /* Use that stateid */ 1794 /* Use that stateid */
1775 } else if (state != NULL) { 1795 } else if (state != NULL) {
1776 nfs4_copy_stateid(&arg.stateid, state, current->files); 1796 nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);
1777 } else 1797 } else
1778 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1798 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
1779 1799
@@ -1838,8 +1858,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1838 struct nfs4_state *state = calldata->state; 1858 struct nfs4_state *state = calldata->state;
1839 struct nfs_server *server = NFS_SERVER(calldata->inode); 1859 struct nfs_server *server = NFS_SERVER(calldata->inode);
1840 1860
1841 nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); 1861 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
1842 if (RPC_ASSASSINATED(task))
1843 return; 1862 return;
1844 /* hmm. we are done with the inode, and in the process of freeing 1863 /* hmm. we are done with the inode, and in the process of freeing
1845 * the state_owner. we keep this around to process errors 1864 * the state_owner. we keep this around to process errors
@@ -1903,7 +1922,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1903 1922
1904 nfs_fattr_init(calldata->res.fattr); 1923 nfs_fattr_init(calldata->res.fattr);
1905 calldata->timestamp = jiffies; 1924 calldata->timestamp = jiffies;
1906 if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, 1925 if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
1907 &calldata->arg.seq_args, &calldata->res.seq_res, 1926 &calldata->arg.seq_args, &calldata->res.seq_res,
1908 1, task)) 1927 1, task))
1909 return; 1928 return;
@@ -1962,7 +1981,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1962 calldata->res.fattr = &calldata->fattr; 1981 calldata->res.fattr = &calldata->fattr;
1963 calldata->res.seqid = calldata->arg.seqid; 1982 calldata->res.seqid = calldata->arg.seqid;
1964 calldata->res.server = server; 1983 calldata->res.server = server;
1965 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
1966 path_get(path); 1984 path_get(path);
1967 calldata->path = *path; 1985 calldata->path = *path;
1968 1986
@@ -1985,118 +2003,17 @@ out:
1985 return status; 2003 return status;
1986} 2004}
1987 2005
1988static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode) 2006static struct inode *
2007nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
1989{ 2008{
1990 struct file *filp;
1991 int ret;
1992
1993 /* If the open_intent is for execute, we have an extra check to make */
1994 if (fmode & FMODE_EXEC) {
1995 ret = nfs_may_open(state->inode,
1996 state->owner->so_cred,
1997 nd->intent.open.flags);
1998 if (ret < 0)
1999 goto out_close;
2000 }
2001 filp = lookup_instantiate_filp(nd, path->dentry, NULL);
2002 if (!IS_ERR(filp)) {
2003 struct nfs_open_context *ctx;
2004 ctx = nfs_file_open_context(filp);
2005 ctx->state = state;
2006 return 0;
2007 }
2008 ret = PTR_ERR(filp);
2009out_close:
2010 nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
2011 return ret;
2012}
2013
2014struct dentry *
2015nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2016{
2017 struct path path = {
2018 .mnt = nd->path.mnt,
2019 .dentry = dentry,
2020 };
2021 struct dentry *parent;
2022 struct iattr attr;
2023 struct rpc_cred *cred;
2024 struct nfs4_state *state; 2009 struct nfs4_state *state;
2025 struct dentry *res;
2026 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
2027
2028 if (nd->flags & LOOKUP_CREATE) {
2029 attr.ia_mode = nd->intent.open.create_mode;
2030 attr.ia_valid = ATTR_MODE;
2031 if (!IS_POSIXACL(dir))
2032 attr.ia_mode &= ~current_umask();
2033 } else {
2034 attr.ia_valid = 0;
2035 BUG_ON(nd->intent.open.flags & O_CREAT);
2036 }
2037 2010
2038 cred = rpc_lookup_cred();
2039 if (IS_ERR(cred))
2040 return (struct dentry *)cred;
2041 parent = dentry->d_parent;
2042 /* Protect against concurrent sillydeletes */ 2011 /* Protect against concurrent sillydeletes */
2043 nfs_block_sillyrename(parent); 2012 state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
2044 state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred); 2013 if (IS_ERR(state))
2045 put_rpccred(cred); 2014 return ERR_CAST(state);
2046 if (IS_ERR(state)) { 2015 ctx->state = state;
2047 if (PTR_ERR(state) == -ENOENT) { 2016 return igrab(state->inode);
2048 d_add(dentry, NULL);
2049 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2050 }
2051 nfs_unblock_sillyrename(parent);
2052 return (struct dentry *)state;
2053 }
2054 res = d_add_unique(dentry, igrab(state->inode));
2055 if (res != NULL)
2056 path.dentry = res;
2057 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
2058 nfs_unblock_sillyrename(parent);
2059 nfs4_intent_set_file(nd, &path, state, fmode);
2060 return res;
2061}
2062
2063int
2064nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
2065{
2066 struct path path = {
2067 .mnt = nd->path.mnt,
2068 .dentry = dentry,
2069 };
2070 struct rpc_cred *cred;
2071 struct nfs4_state *state;
2072 fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
2073
2074 cred = rpc_lookup_cred();
2075 if (IS_ERR(cred))
2076 return PTR_ERR(cred);
2077 state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
2078 put_rpccred(cred);
2079 if (IS_ERR(state)) {
2080 switch (PTR_ERR(state)) {
2081 case -EPERM:
2082 case -EACCES:
2083 case -EDQUOT:
2084 case -ENOSPC:
2085 case -EROFS:
2086 return PTR_ERR(state);
2087 default:
2088 goto out_drop;
2089 }
2090 }
2091 if (state->inode == dentry->d_inode) {
2092 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2093 nfs4_intent_set_file(nd, &path, state, fmode);
2094 return 1;
2095 }
2096 nfs4_close_sync(&path, state, fmode);
2097out_drop:
2098 d_drop(dentry);
2099 return 0;
2100} 2017}
2101 2018
2102static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) 2019static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
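The nameidata-based nfs4_atomic_open()/nfs4_open_revalidate() pair, including credential lookup, sillyrename blocking and dentry instantiation, is replaced above by a single context-based open that just returns the inode; the VFS-facing plumbing presumably moves to the generic NFS dir code via the new .open_context hook registered at the end of this file. A hypothetical caller (illustrative only, not the actual dir.c code) would look roughly like:

	struct inode *inode;

	inode = nfs4_atomic_open(dir, ctx, open_flags, attr);
	if (IS_ERR(inode))
		return PTR_ERR(inode);		/* e.g. -ENOENT */
	/* ctx->state now carries the NFSv4 open state; instantiate
	 * the dentry with the inode obtained via igrab() */
	d_add(dentry, inode);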
@@ -2260,8 +2177,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
2260out: 2177out:
2261 if (page) 2178 if (page)
2262 __free_page(page); 2179 __free_page(page);
2263 if (locations) 2180 kfree(locations);
2264 kfree(locations);
2265 return status; 2181 return status;
2266} 2182}
2267 2183
@@ -2554,36 +2470,34 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
2554 2470
2555static int 2471static int
2556nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 2472nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2557 int flags, struct nameidata *nd) 2473 int flags, struct nfs_open_context *ctx)
2558{ 2474{
2559 struct path path = { 2475 struct path my_path = {
2560 .mnt = nd->path.mnt,
2561 .dentry = dentry, 2476 .dentry = dentry,
2562 }; 2477 };
2478 struct path *path = &my_path;
2563 struct nfs4_state *state; 2479 struct nfs4_state *state;
2564 struct rpc_cred *cred; 2480 struct rpc_cred *cred = NULL;
2565 fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE); 2481 fmode_t fmode = 0;
2566 int status = 0; 2482 int status = 0;
2567 2483
2568 cred = rpc_lookup_cred(); 2484 if (ctx != NULL) {
2569 if (IS_ERR(cred)) { 2485 cred = ctx->cred;
2570 status = PTR_ERR(cred); 2486 path = &ctx->path;
2571 goto out; 2487 fmode = ctx->mode;
2572 } 2488 }
2573 state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred); 2489 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
2574 d_drop(dentry); 2490 d_drop(dentry);
2575 if (IS_ERR(state)) { 2491 if (IS_ERR(state)) {
2576 status = PTR_ERR(state); 2492 status = PTR_ERR(state);
2577 goto out_putcred; 2493 goto out;
2578 } 2494 }
2579 d_add(dentry, igrab(state->inode)); 2495 d_add(dentry, igrab(state->inode));
2580 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 2496 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2581 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2497 if (ctx != NULL)
2582 status = nfs4_intent_set_file(nd, &path, state, fmode); 2498 ctx->state = state;
2583 else 2499 else
2584 nfs4_close_sync(&path, state, fmode); 2500 nfs4_close_sync(path, state, fmode);
2585out_putcred:
2586 put_rpccred(cred);
2587out: 2501out:
2588 return status; 2502 return status;
2589} 2503}
@@ -2641,6 +2555,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
2641 2555
2642 args->bitmask = server->cache_consistency_bitmask; 2556 args->bitmask = server->cache_consistency_bitmask;
2643 res->server = server; 2557 res->server = server;
2558 res->seq_res.sr_slot = NULL;
2644 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 2559 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
2645} 2560}
2646 2561
@@ -2648,7 +2563,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2648{ 2563{
2649 struct nfs_removeres *res = task->tk_msg.rpc_resp; 2564 struct nfs_removeres *res = task->tk_msg.rpc_resp;
2650 2565
2651 nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); 2566 if (!nfs4_sequence_done(task, &res->seq_res))
2567 return 0;
2652 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 2568 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2653 return 0; 2569 return 0;
2654 update_changeattr(dir, &res->cinfo); 2570 update_changeattr(dir, &res->cinfo);
@@ -2656,18 +2572,46 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2656 return 1; 2572 return 1;
2657} 2573}
2658 2574
2575static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
2576{
2577 struct nfs_server *server = NFS_SERVER(dir);
2578 struct nfs_renameargs *arg = msg->rpc_argp;
2579 struct nfs_renameres *res = msg->rpc_resp;
2580
2581 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
2582 arg->bitmask = server->attr_bitmask;
2583 res->server = server;
2584}
2585
2586static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
2587 struct inode *new_dir)
2588{
2589 struct nfs_renameres *res = task->tk_msg.rpc_resp;
2590
2591 if (!nfs4_sequence_done(task, &res->seq_res))
2592 return 0;
2593 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2594 return 0;
2595
2596 update_changeattr(old_dir, &res->old_cinfo);
2597 nfs_post_op_update_inode(old_dir, res->old_fattr);
2598 update_changeattr(new_dir, &res->new_cinfo);
2599 nfs_post_op_update_inode(new_dir, res->new_fattr);
2600 return 1;
2601}
2602
2659static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, 2603static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2660 struct inode *new_dir, struct qstr *new_name) 2604 struct inode *new_dir, struct qstr *new_name)
2661{ 2605{
2662 struct nfs_server *server = NFS_SERVER(old_dir); 2606 struct nfs_server *server = NFS_SERVER(old_dir);
2663 struct nfs4_rename_arg arg = { 2607 struct nfs_renameargs arg = {
2664 .old_dir = NFS_FH(old_dir), 2608 .old_dir = NFS_FH(old_dir),
2665 .new_dir = NFS_FH(new_dir), 2609 .new_dir = NFS_FH(new_dir),
2666 .old_name = old_name, 2610 .old_name = old_name,
2667 .new_name = new_name, 2611 .new_name = new_name,
2668 .bitmask = server->attr_bitmask, 2612 .bitmask = server->attr_bitmask,
2669 }; 2613 };
2670 struct nfs4_rename_res res = { 2614 struct nfs_renameres res = {
2671 .server = server, 2615 .server = server,
2672 }; 2616 };
2673 struct rpc_message msg = { 2617 struct rpc_message msg = {
@@ -2881,15 +2825,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2881} 2825}
2882 2826
2883static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2827static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2884 u64 cookie, struct page *page, unsigned int count, int plus) 2828 u64 cookie, struct page **pages, unsigned int count, int plus)
2885{ 2829{
2886 struct inode *dir = dentry->d_inode; 2830 struct inode *dir = dentry->d_inode;
2887 struct nfs4_readdir_arg args = { 2831 struct nfs4_readdir_arg args = {
2888 .fh = NFS_FH(dir), 2832 .fh = NFS_FH(dir),
2889 .pages = &page, 2833 .pages = pages,
2890 .pgbase = 0, 2834 .pgbase = 0,
2891 .count = count, 2835 .count = count,
2892 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, 2836 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
2837 .plus = plus,
2893 }; 2838 };
2894 struct nfs4_readdir_res res; 2839 struct nfs4_readdir_res res;
2895 struct rpc_message msg = { 2840 struct rpc_message msg = {
@@ -2917,14 +2862,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2917} 2862}
2918 2863
2919static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2864static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2920 u64 cookie, struct page *page, unsigned int count, int plus) 2865 u64 cookie, struct page **pages, unsigned int count, int plus)
2921{ 2866{
2922 struct nfs4_exception exception = { }; 2867 struct nfs4_exception exception = { };
2923 int err; 2868 int err;
2924 do { 2869 do {
2925 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), 2870 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
2926 _nfs4_proc_readdir(dentry, cred, cookie, 2871 _nfs4_proc_readdir(dentry, cred, cookie,
2927 page, count, plus), 2872 pages, count, plus),
2928 &exception); 2873 &exception);
2929 } while (exception.retry); 2874 } while (exception.retry);
2930 return err; 2875 return err;
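Readdir now takes a vector of pages rather than a single page, letting the generic code request more than one page worth of entries per READDIR; the readdirplus choice also travels in the args (.plus) instead of being implied by the caller. A hedged usage sketch, assuming the caller has already allocated the pages (the array size is illustrative; count is the byte total requested):

	struct page *pages[8];			/* hypothetical size */
	/* ... allocate the pages ... */
	err = nfs4_proc_readdir(dentry, cred, cookie, pages,
				8 * PAGE_SIZE, plus);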
@@ -3093,7 +3038,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3093 3038
3094 dprintk("--> %s\n", __func__); 3039 dprintk("--> %s\n", __func__);
3095 3040
3096 nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); 3041 if (!nfs4_sequence_done(task, &data->res.seq_res))
3042 return -EAGAIN;
3097 3043
3098 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 3044 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3099 nfs_restart_rpc(task, server->nfs_client); 3045 nfs_restart_rpc(task, server->nfs_client);
@@ -3116,8 +3062,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3116{ 3062{
3117 struct inode *inode = data->inode; 3063 struct inode *inode = data->inode;
3118 3064
3119 nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, 3065 if (!nfs4_sequence_done(task, &data->res.seq_res))
3120 task->tk_status); 3066 return -EAGAIN;
3121 3067
3122 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 3068 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3123 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3069 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
@@ -3145,8 +3091,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
3145{ 3091{
3146 struct inode *inode = data->inode; 3092 struct inode *inode = data->inode;
3147 3093
3148 nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, 3094 if (!nfs4_sequence_done(task, &data->res.seq_res))
3149 task->tk_status); 3095 return -EAGAIN;
3096
3150 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 3097 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
3151 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3098 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
3152 return -EAGAIN; 3099 return -EAGAIN;
@@ -3196,10 +3143,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3196 nfs4_schedule_state_recovery(clp); 3143 nfs4_schedule_state_recovery(clp);
3197 return; 3144 return;
3198 } 3145 }
3199 spin_lock(&clp->cl_lock); 3146 do_renew_lease(clp, timestamp);
3200 if (time_before(clp->cl_last_renewal,timestamp))
3201 clp->cl_last_renewal = timestamp;
3202 spin_unlock(&clp->cl_lock);
3203} 3147}
3204 3148
3205static const struct rpc_call_ops nfs4_renew_ops = { 3149static const struct rpc_call_ops nfs4_renew_ops = {
@@ -3240,10 +3184,7 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
3240 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); 3184 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
3241 if (status < 0) 3185 if (status < 0)
3242 return status; 3186 return status;
3243 spin_lock(&clp->cl_lock); 3187 do_renew_lease(clp, now);
3244 if (time_before(clp->cl_last_renewal,now))
3245 clp->cl_last_renewal = now;
3246 spin_unlock(&clp->cl_lock);
3247 return 0; 3188 return 0;
3248} 3189}
3249 3190
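Both RENEW completion paths now funnel through do_renew_lease() instead of open-coding the timestamp update under cl_lock. The helper itself is not shown in this diff, but the removed lines pin down what it must do, so it is presumably the factored-out equivalent of:

	static void do_renew_lease(struct nfs_client *clp,
				   unsigned long timestamp)
	{
		spin_lock(&clp->cl_lock);
		if (time_before(clp->cl_last_renewal, timestamp))
			clp->cl_last_renewal = timestamp;
		spin_unlock(&clp->cl_lock);
	}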
@@ -3464,9 +3405,11 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
3464} 3405}
3465 3406
3466static int 3407static int
3467_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) 3408nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
3468{ 3409{
3469 if (!clp || task->tk_status >= 0) 3410 struct nfs_client *clp = server->nfs_client;
3411
3412 if (task->tk_status >= 0)
3470 return 0; 3413 return 0;
3471 switch(task->tk_status) { 3414 switch(task->tk_status) {
3472 case -NFS4ERR_ADMIN_REVOKED: 3415 case -NFS4ERR_ADMIN_REVOKED:
@@ -3477,9 +3420,6 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3477 nfs4_state_mark_reclaim_nograce(clp, state); 3420 nfs4_state_mark_reclaim_nograce(clp, state);
3478 goto do_state_recovery; 3421 goto do_state_recovery;
3479 case -NFS4ERR_STALE_STATEID: 3422 case -NFS4ERR_STALE_STATEID:
3480 if (state == NULL)
3481 break;
3482 nfs4_state_mark_reclaim_reboot(clp, state);
3483 case -NFS4ERR_STALE_CLIENTID: 3423 case -NFS4ERR_STALE_CLIENTID:
3484 case -NFS4ERR_EXPIRED: 3424 case -NFS4ERR_EXPIRED:
3485 goto do_state_recovery; 3425 goto do_state_recovery;
@@ -3498,8 +3438,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3498 return -EAGAIN; 3438 return -EAGAIN;
3499#endif /* CONFIG_NFS_V4_1 */ 3439#endif /* CONFIG_NFS_V4_1 */
3500 case -NFS4ERR_DELAY: 3440 case -NFS4ERR_DELAY:
3501 if (server) 3441 nfs_inc_server_stats(server, NFSIOS_DELAY);
3502 nfs_inc_server_stats(server, NFSIOS_DELAY);
3503 case -NFS4ERR_GRACE: 3442 case -NFS4ERR_GRACE:
3504 case -EKEYEXPIRED: 3443 case -EKEYEXPIRED:
3505 rpc_delay(task, NFS4_POLL_RETRY_MAX); 3444 rpc_delay(task, NFS4_POLL_RETRY_MAX);
@@ -3520,12 +3459,6 @@ do_state_recovery:
3520 return -EAGAIN; 3459 return -EAGAIN;
3521} 3460}
3522 3461
3523static int
3524nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
3525{
3526 return _nfs4_async_handle_error(task, server, server->nfs_client, state);
3527}
3528
3529int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, 3462int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3530 unsigned short port, struct rpc_cred *cred, 3463 unsigned short port, struct rpc_cred *cred,
3531 struct nfs4_setclientid_res *res) 3464 struct nfs4_setclientid_res *res)
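With the wrapper gone, nfs4_async_handle_error() derives the nfs_client from the (now mandatory) nfs_server, so the NULL checks on clp and server disappear; the two callers that had no server at all, SEQUENCE and RECLAIM_COMPLETE, grow their own small handlers later in this diff (nfs41_sequence_handle_errors and nfs41_reclaim_complete_handle_errors). Note also the removed NFS4ERR_STALE_STATEID branch: the reboot-reclaim marking is dropped there, and the case now falls through to generic state recovery. Schematically:

	/* before: two entry points, NULL-tolerant */
	_nfs4_async_handle_error(task, server, clp, state);

	/* after: one entry point, server required */
	nfs4_async_handle_error(task, server, state);	/* clp = server->nfs_client */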
@@ -3620,7 +3553,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3620 case -NFS4ERR_RESOURCE: 3553 case -NFS4ERR_RESOURCE:
3621 /* The IBM lawyers misread another document! */ 3554 /* The IBM lawyers misread another document! */
3622 case -NFS4ERR_DELAY: 3555 case -NFS4ERR_DELAY:
3623 case -EKEYEXPIRED:
3624 err = nfs4_delay(clp->cl_rpcclient, &timeout); 3556 err = nfs4_delay(clp->cl_rpcclient, &timeout);
3625 } 3557 }
3626 } while (err == 0); 3558 } while (err == 0);
@@ -3641,8 +3573,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
3641{ 3573{
3642 struct nfs4_delegreturndata *data = calldata; 3574 struct nfs4_delegreturndata *data = calldata;
3643 3575
3644 nfs4_sequence_done(data->res.server, &data->res.seq_res, 3576 if (!nfs4_sequence_done(task, &data->res.seq_res))
3645 task->tk_status); 3577 return;
3646 3578
3647 switch (task->tk_status) { 3579 switch (task->tk_status) {
3648 case -NFS4ERR_STALE_STATEID: 3580 case -NFS4ERR_STALE_STATEID:
@@ -3672,7 +3604,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
3672 3604
3673 d_data = (struct nfs4_delegreturndata *)data; 3605 d_data = (struct nfs4_delegreturndata *)data;
3674 3606
3675 if (nfs4_setup_sequence(d_data->res.server->nfs_client, 3607 if (nfs4_setup_sequence(d_data->res.server,
3676 &d_data->args.seq_args, 3608 &d_data->args.seq_args,
3677 &d_data->res.seq_res, 1, task)) 3609 &d_data->res.seq_res, 1, task))
3678 return; 3610 return;
@@ -3715,7 +3647,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3715 memcpy(&data->stateid, stateid, sizeof(data->stateid)); 3647 memcpy(&data->stateid, stateid, sizeof(data->stateid));
3716 data->res.fattr = &data->fattr; 3648 data->res.fattr = &data->fattr;
3717 data->res.server = server; 3649 data->res.server = server;
3718 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3719 nfs_fattr_init(data->res.fattr); 3650 nfs_fattr_init(data->res.fattr);
3720 data->timestamp = jiffies; 3651 data->timestamp = jiffies;
3721 data->rpc_status = 0; 3652 data->rpc_status = 0;
@@ -3868,7 +3799,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3868 p->arg.fl = &p->fl; 3799 p->arg.fl = &p->fl;
3869 p->arg.seqid = seqid; 3800 p->arg.seqid = seqid;
3870 p->res.seqid = seqid; 3801 p->res.seqid = seqid;
3871 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3872 p->arg.stateid = &lsp->ls_stateid; 3802 p->arg.stateid = &lsp->ls_stateid;
3873 p->lsp = lsp; 3803 p->lsp = lsp;
3874 atomic_inc(&lsp->ls_count); 3804 atomic_inc(&lsp->ls_count);
@@ -3892,9 +3822,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3892{ 3822{
3893 struct nfs4_unlockdata *calldata = data; 3823 struct nfs4_unlockdata *calldata = data;
3894 3824
3895 nfs4_sequence_done(calldata->server, &calldata->res.seq_res, 3825 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
3896 task->tk_status);
3897 if (RPC_ASSASSINATED(task))
3898 return; 3826 return;
3899 switch (task->tk_status) { 3827 switch (task->tk_status) {
3900 case 0: 3828 case 0:
@@ -3927,7 +3855,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
3927 return; 3855 return;
3928 } 3856 }
3929 calldata->timestamp = jiffies; 3857 calldata->timestamp = jiffies;
3930 if (nfs4_setup_sequence(calldata->server->nfs_client, 3858 if (nfs4_setup_sequence(calldata->server,
3931 &calldata->arg.seq_args, 3859 &calldata->arg.seq_args,
3932 &calldata->res.seq_res, 1, task)) 3860 &calldata->res.seq_res, 1, task))
3933 return; 3861 return;
@@ -4050,7 +3978,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
4050 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 3978 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
4051 p->arg.lock_owner.id = lsp->ls_id.id; 3979 p->arg.lock_owner.id = lsp->ls_id.id;
4052 p->res.lock_seqid = p->arg.lock_seqid; 3980 p->res.lock_seqid = p->arg.lock_seqid;
4053 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4054 p->lsp = lsp; 3981 p->lsp = lsp;
4055 p->server = server; 3982 p->server = server;
4056 atomic_inc(&lsp->ls_count); 3983 atomic_inc(&lsp->ls_count);
@@ -4082,7 +4009,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
4082 } else 4009 } else
4083 data->arg.new_lock_owner = 0; 4010 data->arg.new_lock_owner = 0;
4084 data->timestamp = jiffies; 4011 data->timestamp = jiffies;
4085 if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, 4012 if (nfs4_setup_sequence(data->server,
4013 &data->arg.seq_args,
4086 &data->res.seq_res, 1, task)) 4014 &data->res.seq_res, 1, task))
4087 return; 4015 return;
4088 rpc_call_start(task); 4016 rpc_call_start(task);
@@ -4101,12 +4029,10 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
4101 4029
4102 dprintk("%s: begin!\n", __func__); 4030 dprintk("%s: begin!\n", __func__);
4103 4031
4104 nfs4_sequence_done(data->server, &data->res.seq_res, 4032 if (!nfs4_sequence_done(task, &data->res.seq_res))
4105 task->tk_status); 4033 return;
4106 4034
4107 data->rpc_status = task->tk_status; 4035 data->rpc_status = task->tk_status;
4108 if (RPC_ASSASSINATED(task))
4109 goto out;
4110 if (data->arg.new_lock_owner != 0) { 4036 if (data->arg.new_lock_owner != 0) {
4111 if (data->rpc_status == 0) 4037 if (data->rpc_status == 0)
4112 nfs_confirm_seqid(&data->lsp->ls_seqid, 0); 4038 nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
@@ -4238,7 +4164,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
4238 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) 4164 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
4239 return 0; 4165 return 0;
4240 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); 4166 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
4241 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 4167 if (err != -NFS4ERR_DELAY)
4242 break; 4168 break;
4243 nfs4_handle_exception(server, err, &exception); 4169 nfs4_handle_exception(server, err, &exception);
4244 } while (exception.retry); 4170 } while (exception.retry);
@@ -4263,7 +4189,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
4263 goto out; 4189 goto out;
4264 case -NFS4ERR_GRACE: 4190 case -NFS4ERR_GRACE:
4265 case -NFS4ERR_DELAY: 4191 case -NFS4ERR_DELAY:
4266 case -EKEYEXPIRED:
4267 nfs4_handle_exception(server, err, &exception); 4192 nfs4_handle_exception(server, err, &exception);
4268 err = 0; 4193 err = 0;
4269 } 4194 }
@@ -4409,13 +4334,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4409 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4334 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
4410 err = 0; 4335 err = 0;
4411 goto out; 4336 goto out;
4337 case -EKEYEXPIRED:
4338 /*
4339 * User RPCSEC_GSS context has expired.
4340 * We cannot recover this stateid now, so
4341 * skip it and allow the recovery thread to

4342 * proceed.
4343 */
4344 err = 0;
4345 goto out;
4412 case -ENOMEM: 4346 case -ENOMEM:
4413 case -NFS4ERR_DENIED: 4347 case -NFS4ERR_DENIED:
4414 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 4348 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
4415 err = 0; 4349 err = 0;
4416 goto out; 4350 goto out;
4417 case -NFS4ERR_DELAY: 4351 case -NFS4ERR_DELAY:
4418 case -EKEYEXPIRED:
4419 break; 4352 break;
4420 } 4353 }
4421 err = nfs4_handle_exception(server, err, &exception); 4354 err = nfs4_handle_exception(server, err, &exception);
@@ -4424,6 +4357,34 @@ out:
4424 return err; 4357 return err;
4425} 4358}
4426 4359
4360static void nfs4_release_lockowner_release(void *calldata)
4361{
4362 kfree(calldata);
4363}
4364
4365const struct rpc_call_ops nfs4_release_lockowner_ops = {
4366 .rpc_release = nfs4_release_lockowner_release,
4367};
4368
4369void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
4370{
4371 struct nfs_server *server = lsp->ls_state->owner->so_server;
4372 struct nfs_release_lockowner_args *args;
4373 struct rpc_message msg = {
4374 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
4375 };
4376
4377 if (server->nfs_client->cl_mvops->minor_version != 0)
4378 return;
4379 args = kmalloc(sizeof(*args), GFP_NOFS);
4380 if (!args)
4381 return;
4382 args->lock_owner.clientid = server->nfs_client->cl_clientid;
4383 args->lock_owner.id = lsp->ls_id.id;
4384 msg.rpc_argp = args;
4385 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
4386}
4387
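RELEASE_LOCKOWNER exists only in NFSv4.0 (v4.1 retires it along with the other clientid-era operations), hence the early return for any nonzero minor version. The call is fire-and-forget: the kmalloc'd args double as the callback data and are freed in .rpc_release, so the submitter never waits on the reply. A hypothetical call site, e.g. where the last reference to a lock state is dropped (sketch only; the real hook lives in the state-management code):

	if (atomic_dec_and_test(&lsp->ls_count)) {
		nfs4_release_lockowner(lsp);
		/* ... free the nfs4_lock_state ... */
	}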
4427#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 4388#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
4428 4389
4429int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, 4390int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
@@ -4611,11 +4572,11 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4611 (struct nfs4_get_lease_time_data *)calldata; 4572 (struct nfs4_get_lease_time_data *)calldata;
4612 4573
4613 dprintk("--> %s\n", __func__); 4574 dprintk("--> %s\n", __func__);
4614 nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); 4575 if (!nfs41_sequence_done(task, &data->res->lr_seq_res))
4576 return;
4615 switch (task->tk_status) { 4577 switch (task->tk_status) {
4616 case -NFS4ERR_DELAY: 4578 case -NFS4ERR_DELAY:
4617 case -NFS4ERR_GRACE: 4579 case -NFS4ERR_GRACE:
4618 case -EKEYEXPIRED:
4619 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); 4580 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4620 rpc_delay(task, NFS4_POLL_RETRY_MIN); 4581 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4621 task->tk_status = 0; 4582 task->tk_status = 0;
@@ -4655,7 +4616,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4655 }; 4616 };
4656 int status; 4617 int status;
4657 4618
4658 res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4659 dprintk("--> %s\n", __func__); 4619 dprintk("--> %s\n", __func__);
4660 task = rpc_run_task(&task_setup); 4620 task = rpc_run_task(&task_setup);
4661 4621
@@ -4805,13 +4765,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4805 if (!session) 4765 if (!session)
4806 return NULL; 4766 return NULL;
4807 4767
4808 /*
4809 * The create session reply races with the server back
4810 * channel probe. Mark the client NFS_CS_SESSION_INITING
4811 * so that the client back channel can find the
4812 * nfs_client struct
4813 */
4814 clp->cl_cons_state = NFS_CS_SESSION_INITING;
4815 init_completion(&session->complete); 4768 init_completion(&session->complete);
4816 4769
4817 tbl = &session->fc_slot_table; 4770 tbl = &session->fc_slot_table;
@@ -4824,6 +4777,8 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4824 spin_lock_init(&tbl->slot_tbl_lock); 4777 spin_lock_init(&tbl->slot_tbl_lock);
4825 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); 4778 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4826 4779
4780 session->session_state = 1<<NFS4_SESSION_INITING;
4781
4827 session->clp = clp; 4782 session->clp = clp;
4828 return session; 4783 return session;
4829} 4784}
@@ -4887,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4887 args->bc_attrs.max_reqs); 4842 args->bc_attrs.max_reqs);
4888} 4843}
4889 4844
4890static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd) 4845static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4891{ 4846{
4892 if (rcvd <= sent) 4847 struct nfs4_channel_attrs *sent = &args->fc_attrs;
4893 return 0; 4848 struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
4894 printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. " 4849
4895 "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd); 4850 if (rcvd->headerpadsz > sent->headerpadsz)
4896 return -EINVAL; 4851 return -EINVAL;
4852 if (rcvd->max_resp_sz > sent->max_resp_sz)
4853 return -EINVAL;
4854 /*
4855 * Our requested max_ops is the minimum we need; we're not
4856 * prepared to break up compounds into smaller pieces than that.
4857 * So, no point even trying to continue if the server won't
4858 * cooperate:
4859 */
4860 if (rcvd->max_ops < sent->max_ops)
4861 return -EINVAL;
4862 if (rcvd->max_reqs == 0)
4863 return -EINVAL;
4864 return 0;
4897} 4865}
4898 4866
4899#define _verify_fore_channel_attr(_name_) \ 4867static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4900 _verify_channel_attr("fore", #_name_, \ 4868{
4901 args->fc_attrs._name_, \ 4869 struct nfs4_channel_attrs *sent = &args->bc_attrs;
4902 session->fc_attrs._name_) 4870 struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
4903 4871
4904#define _verify_back_channel_attr(_name_) \ 4872 if (rcvd->max_rqst_sz > sent->max_rqst_sz)
4905 _verify_channel_attr("back", #_name_, \ 4873 return -EINVAL;
4906 args->bc_attrs._name_, \ 4874 if (rcvd->max_resp_sz < sent->max_resp_sz)
4907 session->bc_attrs._name_) 4875 return -EINVAL;
4876 if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
4877 return -EINVAL;
4878 /* These would render the backchannel useless: */
4879 if (rcvd->max_ops == 0)
4880 return -EINVAL;
4881 if (rcvd->max_reqs == 0)
4882 return -EINVAL;
4883 return 0;
4884}
4908 4885
4909/*
4910 * The server is not allowed to increase the fore channel header pad size,
4911 * maximum response size, or maximum number of operations.
4912 *
4913 * The back channel attributes are only negotiated down: We send what the
4914 * (back channel) server insists upon.
4915 */
4916static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, 4886static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
4917 struct nfs4_session *session) 4887 struct nfs4_session *session)
4918{ 4888{
4919 int ret = 0; 4889 int ret;
4920
4921 ret |= _verify_fore_channel_attr(headerpadsz);
4922 ret |= _verify_fore_channel_attr(max_resp_sz);
4923 ret |= _verify_fore_channel_attr(max_ops);
4924
4925 ret |= _verify_back_channel_attr(headerpadsz);
4926 ret |= _verify_back_channel_attr(max_rqst_sz);
4927 ret |= _verify_back_channel_attr(max_resp_sz);
4928 ret |= _verify_back_channel_attr(max_resp_sz_cached);
4929 ret |= _verify_back_channel_attr(max_ops);
4930 ret |= _verify_back_channel_attr(max_reqs);
4931 4890
4932 return ret; 4891 ret = nfs4_verify_fore_channel_attrs(args, session);
4892 if (ret)
4893 return ret;
4894 return nfs4_verify_back_channel_attrs(args, session);
4933} 4895}
4934 4896
4935static int _nfs4_proc_create_session(struct nfs_client *clp) 4897static int _nfs4_proc_create_session(struct nfs_client *clp)
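The macro-generated checks applied one rule ("received must not exceed sent") to every attribute; the rewrite spells out the correct direction per attribute and adds checks the macros never made. Concretely, for the fore channel the requested max_ops is now a floor rather than a ceiling (per the comment in the hunk) and max_reqs == 0 is rejected; for the back channel a server that enlarges max_resp_sz is now accepted, while zero max_ops or max_reqs is not. Worked values, assuming the structures above:

	/* fore channel */
	sent->max_ops  = 8;  rcvd->max_ops  = 16;  /* old: reject, new: accept */
	sent->max_reqs = 64; rcvd->max_reqs = 0;   /* old: accept, new: reject */

	/* back channel */
	sent->max_resp_sz = 4096; rcvd->max_resp_sz = 8192;  /* old: reject, new: accept */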
@@ -5040,6 +5002,10 @@ int nfs4_init_session(struct nfs_server *server)
5040 if (!nfs4_has_session(clp)) 5002 if (!nfs4_has_session(clp))
5041 return 0; 5003 return 0;
5042 5004
5005 session = clp->cl_session;
5006 if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
5007 return 0;
5008
5043 rsize = server->rsize; 5009 rsize = server->rsize;
5044 if (rsize == 0) 5010 if (rsize == 0)
5045 rsize = NFS_MAX_FILE_IO_SIZE; 5011 rsize = NFS_MAX_FILE_IO_SIZE;
@@ -5047,7 +5013,6 @@ int nfs4_init_session(struct nfs_server *server)
5047 if (wsize == 0) 5013 if (wsize == 0)
5048 wsize = NFS_MAX_FILE_IO_SIZE; 5014 wsize = NFS_MAX_FILE_IO_SIZE;
5049 5015
5050 session = clp->cl_session;
5051 session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; 5016 session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
5052 session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; 5017 session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
5053 5018
@@ -5060,69 +5025,69 @@ int nfs4_init_session(struct nfs_server *server)
5060/* 5025/*
5061 * Renew the cl_session lease. 5026 * Renew the cl_session lease.
5062 */ 5027 */
5063static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) 5028struct nfs4_sequence_data {
5064{ 5029 struct nfs_client *clp;
5065 struct nfs4_sequence_args args; 5030 struct nfs4_sequence_args args;
5066 struct nfs4_sequence_res res; 5031 struct nfs4_sequence_res res;
5067 5032};
5068 struct rpc_message msg = {
5069 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
5070 .rpc_argp = &args,
5071 .rpc_resp = &res,
5072 .rpc_cred = cred,
5073 };
5074
5075 args.sa_cache_this = 0;
5076
5077 return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
5078 &res, args.sa_cache_this, 1);
5079}
5080 5033
5081static void nfs41_sequence_release(void *data) 5034static void nfs41_sequence_release(void *data)
5082{ 5035{
5083 struct nfs_client *clp = (struct nfs_client *)data; 5036 struct nfs4_sequence_data *calldata = data;
5037 struct nfs_client *clp = calldata->clp;
5084 5038
5085 if (atomic_read(&clp->cl_count) > 1) 5039 if (atomic_read(&clp->cl_count) > 1)
5086 nfs4_schedule_state_renewal(clp); 5040 nfs4_schedule_state_renewal(clp);
5087 nfs_put_client(clp); 5041 nfs_put_client(clp);
5042 kfree(calldata);
5043}
5044
5045static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
5046{
5047 switch(task->tk_status) {
5048 case -NFS4ERR_DELAY:
5049 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5050 return -EAGAIN;
5051 default:
5052 nfs4_schedule_state_recovery(clp);
5053 }
5054 return 0;
5088} 5055}
5089 5056
5090static void nfs41_sequence_call_done(struct rpc_task *task, void *data) 5057static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
5091{ 5058{
5092 struct nfs_client *clp = (struct nfs_client *)data; 5059 struct nfs4_sequence_data *calldata = data;
5060 struct nfs_client *clp = calldata->clp;
5093 5061
5094 nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); 5062 if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
5063 return;
5095 5064
5096 if (task->tk_status < 0) { 5065 if (task->tk_status < 0) {
5097 dprintk("%s ERROR %d\n", __func__, task->tk_status); 5066 dprintk("%s ERROR %d\n", __func__, task->tk_status);
5098 if (atomic_read(&clp->cl_count) == 1) 5067 if (atomic_read(&clp->cl_count) == 1)
5099 goto out; 5068 goto out;
5100 5069
5101 if (_nfs4_async_handle_error(task, NULL, clp, NULL) 5070 if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
5102 == -EAGAIN) { 5071 rpc_restart_call_prepare(task);
5103 nfs_restart_rpc(task, clp);
5104 return; 5072 return;
5105 } 5073 }
5106 } 5074 }
5107 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); 5075 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
5108out: 5076out:
5109 kfree(task->tk_msg.rpc_argp);
5110 kfree(task->tk_msg.rpc_resp);
5111
5112 dprintk("<-- %s\n", __func__); 5077 dprintk("<-- %s\n", __func__);
5113} 5078}
5114 5079
5115static void nfs41_sequence_prepare(struct rpc_task *task, void *data) 5080static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
5116{ 5081{
5117 struct nfs_client *clp; 5082 struct nfs4_sequence_data *calldata = data;
5083 struct nfs_client *clp = calldata->clp;
5118 struct nfs4_sequence_args *args; 5084 struct nfs4_sequence_args *args;
5119 struct nfs4_sequence_res *res; 5085 struct nfs4_sequence_res *res;
5120 5086
5121 clp = (struct nfs_client *)data;
5122 args = task->tk_msg.rpc_argp; 5087 args = task->tk_msg.rpc_argp;
5123 res = task->tk_msg.rpc_resp; 5088 res = task->tk_msg.rpc_resp;
5124 5089
5125 if (nfs4_setup_sequence(clp, args, res, 0, task)) 5090 if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
5126 return; 5091 return;
5127 rpc_call_start(task); 5092 rpc_call_start(task);
5128} 5093}
@@ -5133,32 +5098,66 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
5133 .rpc_release = nfs41_sequence_release, 5098 .rpc_release = nfs41_sequence_release,
5134}; 5099};
5135 5100
5136static int nfs41_proc_async_sequence(struct nfs_client *clp, 5101static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5137 struct rpc_cred *cred)
5138{ 5102{
5139 struct nfs4_sequence_args *args; 5103 struct nfs4_sequence_data *calldata;
5140 struct nfs4_sequence_res *res;
5141 struct rpc_message msg = { 5104 struct rpc_message msg = {
5142 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], 5105 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
5143 .rpc_cred = cred, 5106 .rpc_cred = cred,
5144 }; 5107 };
5108 struct rpc_task_setup task_setup_data = {
5109 .rpc_client = clp->cl_rpcclient,
5110 .rpc_message = &msg,
5111 .callback_ops = &nfs41_sequence_ops,
5112 .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT,
5113 };
5145 5114
5146 if (!atomic_inc_not_zero(&clp->cl_count)) 5115 if (!atomic_inc_not_zero(&clp->cl_count))
5147 return -EIO; 5116 return ERR_PTR(-EIO);
5148 args = kzalloc(sizeof(*args), GFP_NOFS); 5117 calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
5149 res = kzalloc(sizeof(*res), GFP_NOFS); 5118 if (calldata == NULL) {
5150 if (!args || !res) {
5151 kfree(args);
5152 kfree(res);
5153 nfs_put_client(clp); 5119 nfs_put_client(clp);
5154 return -ENOMEM; 5120 return ERR_PTR(-ENOMEM);
5155 } 5121 }
5156 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 5122 msg.rpc_argp = &calldata->args;
5157 msg.rpc_argp = args; 5123 msg.rpc_resp = &calldata->res;
5158 msg.rpc_resp = res; 5124 calldata->clp = clp;
5125 task_setup_data.callback_data = calldata;
5159 5126
5160 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, 5127 return rpc_run_task(&task_setup_data);
5161 &nfs41_sequence_ops, (void *)clp); 5128}
5129
5130static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5131{
5132 struct rpc_task *task;
5133 int ret = 0;
5134
5135 task = _nfs41_proc_sequence(clp, cred);
5136 if (IS_ERR(task))
5137 ret = PTR_ERR(task);
5138 else
5139 rpc_put_task(task);
5140 dprintk("<-- %s status=%d\n", __func__, ret);
5141 return ret;
5142}
5143
5144static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5145{
5146 struct rpc_task *task;
5147 int ret;
5148
5149 task = _nfs41_proc_sequence(clp, cred);
5150 if (IS_ERR(task)) {
5151 ret = PTR_ERR(task);
5152 goto out;
5153 }
5154 ret = rpc_wait_for_completion_task(task);
5155 if (!ret)
5156 ret = task->tk_status;
5157 rpc_put_task(task);
5158out:
5159 dprintk("<-- %s status=%d\n", __func__, ret);
5160 return ret;
5162} 5161}
5163 5162
5164struct nfs4_reclaim_complete_data { 5163struct nfs4_reclaim_complete_data {
@@ -5172,13 +5171,30 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
5172 struct nfs4_reclaim_complete_data *calldata = data; 5171 struct nfs4_reclaim_complete_data *calldata = data;
5173 5172
5174 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); 5173 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
5175 if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, 5174 if (nfs41_setup_sequence(calldata->clp->cl_session,
5175 &calldata->arg.seq_args,
5176 &calldata->res.seq_res, 0, task)) 5176 &calldata->res.seq_res, 0, task))
5177 return; 5177 return;
5178 5178
5179 rpc_call_start(task); 5179 rpc_call_start(task);
5180} 5180}
5181 5181
5182static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
5183{
5184 switch(task->tk_status) {
5185 case 0:
5186 case -NFS4ERR_COMPLETE_ALREADY:
5187 case -NFS4ERR_WRONG_CRED: /* What to do here? */
5188 break;
5189 case -NFS4ERR_DELAY:
5190 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5191 return -EAGAIN;
5192 default:
5193 nfs4_schedule_state_recovery(clp);
5194 }
5195 return 0;
5196}
5197
5182static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) 5198static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
5183{ 5199{
5184 struct nfs4_reclaim_complete_data *calldata = data; 5200 struct nfs4_reclaim_complete_data *calldata = data;
@@ -5186,32 +5202,13 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
5186 struct nfs4_sequence_res *res = &calldata->res.seq_res; 5202 struct nfs4_sequence_res *res = &calldata->res.seq_res;
5187 5203
5188 dprintk("--> %s\n", __func__); 5204 dprintk("--> %s\n", __func__);
5189 nfs41_sequence_done(clp, res, task->tk_status); 5205 if (!nfs41_sequence_done(task, res))
5190 switch (task->tk_status) { 5206 return;
5191 case 0:
5192 case -NFS4ERR_COMPLETE_ALREADY:
5193 break;
5194 case -NFS4ERR_BADSESSION:
5195 case -NFS4ERR_DEADSESSION:
5196 /*
5197 * Handle the session error, but do not retry the operation, as
5198 * we have no way of telling whether the clientid had to be
5199 * reset before we got our reply. If reset, a new wave of
5200 * reclaim operations will follow, containing their own reclaim
5201 * complete. We don't want our retry to get on the way of
5202 * recovery by incorrectly indicating to the server that we're
5203 * done reclaiming state since the process had to be restarted.
5204 */
5205 _nfs4_async_handle_error(task, NULL, clp, NULL);
5206 break;
5207 default:
5208 if (_nfs4_async_handle_error(
5209 task, NULL, clp, NULL) == -EAGAIN) {
5210 rpc_restart_call_prepare(task);
5211 return;
5212 }
5213 }
5214 5207
5208 if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
5209 rpc_restart_call_prepare(task);
5210 return;
5211 }
5215 dprintk("<-- %s\n", __func__); 5212 dprintk("<-- %s\n", __func__);
5216} 5213}
5217 5214
@@ -5252,7 +5249,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5252 goto out; 5249 goto out;
5253 calldata->clp = clp; 5250 calldata->clp = clp;
5254 calldata->arg.one_fs = 0; 5251 calldata->arg.one_fs = 0;
5255 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5256 5252
5257 msg.rpc_argp = &calldata->arg; 5253 msg.rpc_argp = &calldata->arg;
5258 msg.rpc_resp = &calldata->res; 5254 msg.rpc_resp = &calldata->res;
@@ -5268,6 +5264,147 @@ out:
5268 dprintk("<-- %s status=%d\n", __func__, status); 5264 dprintk("<-- %s status=%d\n", __func__, status);
5269 return status; 5265 return status;
5270} 5266}
5267
5268static void
5269nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
5270{
5271 struct nfs4_layoutget *lgp = calldata;
5272 struct inode *ino = lgp->args.inode;
5273 struct nfs_server *server = NFS_SERVER(ino);
5274
5275 dprintk("--> %s\n", __func__);
5276 if (nfs4_setup_sequence(server, &lgp->args.seq_args,
5277 &lgp->res.seq_res, 0, task))
5278 return;
5279 rpc_call_start(task);
5280}
5281
5282static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
5283{
5284 struct nfs4_layoutget *lgp = calldata;
5285 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5286
5287 dprintk("--> %s\n", __func__);
5288
5289 if (!nfs4_sequence_done(task, &lgp->res.seq_res))
5290 return;
5291
5292 switch (task->tk_status) {
5293 case 0:
5294 break;
5295 case -NFS4ERR_LAYOUTTRYLATER:
5296 case -NFS4ERR_RECALLCONFLICT:
5297 task->tk_status = -NFS4ERR_DELAY;
5298 /* Fall through */
5299 default:
5300 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5301 rpc_restart_call_prepare(task);
5302 return;
5303 }
5304 }
5305 lgp->status = task->tk_status;
5306 dprintk("<-- %s\n", __func__);
5307}
5308
5309static void nfs4_layoutget_release(void *calldata)
5310{
5311 struct nfs4_layoutget *lgp = calldata;
5312
5313 dprintk("--> %s\n", __func__);
5314 put_layout_hdr(lgp->args.inode);
5315 if (lgp->res.layout.buf != NULL)
5316 free_page((unsigned long) lgp->res.layout.buf);
5317 put_nfs_open_context(lgp->args.ctx);
5318 kfree(calldata);
5319 dprintk("<-- %s\n", __func__);
5320}
5321
5322static const struct rpc_call_ops nfs4_layoutget_call_ops = {
5323 .rpc_call_prepare = nfs4_layoutget_prepare,
5324 .rpc_call_done = nfs4_layoutget_done,
5325 .rpc_release = nfs4_layoutget_release,
5326};
5327
5328int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5329{
5330 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5331 struct rpc_task *task;
5332 struct rpc_message msg = {
5333 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
5334 .rpc_argp = &lgp->args,
5335 .rpc_resp = &lgp->res,
5336 };
5337 struct rpc_task_setup task_setup_data = {
5338 .rpc_client = server->client,
5339 .rpc_message = &msg,
5340 .callback_ops = &nfs4_layoutget_call_ops,
5341 .callback_data = lgp,
5342 .flags = RPC_TASK_ASYNC,
5343 };
5344 int status = 0;
5345
5346 dprintk("--> %s\n", __func__);
5347
5348 lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
5349 if (lgp->res.layout.buf == NULL) {
5350 nfs4_layoutget_release(lgp);
5351 return -ENOMEM;
5352 }
5353
5354 lgp->res.seq_res.sr_slot = NULL;
5355 task = rpc_run_task(&task_setup_data);
5356 if (IS_ERR(task))
5357 return PTR_ERR(task);
5358 status = nfs4_wait_for_completion_rpc_task(task);
5359 if (status != 0)
5360 goto out;
5361 status = lgp->status;
5362 if (status != 0)
5363 goto out;
5364 status = pnfs_layout_process(lgp);
5365out:
5366 rpc_put_task(task);
5367 dprintk("<-- %s status=%d\n", __func__, status);
5368 return status;
5369}
5370
5371static int
5372_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5373{
5374 struct nfs4_getdeviceinfo_args args = {
5375 .pdev = pdev,
5376 };
5377 struct nfs4_getdeviceinfo_res res = {
5378 .pdev = pdev,
5379 };
5380 struct rpc_message msg = {
5381 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
5382 .rpc_argp = &args,
5383 .rpc_resp = &res,
5384 };
5385 int status;
5386
5387 dprintk("--> %s\n", __func__);
5388 status = nfs4_call_sync(server, &msg, &args, &res, 0);
5389 dprintk("<-- %s status=%d\n", __func__, status);
5390
5391 return status;
5392}
5393
5394int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5395{
5396 struct nfs4_exception exception = { };
5397 int err;
5398
5399 do {
5400 err = nfs4_handle_exception(server,
5401 _nfs4_proc_getdeviceinfo(server, pdev),
5402 &exception);
5403 } while (exception.retry);
5404 return err;
5405}
5406EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
5407
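nfs4_proc_layoutget() runs LAYOUTGET as an async task but waits for it, so pnfs_layout_process() executes in the caller's context; its done callback folds the two "try again later" errors (LAYOUTTRYLATER, RECALLCONFLICT) into -NFS4ERR_DELAY so the generic handler applies its usual backoff-and-retry. Note the ownership rule visible in the code: lgp is consumed either way, since nfs4_layoutget_release() (the .rpc_release callback, also invoked directly on the allocation-failure path) frees it. A caller sketch, assuming the pNFS core allocates and fills the request:

	struct nfs4_layoutget *lgp;

	/* ... allocate lgp, fill lgp->args (inode, ctx, layout range) ... */
	status = nfs4_proc_layoutget(lgp);
	/* do not touch lgp here: it was freed via .rpc_release */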
5271#endif /* CONFIG_NFS_V4_1 */ 5408#endif /* CONFIG_NFS_V4_1 */
5272 5409
5273struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 5410struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5325,28 +5462,30 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
5325}; 5462};
5326#endif 5463#endif
5327 5464
5328/* 5465static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
5329 * Per minor version reboot and network partition recovery ops 5466 .minor_version = 0,
5330 */ 5467 .call_sync = _nfs4_call_sync,
5331 5468 .validate_stateid = nfs4_validate_delegation_stateid,
5332struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = { 5469 .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
5333 &nfs40_reboot_recovery_ops, 5470 .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
5334#if defined(CONFIG_NFS_V4_1) 5471 .state_renewal_ops = &nfs40_state_renewal_ops,
5335 &nfs41_reboot_recovery_ops,
5336#endif
5337}; 5472};
5338 5473
5339struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
5340 &nfs40_nograce_recovery_ops,
5341#if defined(CONFIG_NFS_V4_1) 5474#if defined(CONFIG_NFS_V4_1)
5342 &nfs41_nograce_recovery_ops, 5475static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
5343#endif 5476 .minor_version = 1,
5477 .call_sync = _nfs4_call_sync_session,
5478 .validate_stateid = nfs41_validate_delegation_stateid,
5479 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
5480 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
5481 .state_renewal_ops = &nfs41_state_renewal_ops,
5344}; 5482};
5483#endif
5345 5484
5346struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { 5485const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
5347 &nfs40_state_renewal_ops, 5486 [0] = &nfs_v4_0_minor_ops,
5348#if defined(CONFIG_NFS_V4_1) 5487#if defined(CONFIG_NFS_V4_1)
5349 &nfs41_state_renewal_ops, 5488 [1] = &nfs_v4_1_minor_ops,
5350#endif 5489#endif
5351}; 5490};
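The three parallel per-minor-version arrays are folded into one ops vector chosen per nfs_client, so call sites stop indexing by clp->cl_minorversion on every use. The before/after shape at a call site (sketch; cl_mvops is assumed to be pointed at the matching nfs_v4_minor_ops[] entry when the client is set up):

	/* before: one parallel array per subsystem */
	ops = nfs4_state_renewal_ops[clp->cl_minorversion];

	/* after: one dispatch structure resolved once */
	ops = clp->cl_mvops->state_renewal_ops;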
 
@@ -5376,6 +5515,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.unlink_setup	= nfs4_proc_unlink_setup,
 	.unlink_done	= nfs4_proc_unlink_done,
 	.rename		= nfs4_proc_rename,
+	.rename_setup	= nfs4_proc_rename_setup,
+	.rename_done	= nfs4_proc_rename_done,
 	.link		= nfs4_proc_link,
 	.symlink	= nfs4_proc_symlink,
 	.mkdir		= nfs4_proc_mkdir,
@@ -5396,6 +5537,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.lock		= nfs4_proc_lock,
 	.clear_acl_cache = nfs4_zap_acl_attr,
 	.close_context	= nfs4_close_context,
+	.open_context	= nfs4_atomic_open,
 };
 
 /*
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index d87f10327b7..72b6c580af1 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -54,14 +54,14 @@
 void
 nfs4_renew_state(struct work_struct *work)
 {
-	struct nfs4_state_maintenance_ops *ops;
+	const struct nfs4_state_maintenance_ops *ops;
 	struct nfs_client *clp =
 		container_of(work, struct nfs_client, cl_renewd.work);
 	struct rpc_cred *cred;
 	long lease;
 	unsigned long last, now;
 
-	ops = nfs4_state_renewal_ops[clp->cl_minorversion];
+	ops = clp->cl_mvops->state_renewal_ops;
 	dprintk("%s: start\n", __func__);
 	/* Are there any active superblocks? */
 	if (list_empty(&clp->cl_superblocks))
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 34acf5926fd..f575a312673 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -40,12 +40,13 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/fs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/ratelimit.h>
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 
@@ -53,6 +54,7 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #define OPENOWNER_POOL_SIZE 8
 
@@ -145,7 +147,9 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
 	struct nfs4_session *ses = clp->cl_session;
 	int max_slots;
 
-	if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
+	if (ses == NULL)
+		return;
+	if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
 		spin_lock(&ses->fc_slot_table.slot_tbl_lock);
 		max_slots = ses->fc_slot_table.max_slots;
 		while (max_slots--) {
@@ -167,7 +171,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
 	struct nfs4_slot_table *tbl = &ses->fc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);
+	set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
 	if (tbl->highest_used_slotid != -1) {
 		INIT_COMPLETION(ses->complete);
 		spin_unlock(&tbl->slot_tbl_lock);
@@ -371,7 +375,6 @@ nfs4_alloc_state_owner(void)
 		return NULL;
 	spin_lock_init(&sp->so_lock);
 	INIT_LIST_HEAD(&sp->so_states);
-	INIT_LIST_HEAD(&sp->so_delegations);
 	rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
 	sp->so_seqid.sequence = &sp->so_sequence;
 	spin_lock_init(&sp->so_sequence.lock);
@@ -384,7 +387,7 @@ static void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
 	if (!RB_EMPTY_NODE(&sp->so_client_node)) {
-		struct nfs_client *clp = sp->so_client;
+		struct nfs_client *clp = sp->so_server->nfs_client;
 
 		spin_lock(&clp->cl_lock);
 		rb_erase(&sp->so_client_node, &clp->cl_state_owners);
@@ -406,7 +409,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	new = nfs4_alloc_state_owner();
 	if (new == NULL)
 		return NULL;
-	new->so_client = clp;
 	new->so_server = server;
 	new->so_cred = cred;
 	spin_lock(&clp->cl_lock);
@@ -423,7 +425,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
-	struct nfs_client *clp = sp->so_client;
+	struct nfs_client *clp = sp->so_server->nfs_client;
 	struct rpc_cred *cred = sp->so_cred;
 
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
@@ -602,12 +604,21 @@ void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
  * that is compatible with current->files
  */
 static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
 	struct nfs4_lock_state *pos;
 	list_for_each_entry(pos, &state->lock_states, ls_locks) {
-		if (pos->ls_owner != fl_owner)
+		if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
 			continue;
+		switch (pos->ls_owner.lo_type) {
+		case NFS4_POSIX_LOCK_TYPE:
+			if (pos->ls_owner.lo_u.posix_owner != fl_owner)
+				continue;
+			break;
+		case NFS4_FLOCK_LOCK_TYPE:
+			if (pos->ls_owner.lo_u.flock_owner != fl_pid)
+				continue;
+		}
 		atomic_inc(&pos->ls_count);
 		return pos;
 	}
@@ -619,10 +630,10 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
 	struct nfs4_lock_state *lsp;
-	struct nfs_client *clp = state->owner->so_client;
+	struct nfs_client *clp = state->owner->so_server->nfs_client;
 
 	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
 	if (lsp == NULL)
@@ -633,7 +644,18 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	lsp->ls_seqid.sequence = &lsp->ls_sequence;
 	atomic_set(&lsp->ls_count, 1);
 	lsp->ls_state = state;
-	lsp->ls_owner = fl_owner;
+	lsp->ls_owner.lo_type = type;
+	switch (lsp->ls_owner.lo_type) {
+	case NFS4_FLOCK_LOCK_TYPE:
+		lsp->ls_owner.lo_u.flock_owner = fl_pid;
+		break;
+	case NFS4_POSIX_LOCK_TYPE:
+		lsp->ls_owner.lo_u.posix_owner = fl_owner;
+		break;
+	default:
+		kfree(lsp);
+		return NULL;
+	}
 	spin_lock(&clp->cl_lock);
 	nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
 	spin_unlock(&clp->cl_lock);
@@ -643,7 +665,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 
 static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 {
-	struct nfs_client *clp = lsp->ls_state->owner->so_client;
+	struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
 
 	spin_lock(&clp->cl_lock);
 	nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
@@ -657,13 +679,13 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
 {
 	struct nfs4_lock_state *lsp, *new = NULL;
 
 	for(;;) {
 		spin_lock(&state->state_lock);
-		lsp = __nfs4_find_lock_state(state, owner);
+		lsp = __nfs4_find_lock_state(state, owner, pid, type);
 		if (lsp != NULL)
 			break;
 		if (new != NULL) {
@@ -674,7 +696,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
 			break;
 		}
 		spin_unlock(&state->state_lock);
-		new = nfs4_alloc_lock_state(state, owner);
+		new = nfs4_alloc_lock_state(state, owner, pid, type);
 		if (new == NULL)
 			return NULL;
 	}
@@ -701,6 +723,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
+	if (lsp->ls_flags & NFS_LOCK_INITIALIZED)
+		nfs4_release_lockowner(lsp);
 	nfs4_free_lock_state(lsp);
 }
 
@@ -728,7 +752,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 
 	if (fl->fl_ops != NULL)
 		return 0;
-	lsp = nfs4_get_lock_state(state, fl->fl_owner);
+	if (fl->fl_flags & FL_POSIX)
+		lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
+	else if (fl->fl_flags & FL_FLOCK)
+		lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE);
+	else
+		return -EINVAL;
 	if (lsp == NULL)
 		return -ENOMEM;
 	fl->fl_u.nfs4_fl.owner = lsp;
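POSIX and flock locks now live in separate owner namespaces: POSIX state is keyed by fl_owner, flock state by fl_pid, and NFS4_ANY_LOCK_TYPE lets a lookup match either kind. The owner is a small tagged union along these lines (field names as used above; the exact declaration is a sketch):

	struct nfs4_lock_owner {
		unsigned int lo_type;		/* NFS4_POSIX_LOCK_TYPE or NFS4_FLOCK_LOCK_TYPE */
		union {
			fl_owner_t posix_owner;	/* POSIX: per-open-file owner cookie */
			pid_t flock_owner;	/* flock: id of the owning process */
		} lo_u;
	};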
@@ -740,7 +769,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
  * Byte-range lock aware utility to initialize the stateid of read/write
  * requests.
  */
-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
+void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
 {
 	struct nfs4_lock_state *lsp;
 	int seq;
@@ -753,7 +782,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
 		return;
 
 	spin_lock(&state->state_lock);
-	lsp = __nfs4_find_lock_state(state, fl_owner);
+	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
 	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
 		memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
 	spin_unlock(&state->state_lock);
@@ -943,13 +972,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 	/* Guard against delegation returns and new lock/unlock calls */
 	down_write(&nfsi->rwsem);
 	/* Protect inode->i_flock using the BKL */
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
 		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
-		unlock_kernel();
+		unlock_flocks();
 		status = ops->recover_lock(state, fl);
 		switch (status) {
 		case 0:
@@ -976,9 +1005,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 			/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 			status = 0;
 		}
-		lock_kernel();
+		lock_flocks();
 	}
-	unlock_kernel();
+	unlock_flocks();
 out:
 	up_write(&nfsi->rwsem);
 	return status;
@@ -1036,16 +1065,24 @@ restart:
 			/* Mark the file as being 'closed' */
 			state->state = 0;
 			break;
+		case -EKEYEXPIRED:
+			/*
+			 * User RPCSEC_GSS context has expired.
+			 * We cannot recover this stateid now, so
+			 * skip it and allow recovery thread to
+			 * proceed.
+			 */
+			break;
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_RECLAIM_BAD:
 		case -NFS4ERR_RECLAIM_CONFLICT:
-			nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+			nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
 			break;
 		case -NFS4ERR_EXPIRED:
 		case -NFS4ERR_NO_GRACE:
-			nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+			nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -1111,17 +1148,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
 	(void)ops->reclaim_complete(clp);
 }
 
-static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 	struct nfs4_state *state;
 
 	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
-		return;
-
-	nfs4_reclaim_complete(clp,
-		nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+		return 0;
 
 	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
 		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1135,6 +1169,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 	}
 
 	nfs_delegation_reap_unclaimed(clp);
+	return 1;
+}
+
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+{
+	if (!nfs4_state_clear_reclaim_reboot(clp))
+		return;
+	nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
 }
 
 static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1149,6 +1191,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
 	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
 
+static void nfs4_warn_keyexpired(const char *s)
+{
+	printk_ratelimited(KERN_WARNING "Error: state manager"
+			" encountered RPCSEC_GSS session"
+			" expired against NFSv4 server %s.\n",
+			s);
+}
+
 static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 {
 	switch (error) {
@@ -1161,7 +1211,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_LEASE_MOVED:
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-			nfs4_state_end_reclaim_reboot(clp);
+			nfs4_state_clear_reclaim_reboot(clp);
 			nfs4_state_start_reclaim_reboot(clp);
 			break;
 		case -NFS4ERR_EXPIRED:
@@ -1178,6 +1228,10 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
 			/* Zero session reset errors */
 			return 0;
+		case -EKEYEXPIRED:
+			/* Nothing we can do */
+			nfs4_warn_keyexpired(clp->cl_hostname);
+			return 0;
 	}
 	return error;
 }
@@ -1211,8 +1265,8 @@ restart:
 static int nfs4_check_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
-	struct nfs4_state_maintenance_ops *ops =
-		nfs4_state_renewal_ops[clp->cl_minorversion];
+	const struct nfs4_state_maintenance_ops *ops =
+		clp->cl_mvops->state_renewal_ops;
 	int status = -NFS4ERR_EXPIRED;
 
 	/* Is the client already known to have an expired lease? */
@@ -1235,8 +1289,8 @@ out:
 static int nfs4_reclaim_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
-	struct nfs4_state_recovery_ops *ops =
-		nfs4_reboot_recovery_ops[clp->cl_minorversion];
+	const struct nfs4_state_recovery_ops *ops =
+		clp->cl_mvops->reboot_recovery_ops;
 	int status = -ENOENT;
 
 	cred = ops->get_clid_cred(clp);
@@ -1388,9 +1442,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
 	case -NFS4ERR_DELAY:
 	case -NFS4ERR_CLID_INUSE:
 	case -EAGAIN:
-	case -EKEYEXPIRED:
 		break;
 
+	case -EKEYEXPIRED:
+		nfs4_warn_keyexpired(clp->cl_hostname);
 	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
 				 * in nfs4_exchange_id */
 	default:
@@ -1421,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 		}
 		clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
 		set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+		pnfs_destroy_all_layouts(clp);
 	}
 
 	if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
@@ -1444,7 +1500,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 		/* First recover reboot state... */
 		if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 			status = nfs4_do_reclaim(clp,
-				nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+				clp->cl_mvops->reboot_recovery_ops);
 			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
 			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
 				continue;
@@ -1458,7 +1514,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 		/* Now recover expired state... */
 		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
 			status = nfs4_do_reclaim(clp,
-				nfs4_nograce_recovery_ops[clp->cl_minorversion]);
+				clp->cl_mvops->nograce_recovery_ops);
 			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
 			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
 			    test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 65c8dae4b26..f313c4cce7e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
 #include <linux/nfs_idmap.h>
 #include "nfs4_fs.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
@@ -202,14 +203,17 @@ static int nfs4_stat_to_errno(int);
 #define encode_link_maxsz	(op_encode_hdr_maxsz + \
 				nfs4_name_maxsz)
 #define decode_link_maxsz	(op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_lockowner_maxsz	(7)
 #define encode_lock_maxsz	(op_encode_hdr_maxsz + \
 				7 + \
-				1 + encode_stateid_maxsz + 8)
+				1 + encode_stateid_maxsz + 1 + \
+				encode_lockowner_maxsz)
 #define decode_lock_denied_maxsz \
 				(8 + decode_lockowner_maxsz)
 #define decode_lock_maxsz	(op_decode_hdr_maxsz + \
 				decode_lock_denied_maxsz)
-#define encode_lockt_maxsz	(op_encode_hdr_maxsz + 12)
+#define encode_lockt_maxsz	(op_encode_hdr_maxsz + 5 + \
+				encode_lockowner_maxsz)
 #define decode_lockt_maxsz	(op_decode_hdr_maxsz + \
 				decode_lock_denied_maxsz)
 #define encode_locku_maxsz	(op_encode_hdr_maxsz + 3 + \
@@ -217,6 +221,11 @@ static int nfs4_stat_to_errno(int);
 				4)
 #define decode_locku_maxsz	(op_decode_hdr_maxsz + \
 				decode_stateid_maxsz)
+#define encode_release_lockowner_maxsz \
+				(op_encode_hdr_maxsz + \
+				 encode_lockowner_maxsz)
+#define decode_release_lockowner_maxsz \
+				(op_decode_hdr_maxsz)
 #define encode_access_maxsz	(op_encode_hdr_maxsz + 1)
 #define decode_access_maxsz	(op_decode_hdr_maxsz + 2)
 #define encode_symlink_maxsz	(op_encode_hdr_maxsz + \
@@ -302,6 +311,19 @@ static int nfs4_stat_to_errno(int);
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
+				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+				1 /* layout type */ + \
+				1 /* opaque devaddr4 length */ + \
+				/* devaddr4 payload is read into page */ \
+				1 /* notification bitmap length */ + \
+				1 /* notification bitmap */)
+#define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
+				encode_stateid_maxsz)
+#define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
+				decode_stateid_maxsz + \
+				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz	0
 #define decode_sequence_maxsz	0
@@ -471,6 +493,12 @@ static int nfs4_stat_to_errno(int);
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_locku_maxsz)
+#define NFS4_enc_release_lockowner_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_lockowner_maxsz)
+#define NFS4_dec_release_lockowner_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_lockowner_maxsz)
 #define NFS4_enc_access_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -685,6 +713,20 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
 					decode_sequence_maxsz + \
 					decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_getdeviceinfo_maxsz)
+#define NFS4_enc_layoutget_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_layoutget_maxsz)
 
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
 				compound_encode_hdr_maxsz +
@@ -744,7 +786,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 				struct compound_hdr *hdr)
 {
 	__be32 *p;
-	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth *auth = req->rq_cred->cr_auth;
 
 	/* initialize running count of expected bytes in reply.
 	 * NOTE: the replied tag SHOULD be the same is the one sent,
@@ -802,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_MODE)
 		len += 4;
 	if (iap->ia_valid & ATTR_UID) {
-		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
+		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
 		if (owner_namelen < 0) {
 			dprintk("nfs: couldn't resolve uid %d to string\n",
 				iap->ia_uid);
@@ -814,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
 	}
 	if (iap->ia_valid & ATTR_GID) {
-		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
+		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
 		if (owner_grouplen < 0) {
 			dprintk("nfs: couldn't resolve gid %d to string\n",
 				iap->ia_gid);
@@ -1042,6 +1084,17 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
 	return fl->fl_end - fl->fl_start + 1;
 }
 
+static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 28);
+	p = xdr_encode_hyper(p, lowner->clientid);
+	*p++ = cpu_to_be32(16);
+	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+	xdr_encode_hyper(p, lowner->id);
+}
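The 28 bytes reserved in encode_lockowner() decompose exactly into the lock_owner4 this client puts on the wire:

	/*
	 * Byte map of the encoding above (illustrative):
	 *
	 *   offset  size  field
	 *        0     8  lowner->clientid          (XDR hyper)
	 *        8     4  opaque length, always 16
	 *       12     8  literal prefix "lock id:"
	 *       20     8  lowner->id                (XDR hyper)
	 *   total: 28 bytes == reserve_space(xdr, 28)
	 */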
+
 /*
  * opcode,type,reclaim,offset,length,new_lock_owner = 32
  * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
@@ -1058,14 +1111,11 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	*p = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
-		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
+		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
-		p = xdr_encode_hyper(p, args->lock_owner.clientid);
-		*p++ = cpu_to_be32(16);
-		p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-		xdr_encode_hyper(p, args->lock_owner.id);
+		encode_lockowner(xdr, &args->lock_owner);
 	}
 	else {
 		p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
@@ -1080,15 +1130,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 {
 	__be32 *p;
 
-	p = reserve_space(xdr, 52);
+	p = reserve_space(xdr, 24);
 	*p++ = cpu_to_be32(OP_LOCKT);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
-	p = xdr_encode_hyper(p, args->lock_owner.clientid);
-	*p++ = cpu_to_be32(16);
-	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-	xdr_encode_hyper(p, args->lock_owner.id);
+	encode_lockowner(xdr, &args->lock_owner);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
 }
@@ -1108,6 +1155,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	hdr->replen += decode_locku_maxsz;
 }
 
+static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
+	encode_lockowner(xdr, lowner);
+	hdr->nops++;
+	hdr->replen += decode_release_lockowner_maxsz;
+}
+
 static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
 {
 	int len = name->len;
@@ -1172,7 +1230,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 		break;
 	default:
 		clp = arg->server->nfs_client;
-		if (clp->cl_minorversion > 0) {
+		if (clp->cl_mvops->minor_version > 0) {
 			if (nfs4_has_persistent_session(clp)) {
 				*p = cpu_to_be32(NFS4_CREATE_GUARDED);
 				encode_attrs(xdr, arg->u.attrs, arg->server);
@@ -1324,14 +1382,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	hdr->replen += decode_putrootfh_maxsz;
 }
 
-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
+static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
 {
 	nfs4_stateid stateid;
 	__be32 *p;
 
 	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
-		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
+		nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
 		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
 		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1344,7 +1402,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_READ);
 
-	encode_stateid(xdr, args->context);
+	encode_stateid(xdr, args->context, args->lock_context);
 
 	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
@@ -1355,24 +1413,35 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-	uint32_t attrs[2] = {
-		FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
-		FATTR4_WORD1_MOUNTED_ON_FILEID,
-	};
+	uint32_t attrs[2] = {0, 0};
+	uint32_t dircount = readdir->count >> 1;
 	__be32 *p;
 
+	if (readdir->plus) {
+		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
+			FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
+		attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
+			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
+			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
+			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		dircount >>= 1;
+	}
+	attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
+	attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+	/* Switch to mounted_on_fileid if the server supports it */
+	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+		attrs[0] &= ~FATTR4_WORD0_FILEID;
+	else
+		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
 	p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
 	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
-	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
+	*p++ = cpu_to_be32(dircount);
 	*p++ = cpu_to_be32(readdir->count);
 	*p++ = cpu_to_be32(2);
-	/* Switch to mounted_on_fileid if the server supports it */
-	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
-		attrs[0] &= ~FATTR4_WORD0_FILEID;
-	else
-		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
 	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
 	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
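Note the buffer accounting: dircount starts at half of readdir->count and is halved again for readdirplus, since each entry then carries a full attribute set, while readdir->count itself is still sent as maxcount. The resulting server hints (a sketch of the arithmetic only):

	u32 maxcount = readdir->count;	/* total reply bytes the client accepts */
	u32 dircount = maxcount >> 1;	/* hint for directory-information bytes */
	if (readdir->plus)
		dircount >>= 1;		/* attributes dominate each entry */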
@@ -1523,7 +1592,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_WRITE);
 
-	encode_stateid(xdr, args->context);
+	encode_stateid(xdr, args->context, args->lock_context);
 
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
@@ -1696,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr,
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#ifdef CONFIG_NFS_V4_1
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+		     const struct nfs4_getdeviceinfo_args *args,
+		     struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
+	*p++ = cpu_to_be32(OP_GETDEVICEINFO);
+	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
+				    NFS4_DEVICEID4_SIZE);
+	*p++ = cpu_to_be32(args->pdev->layout_type);
+	*p++ = cpu_to_be32(args->pdev->pglen);		/* gdia_maxcount */
+	*p++ = cpu_to_be32(0);				/* bitmap length 0 */
+	hdr->nops++;
+	hdr->replen += decode_getdeviceinfo_maxsz;
+}
+
+static void
+encode_layoutget(struct xdr_stream *xdr,
+		 const struct nfs4_layoutget_args *args,
+		 struct compound_hdr *hdr)
+{
+	nfs4_stateid stateid;
+	__be32 *p;
+
+	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
+	*p++ = cpu_to_be32(OP_LAYOUTGET);
+	*p++ = cpu_to_be32(0);     /* Signal layout available */
+	*p++ = cpu_to_be32(args->type);
+	*p++ = cpu_to_be32(args->range.iomode);
+	p = xdr_encode_hyper(p, args->range.offset);
+	p = xdr_encode_hyper(p, args->range.length);
+	p = xdr_encode_hyper(p, args->minlength);
+	pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+				args->ctx->state);
+	p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+	*p = cpu_to_be32(args->maxcount);
+
+	dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
+		__func__,
+		args->type,
+		args->range.iomode,
+		(unsigned long)args->range.offset,
+		(unsigned long)args->range.length,
+		args->maxcount);
+	hdr->nops++;
+	hdr->replen += decode_layoutget_maxsz;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" ENCODE ROUTINES.
  */
@@ -1704,7 +1825,7 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
 {
 #if defined(CONFIG_NFS_V4_1)
 	if (args->sa_session)
-		return args->sa_session->clp->cl_minorversion;
+		return args->sa_session->clp->cl_mvops->minor_version;
 #endif /* CONFIG_NFS_V4_1 */
 	return 0;
 }
@@ -1793,7 +1914,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
  * Encode RENAME request
  */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
+static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -2048,6 +2169,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
 	return 0;
 }
 
+static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = 0,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
 /*
  * Encode a READLINK request
  */
@@ -2395,7 +2530,7 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.minorversion = args->client->cl_minorversion,
+		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2413,7 +2548,7 @@ static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.minorversion = args->client->cl_minorversion,
+		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2431,7 +2566,7 @@ static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.minorversion = session->clp->cl_minorversion,
+		.minorversion = session->clp->cl_mvops->minor_version,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2499,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
 	return 0;
 }
 
+/*
+ * Encode GETDEVICEINFO request
+ */
+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
+				      struct nfs4_getdeviceinfo_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
+	encode_getdeviceinfo(&xdr, args, &hdr);
+
+	/* set up reply kvec. Subtract notification bitmap max size (2)
+	 * so that notification bitmap is put in xdr_buf tail */
+	xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
+			 args->pdev->pages, args->pdev->pgbase,
+			 args->pdev->pglen);
+
+	encode_nops(&hdr);
+	return 0;
+}
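The '(hdr.replen - 2) << 2' is deliberate: decode_getdeviceinfo_maxsz budgets two words for the notification bitmap, and subtracting them before xdr_inline_pages() places the opaque devaddr4 body in the page array while the trailing bitmap is decoded from the xdr_buf tail. Schematically (illustrative):

	/*
	 * reply xdr_buf after xdr_inline_pages((hdr.replen - 2) << 2, ...):
	 *
	 *   head : compound, SEQUENCE and GETDEVICEINFO header words
	 *   pages: opaque devaddr4 payload (args->pdev->pages, pglen bytes)
	 *   tail : notification bitmap length + bitmap (the 2 reserved words)
	 */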
+
+/*
+ * Encode LAYOUTGET request
+ */
+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
+				  struct nfs4_layoutget_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
+	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutget(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2632,7 +2812,10 @@ out_overflow:
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
 {
 	if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
-		decode_attr_bitmap(xdr, bitmask);
+		int ret;
+		ret = decode_attr_bitmap(xdr, bitmask);
+		if (unlikely(ret < 0))
+			return ret;
 		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
 	} else
 		bitmask[0] = bitmask[1] = 0;
@@ -2804,6 +2987,56 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+	__be32 *p;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
+{
+	__be32 *p;
+	int len;
+
+	if (fh != NULL)
+		memset(fh, 0, sizeof(*fh));
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		if (len > NFS4_FHSIZE)
+			return -EIO;
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (fh != NULL) {
+			memcpy(fh->data, p, len);
+			fh->size = len;
+		}
+		bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
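decode_attr_filehandle() is careful on two counts: a server-supplied length above NFS4_FHSIZE is rejected before anything is copied into fh->data, and the opaque bytes are still consumed from the stream when the caller passed fh == NULL, so later attribute decodes stay aligned. Condensed (sketch):

	len = be32_to_cpup(p);			/* length chosen by the server */
	if (len > NFS4_FHSIZE)
		return -EIO;			/* never overrun fh->data */
	p = xdr_inline_decode(xdr, len);	/* advances the stream even if fh == NULL */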
+
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
 	__be32 *p;
@@ -3477,6 +3710,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
 	return status;
 }
 
+static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
+				  struct timespec *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
+		status = decode_attr_time(xdr, time);
+		bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
+	}
+	dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
+		(long)time->tv_nsec);
+	return status;
+}
+
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
 	int status = 0;
@@ -3700,29 +3951,14 @@ xdr_error:
 	return status;
 }
 
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_fattr *fattr, struct nfs_fh *fh,
 		const struct nfs_server *server, int may_sleep)
 {
-	__be32 *savep;
-	uint32_t attrlen,
-		bitmap[2] = {0},
-		type;
 	int status;
 	umode_t fmode = 0;
 	uint64_t fileid;
-
-	status = decode_op_hdr(xdr, OP_GETATTR);
-	if (status < 0)
-		goto xdr_error;
-
-	status = decode_attr_bitmap(xdr, bitmap);
-	if (status < 0)
-		goto xdr_error;
-
-	status = decode_attr_length(xdr, &attrlen, &savep);
-	if (status < 0)
-		goto xdr_error;
-
+	uint32_t type;
 
 	status = decode_attr_type(xdr, bitmap, &type);
 	if (status < 0)
@@ -3748,6 +3984,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 		goto xdr_error;
 	fattr->valid |= status;
 
+	status = decode_attr_error(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_filehandle(xdr, bitmap, fh);
+	if (status < 0)
+		goto xdr_error;
+
 	status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
 	if (status < 0)
 		goto xdr_error;
@@ -3818,12 +4062,101 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 		fattr->valid |= status;
 	}
 
+xdr_error:
+	dprintk("%s: xdr returned %d\n", __func__, -status);
+	return status;
+}
+
+static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
+{
+	__be32 *savep;
+	uint32_t attrlen,
+		bitmap[2] = {0};
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETATTR);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_length(xdr, &attrlen, &savep);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
+	if (status < 0)
+		goto xdr_error;
+
 	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
 	return status;
 }
 
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		const struct nfs_server *server, int may_sleep)
+{
+	return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
+}
+
+/*
+ * Decode potentially multiple layout types. Currently we only support
+ * one layout driver per file system.
+ */
+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
+					 uint32_t *layouttype)
+{
+	uint32_t *p;
+	int num;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	num = be32_to_cpup(p);
+
+	/* pNFS is not supported by the underlying file system */
+	if (num == 0) {
+		*layouttype = 0;
+		return 0;
+	}
+	if (num > 1)
+		printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
+			"per filesystem not supported\n", __func__);
+
+	/* Decode and set first layout type, move xdr->p past unused types */
+	p = xdr_inline_decode(xdr, num * 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	*layouttype = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
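decode_first_pnfs_layout_type() is defensive in the same spirit: all num * 4 bytes of the fs_layout_types array are pulled from the stream so decoding stays aligned, but only the first entry is kept, matching the one-layout-driver-per-filesystem limit stated in the comment. The essential moves (sketch):

	num = be32_to_cpup(p);			/* count of advertised layout types */
	p = xdr_inline_decode(xdr, num * 4);	/* consume the whole array */
	*layouttype = be32_to_cpup(p);		/* keep entry 0, skip the rest */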
+
+/*
+ * The type of file system exported.
+ * Note we must ensure that layouttype is set in any non-error case.
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+				uint32_t *layouttype)
+{
+	int status = 0;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+		return -EIO;
+	if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+		status = decode_first_pnfs_layout_type(xdr, layouttype);
+		bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+	} else
+		*layouttype = 0;
+	return status;
+}
 
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
@@ -3850,6 +4183,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
3850 if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) 4183 if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
3851 goto xdr_error; 4184 goto xdr_error;
3852 fsinfo->wtpref = fsinfo->wtmax; 4185 fsinfo->wtpref = fsinfo->wtmax;
4186 status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
4187 if (status != 0)
4188 goto xdr_error;
4189 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
4190 if (status != 0)
4191 goto xdr_error;
3853 4192
3854 status = verify_attr_len(xdr, savep, attrlen); 4193 status = verify_attr_len(xdr, savep, attrlen);
3855xdr_error: 4194xdr_error:
@@ -3906,13 +4245,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3906 __be32 *p; 4245 __be32 *p;
3907 uint32_t namelen, type; 4246 uint32_t namelen, type;
3908 4247
3909 p = xdr_inline_decode(xdr, 32); 4248 p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
3910 if (unlikely(!p)) 4249 if (unlikely(!p))
3911 goto out_overflow; 4250 goto out_overflow;
3912 p = xdr_decode_hyper(p, &offset); 4251 p = xdr_decode_hyper(p, &offset); /* read two 8-byte words */
3913 p = xdr_decode_hyper(p, &length); 4252 p = xdr_decode_hyper(p, &length);
3914 type = be32_to_cpup(p++); 4253 type = be32_to_cpup(p++); /* 4 byte read */
3915 if (fl != NULL) { 4254 if (fl != NULL) { /* manipulate file lock */
3916 fl->fl_start = (loff_t)offset; 4255 fl->fl_start = (loff_t)offset;
3917 fl->fl_end = fl->fl_start + (loff_t)length - 1; 4256 fl->fl_end = fl->fl_start + (loff_t)length - 1;
3918 if (length == ~(uint64_t)0) 4257 if (length == ~(uint64_t)0)
@@ -3922,9 +4261,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3922 fl->fl_type = F_RDLCK; 4261 fl->fl_type = F_RDLCK;
3923 fl->fl_pid = 0; 4262 fl->fl_pid = 0;
3924 } 4263 }
3925 p = xdr_decode_hyper(p, &clientid); 4264 p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
3926 namelen = be32_to_cpup(p); 4265 namelen = be32_to_cpup(p); /* read 4 bytes; all 32 bytes now read */
3927 p = xdr_inline_decode(xdr, namelen); 4266 p = xdr_inline_decode(xdr, namelen); /* variable size field */
3928 if (likely(p)) 4267 if (likely(p))
3929 return -NFS4ERR_DENIED; 4268 return -NFS4ERR_DENIED;
3930out_overflow: 4269out_overflow:
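The 32-byte fixed-size read above breaks down as: offset (8) + length (8) + lock type (4) + clientid (8) + owner name length (4) = 32 bytes; the variable-length owner name that follows is pulled with its own xdr_inline_decode(xdr, namelen) call.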
@@ -3973,6 +4312,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
3973 return status; 4312 return status;
3974} 4313}
3975 4314
4315static int decode_release_lockowner(struct xdr_stream *xdr)
4316{
4317 return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
4318}
4319
3976static int decode_lookup(struct xdr_stream *xdr) 4320static int decode_lookup(struct xdr_stream *xdr)
3977{ 4321{
3978 return decode_op_hdr(xdr, OP_LOOKUP); 4322 return decode_op_hdr(xdr, OP_LOOKUP);
@@ -4151,12 +4495,9 @@ out_overflow:
4151static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) 4495static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
4152{ 4496{
4153 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 4497 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
4154 struct page *page = *rcvbuf->pages;
4155 struct kvec *iov = rcvbuf->head; 4498 struct kvec *iov = rcvbuf->head;
4156 size_t hdrlen; 4499 size_t hdrlen;
4157 u32 recvd, pglen = rcvbuf->page_len; 4500 u32 recvd, pglen = rcvbuf->page_len;
4158 __be32 *end, *entry, *p, *kaddr;
4159 unsigned int nr = 0;
4160 int status; 4501 int status;
4161 4502
4162 status = decode_op_hdr(xdr, OP_READDIR); 4503 status = decode_op_hdr(xdr, OP_READDIR);
@@ -4176,71 +4517,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
4176 pglen = recvd; 4517 pglen = recvd;
4177 xdr_read_pages(xdr, pglen); 4518 xdr_read_pages(xdr, pglen);
4178 4519
4179 BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE); 4520
4180 kaddr = p = kmap_atomic(page, KM_USER0);
4181 end = p + ((pglen + readdir->pgbase) >> 2);
4182 entry = p;
4183
4184 /* Make sure the packet actually has a value_follows and EOF entry */
4185 if ((entry + 1) > end)
4186 goto short_pkt;
4187
4188 for (; *p++; nr++) {
4189 u32 len, attrlen, xlen;
4190 if (end - p < 3)
4191 goto short_pkt;
4192 dprintk("cookie = %Lu, ", *((unsigned long long *)p));
4193 p += 2; /* cookie */
4194 len = ntohl(*p++); /* filename length */
4195 if (len > NFS4_MAXNAMLEN) {
4196 dprintk("NFS: giant filename in readdir (len 0x%x)\n",
4197 len);
4198 goto err_unmap;
4199 }
4200 xlen = XDR_QUADLEN(len);
4201 if (end - p < xlen + 1)
4202 goto short_pkt;
4203 dprintk("filename = %*s\n", len, (char *)p);
4204 p += xlen;
4205 len = ntohl(*p++); /* bitmap length */
4206 if (end - p < len + 1)
4207 goto short_pkt;
4208 p += len;
4209 attrlen = XDR_QUADLEN(ntohl(*p++));
4210 if (end - p < attrlen + 2)
4211 goto short_pkt;
4212 p += attrlen; /* attributes */
4213 entry = p;
4214 }
4215 /*
4216 * Apparently some server sends responses that are a valid size, but
4217 * contain no entries, and have value_follows==0 and EOF==0. For
4218 * those, just set the EOF marker.
4219 */
4220 if (!nr && entry[1] == 0) {
4221 dprintk("NFS: readdir reply truncated!\n");
4222 entry[1] = 1;
4223 }
4224out:
4225 kunmap_atomic(kaddr, KM_USER0);
4226 return 0; 4521 return 0;
4227short_pkt:
4228 /*
4229 * When we get a short packet there are 2 possibilities. We can
4230 * return an error, or fix up the response to look like a valid
4231 * response and return what we have so far. If there are no
4232 * entries and the packet was short, then return -EIO. If there
4233 * are valid entries in the response, return them and pretend that
4234 * the call was successful, but incomplete. The caller can retry the
4235 * readdir starting at the last cookie.
4236 */
4237 dprintk("%s: short packet at entry %d\n", __func__, nr);
4238 entry[0] = entry[1] = 0;
4239 if (nr)
4240 goto out;
4241err_unmap:
4242 kunmap_atomic(kaddr, KM_USER0);
4243 return -errno_NFSERR_IO;
4244} 4522}
4245 4523
4246static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) 4524static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -4250,7 +4528,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
4250 size_t hdrlen; 4528 size_t hdrlen;
4251 u32 len, recvd; 4529 u32 len, recvd;
4252 __be32 *p; 4530 __be32 *p;
4253 char *kaddr;
4254 int status; 4531 int status;
4255 4532
4256 status = decode_op_hdr(xdr, OP_READLINK); 4533 status = decode_op_hdr(xdr, OP_READLINK);
@@ -4281,9 +4558,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
4281 * and null-terminate the text (the VFS expects 4558 * and null-terminate the text (the VFS expects
4282 * null-termination). 4559 * null-termination).
4283 */ 4560 */
4284 kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); 4561 xdr_terminate_string(rcvbuf, len);
4285 kaddr[len+rcvbuf->page_base] = '\0';
4286 kunmap_atomic(kaddr, KM_USER0);
4287 return 0; 4562 return 0;
4288out_overflow: 4563out_overflow:
4289 print_overflow_msg(__func__, xdr); 4564 print_overflow_msg(__func__, xdr);
@@ -4619,7 +4894,6 @@ static int decode_sequence(struct xdr_stream *xdr,
4619 struct rpc_rqst *rqstp) 4894 struct rpc_rqst *rqstp)
4620{ 4895{
4621#if defined(CONFIG_NFS_V4_1) 4896#if defined(CONFIG_NFS_V4_1)
4622 struct nfs4_slot *slot;
4623 struct nfs4_sessionid id; 4897 struct nfs4_sessionid id;
4624 u32 dummy; 4898 u32 dummy;
4625 int status; 4899 int status;
@@ -4651,15 +4925,14 @@ static int decode_sequence(struct xdr_stream *xdr,
4651 goto out_overflow; 4925 goto out_overflow;
4652 4926
4653 /* seqid */ 4927 /* seqid */
4654 slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
4655 dummy = be32_to_cpup(p++); 4928 dummy = be32_to_cpup(p++);
4656 if (dummy != slot->seq_nr) { 4929 if (dummy != res->sr_slot->seq_nr) {
4657 dprintk("%s Invalid sequence number\n", __func__); 4930 dprintk("%s Invalid sequence number\n", __func__);
4658 goto out_err; 4931 goto out_err;
4659 } 4932 }
4660 /* slot id */ 4933 /* slot id */
4661 dummy = be32_to_cpup(p++); 4934 dummy = be32_to_cpup(p++);
4662 if (dummy != res->sr_slotid) { 4935 if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) {
4663 dprintk("%s Invalid slot id\n", __func__); 4936 dprintk("%s Invalid slot id\n", __func__);
4664 goto out_err; 4937 goto out_err;
4665 } 4938 }
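The slot-id check above recovers the array index by pointer subtraction rather than storing it separately. A minimal sketch of the idiom (variable names are illustrative):

	struct nfs4_slot *base = res->sr_session->fc_slot_table.slots;
	u32 slotid = res->sr_slot - base;	/* element count, not bytes */

Because sr_slot points into the slots[] array, the difference is exactly the slot number that was encoded in the SEQUENCE arguments.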
@@ -4682,6 +4955,134 @@ out_overflow:
4682#endif /* CONFIG_NFS_V4_1 */ 4955#endif /* CONFIG_NFS_V4_1 */
4683} 4956}
4684 4957
4958#if defined(CONFIG_NFS_V4_1)
4959
4960static int decode_getdeviceinfo(struct xdr_stream *xdr,
4961 struct pnfs_device *pdev)
4962{
4963 __be32 *p;
4964 uint32_t len, type;
4965 int status;
4966
4967 status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
4968 if (status) {
4969 if (status == -ETOOSMALL) {
4970 p = xdr_inline_decode(xdr, 4);
4971 if (unlikely(!p))
4972 goto out_overflow;
4973 pdev->mincount = be32_to_cpup(p);
4974 dprintk("%s: Min count too small. mincnt = %u\n",
4975 __func__, pdev->mincount);
4976 }
4977 return status;
4978 }
4979
4980 p = xdr_inline_decode(xdr, 8);
4981 if (unlikely(!p))
4982 goto out_overflow;
4983 type = be32_to_cpup(p++);
4984 if (type != pdev->layout_type) {
4985 dprintk("%s: layout mismatch req: %u pdev: %u\n",
4986 __func__, pdev->layout_type, type);
4987 return -EINVAL;
4988 }
4989 /*
4990 * Get the length of the opaque device_addr4. xdr_read_pages places
4991 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
4992 * and places the remaining xdr data in xdr_buf->tail
4993 */
4994 pdev->mincount = be32_to_cpup(p);
4995 xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
4996
4997 /* Parse notification bitmap, verifying that it is zero. */
4998 p = xdr_inline_decode(xdr, 4);
4999 if (unlikely(!p))
5000 goto out_overflow;
5001 len = be32_to_cpup(p);
5002 if (len) {
5003 int i;
5004
5005 p = xdr_inline_decode(xdr, 4 * len);
5006 if (unlikely(!p))
5007 goto out_overflow;
5008 for (i = 0; i < len; i++, p++) {
5009 if (be32_to_cpup(p)) {
5010 dprintk("%s: notifications not supported\n",
5011 __func__);
5012 return -EIO;
5013 }
5014 }
5015 }
5016 return 0;
5017out_overflow:
5018 print_overflow_msg(__func__, xdr);
5019 return -EIO;
5020}
5021
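Reconstructed from the reads above, the GETDEVICEINFO reply body this decoder walks looks like the following (field labels are descriptive, not protocol names):

	/*
	 *   layout type          4 bytes  (must match pdev->layout_type)
	 *   device_addr4 length  4 bytes  (saved in pdev->mincount)
	 *   device_addr4 body    xdr_read_pages() leaves it in xdr_buf->pages
	 *   notification count   4 bytes
	 *   notification words   4 * count bytes (each must be zero here)
	 */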
5022static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
5023 struct nfs4_layoutget_res *res)
5024{
5025 __be32 *p;
5026 int status;
5027 u32 layout_count;
5028
5029 status = decode_op_hdr(xdr, OP_LAYOUTGET);
5030 if (status)
5031 return status;
5032 p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
5033 if (unlikely(!p))
5034 goto out_overflow;
5035 res->return_on_close = be32_to_cpup(p++);
5036 p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
5037 layout_count = be32_to_cpup(p);
5038 if (!layout_count) {
5039 dprintk("%s: server responded with empty layout array\n",
5040 __func__);
5041 return -EINVAL;
5042 }
5043
5044 p = xdr_inline_decode(xdr, 24);
5045 if (unlikely(!p))
5046 goto out_overflow;
5047 p = xdr_decode_hyper(p, &res->range.offset);
5048 p = xdr_decode_hyper(p, &res->range.length);
5049 res->range.iomode = be32_to_cpup(p++);
5050 res->type = be32_to_cpup(p++);
5051
5052 status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
5053 if (unlikely(status))
5054 return status;
5055
5056 dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
5057 __func__,
5058 (unsigned long)res->range.offset,
5059 (unsigned long)res->range.length,
5060 res->range.iomode,
5061 res->type,
5062 res->layout.len);
5063
5064 /* nfs4_proc_layoutget allocated a single page */
5065 if (res->layout.len > PAGE_SIZE)
5066 return -ENOMEM;
5067 memcpy(res->layout.buf, p, res->layout.len);
5068
5069 if (layout_count > 1) {
 5070 /* We only handle a length-one array at the moment. Any
5071 * further entries are just ignored. Note that this means
5072 * the client may see a response that is less than the
5073 * minimum it requested.
5074 */
5075 dprintk("%s: server responded with %d layouts, dropping tail\n",
5076 __func__, layout_count);
5077 }
5078
5079 return 0;
5080out_overflow:
5081 print_overflow_msg(__func__, xdr);
5082 return -EIO;
5083}
5084#endif /* CONFIG_NFS_V4_1 */
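For orientation, the LAYOUTGET reply fields consumed above, in wire order (labels are descriptive):

	/*
	 *   return_on_close   4 bytes
	 *   stateid           NFS4_STATEID_SIZE bytes
	 *   layout count      4 bytes  (only the first entry is used)
	 *   per entry:
	 *     offset          8 bytes
	 *     length          8 bytes
	 *     iomode          4 bytes
	 *     layout type     4 bytes
	 *     opaque body     length-prefixed, copied into res->layout.buf
	 */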
5085
4685/* 5086/*
4686 * END OF "GENERIC" DECODE ROUTINES. 5087 * END OF "GENERIC" DECODE ROUTINES.
4687 */ 5088 */
@@ -4824,7 +5225,7 @@ out:
4824/* 5225/*
4825 * Decode RENAME response 5226 * Decode RENAME response
4826 */ 5227 */
4827static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res) 5228static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
4828{ 5229{
4829 struct xdr_stream xdr; 5230 struct xdr_stream xdr;
4830 struct compound_hdr hdr; 5231 struct compound_hdr hdr;
@@ -5259,6 +5660,19 @@ out:
5259 return status; 5660 return status;
5260} 5661}
5261 5662
5663static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5664{
5665 struct xdr_stream xdr;
5666 struct compound_hdr hdr;
5667 int status;
5668
5669 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
5670 status = decode_compound_hdr(&xdr, &hdr);
5671 if (!status)
5672 status = decode_release_lockowner(&xdr);
5673 return status;
5674}
5675
5262/* 5676/*
5263 * Decode READLINK response 5677 * Decode READLINK response
5264 */ 5678 */
@@ -5696,25 +6110,84 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
5696 status = decode_reclaim_complete(&xdr, (void *)NULL); 6110 status = decode_reclaim_complete(&xdr, (void *)NULL);
5697 return status; 6111 return status;
5698} 6112}
6113
6114/*
6115 * Decode GETDEVINFO response
6116 */
6117static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
6118 struct nfs4_getdeviceinfo_res *res)
6119{
6120 struct xdr_stream xdr;
6121 struct compound_hdr hdr;
6122 int status;
6123
6124 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
6125 status = decode_compound_hdr(&xdr, &hdr);
6126 if (status != 0)
6127 goto out;
6128 status = decode_sequence(&xdr, &res->seq_res, rqstp);
6129 if (status != 0)
6130 goto out;
6131 status = decode_getdeviceinfo(&xdr, res->pdev);
6132out:
6133 return status;
6134}
6135
6136/*
6137 * Decode LAYOUTGET response
6138 */
6139static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
6140 struct nfs4_layoutget_res *res)
6141{
6142 struct xdr_stream xdr;
6143 struct compound_hdr hdr;
6144 int status;
6145
6146 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
6147 status = decode_compound_hdr(&xdr, &hdr);
6148 if (status)
6149 goto out;
6150 status = decode_sequence(&xdr, &res->seq_res, rqstp);
6151 if (status)
6152 goto out;
6153 status = decode_putfh(&xdr);
6154 if (status)
6155 goto out;
6156 status = decode_layoutget(&xdr, rqstp, res);
6157out:
6158 return status;
6159}
5699#endif /* CONFIG_NFS_V4_1 */ 6160#endif /* CONFIG_NFS_V4_1 */
5700 6161
5701__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 6162__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6163 struct nfs_server *server, int plus)
5702{ 6164{
5703 uint32_t bitmap[2] = {0}; 6165 uint32_t bitmap[2] = {0};
5704 uint32_t len; 6166 uint32_t len;
5705 6167 __be32 *p = xdr_inline_decode(xdr, 4);
5706 if (!*p++) { 6168 if (unlikely(!p))
5707 if (!*p) 6169 goto out_overflow;
6170 if (!ntohl(*p++)) {
6171 p = xdr_inline_decode(xdr, 4);
6172 if (unlikely(!p))
6173 goto out_overflow;
6174 if (!ntohl(*p++))
5708 return ERR_PTR(-EAGAIN); 6175 return ERR_PTR(-EAGAIN);
5709 entry->eof = 1; 6176 entry->eof = 1;
5710 return ERR_PTR(-EBADCOOKIE); 6177 return ERR_PTR(-EBADCOOKIE);
5711 } 6178 }
5712 6179
6180 p = xdr_inline_decode(xdr, 12);
6181 if (unlikely(!p))
6182 goto out_overflow;
5713 entry->prev_cookie = entry->cookie; 6183 entry->prev_cookie = entry->cookie;
5714 p = xdr_decode_hyper(p, &entry->cookie); 6184 p = xdr_decode_hyper(p, &entry->cookie);
5715 entry->len = ntohl(*p++); 6185 entry->len = ntohl(*p++);
6186
6187 p = xdr_inline_decode(xdr, entry->len);
6188 if (unlikely(!p))
6189 goto out_overflow;
5716 entry->name = (const char *) p; 6190 entry->name = (const char *) p;
5717 p += XDR_QUADLEN(entry->len);
5718 6191
5719 /* 6192 /*
5720 * In case the server doesn't return an inode number, 6193 * In case the server doesn't return an inode number,
@@ -5722,32 +6195,33 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
5722 * since glibc seems to choke on it...) 6195 * since glibc seems to choke on it...)
5723 */ 6196 */
5724 entry->ino = 1; 6197 entry->ino = 1;
6198 entry->fattr->valid = 0;
5725 6199
5726 len = ntohl(*p++); /* bitmap length */ 6200 if (decode_attr_bitmap(xdr, bitmap) < 0)
5727 if (len-- > 0) { 6201 goto out_overflow;
5728 bitmap[0] = ntohl(*p++); 6202
5729 if (len-- > 0) { 6203 if (decode_attr_length(xdr, &len, &p) < 0)
5730 bitmap[1] = ntohl(*p++); 6204 goto out_overflow;
5731 p += len; 6205
5732 } 6206 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
5733 } 6207 goto out_overflow;
5734 len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */ 6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
5735 if (len > 0) { 6209 entry->ino = entry->fattr->fileid;
5736 if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) { 6210
5737 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; 6211 if (verify_attr_len(xdr, p, len) < 0)
5738 /* Ignore the return value of rdattr_error for now */ 6212 goto out_overflow;
5739 p++; 6213
5740 len--; 6214 p = xdr_inline_peek(xdr, 8);
5741 } 6215 if (p != NULL)
5742 if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) 6216 entry->eof = !p[0] && p[1];
5743 xdr_decode_hyper(p, &entry->ino); 6217 else
5744 else if (bitmap[0] == FATTR4_WORD0_FILEID) 6218 entry->eof = 0;
5745 xdr_decode_hyper(p, &entry->ino);
5746 p += len;
5747 }
5748 6219
5749 entry->eof = !p[0] && p[1];
5750 return p; 6220 return p;
6221
6222out_overflow:
6223 print_overflow_msg(__func__, xdr);
6224 return ERR_PTR(-EIO);
5751} 6225}
5752 6226
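The rewritten dirent decoder above consumes one readdir entry at a time straight from the xdr_stream; the shape of each entry, as the calls above imply, is:

	/*
	 *   value_follows   4 bytes  (zero terminates the entry list)
	 *   cookie          8 bytes
	 *   name length     4 bytes
	 *   name            XDR-padded to a 4-byte boundary
	 *   attr bitmap     decode_attr_bitmap()
	 *   attr length     decode_attr_length()
	 *   attributes      decode_getfattr_attrs()
	 */

After the final entry, xdr_inline_peek(xdr, 8) inspects the trailing value_follows/EOF word pair without consuming it, so a truncated reply simply reports eof = 0.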
5753/* 6227/*
@@ -5866,6 +6340,7 @@ struct rpc_procinfo nfs4_procedures[] = {
5866 PROC(GETACL, enc_getacl, dec_getacl), 6340 PROC(GETACL, enc_getacl, dec_getacl),
5867 PROC(SETACL, enc_setacl, dec_setacl), 6341 PROC(SETACL, enc_setacl, dec_setacl),
5868 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6342 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
6343 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
5869#if defined(CONFIG_NFS_V4_1) 6344#if defined(CONFIG_NFS_V4_1)
5870 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6345 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
5871 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6346 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
@@ -5873,6 +6348,8 @@ struct rpc_procinfo nfs4_procedures[] = {
5873 PROC(SEQUENCE, enc_sequence, dec_sequence), 6348 PROC(SEQUENCE, enc_sequence, dec_sequence),
5874 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 6349 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
5875 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6350 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6351 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6352 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
5876#endif /* CONFIG_NFS_V4_1 */ 6353#endif /* CONFIG_NFS_V4_1 */
5877}; 6354};
5878 6355
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index df101d9f546..903908a2002 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -3,9 +3,10 @@
3 * 3 *
4 * Allow an NFS filesystem to be mounted as root. The way this works is: 4 * Allow an NFS filesystem to be mounted as root. The way this works is:
5 * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. 5 * (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
6 * (2) Handle RPC negotiation with the system which replied to RARP or 6 * (2) Construct the device string and the options string using DHCP
7 * was reported as a boot server by BOOTP or manually. 7 * option 17 and/or kernel command line options.
8 * (3) The actual mounting is done later, when init() is running. 8 * (3) When mount_root() sets up the root file system, pass these strings
9 * to the NFS client's regular mount interface via sys_mount().
9 * 10 *
10 * 11 *
11 * Changes: 12 * Changes:
@@ -65,470 +66,245 @@
65 * Hua Qin : Support for mounting root file system via 66 * Hua Qin : Support for mounting root file system via
66 * NFS over TCP. 67 * NFS over TCP.
67 * Fabian Frederick: Option parser rebuilt (using parser lib) 68 * Fabian Frederick: Option parser rebuilt (using parser lib)
68*/ 69 * Chuck Lever : Use super.c's text-based mount option parsing
70 * Chuck Lever : Add "nfsrootdebug".
71 */
69 72
70#include <linux/types.h> 73#include <linux/types.h>
71#include <linux/string.h> 74#include <linux/string.h>
72#include <linux/kernel.h>
73#include <linux/time.h>
74#include <linux/fs.h>
75#include <linux/init.h> 75#include <linux/init.h>
76#include <linux/sunrpc/clnt.h>
77#include <linux/sunrpc/xprtsock.h>
78#include <linux/nfs.h> 76#include <linux/nfs.h>
79#include <linux/nfs_fs.h> 77#include <linux/nfs_fs.h>
80#include <linux/nfs_mount.h>
81#include <linux/in.h>
82#include <linux/major.h>
83#include <linux/utsname.h> 78#include <linux/utsname.h>
84#include <linux/inet.h>
85#include <linux/root_dev.h> 79#include <linux/root_dev.h>
86#include <net/ipconfig.h> 80#include <net/ipconfig.h>
87#include <linux/parser.h>
88 81
89#include "internal.h" 82#include "internal.h"
90 83
91/* Define this to allow debugging output */
92#undef NFSROOT_DEBUG
93#define NFSDBG_FACILITY NFSDBG_ROOT 84#define NFSDBG_FACILITY NFSDBG_ROOT
94 85
95/* Default port to use if server is not running a portmapper */
96#define NFS_MNT_PORT 627
97
98/* Default path we try to mount. "%s" gets replaced by our IP address */ 86/* Default path we try to mount. "%s" gets replaced by our IP address */
99#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
100 88
101/* Parameters passed from the kernel command line */ 89/* Parameters passed from the kernel command line */
102static char nfs_root_name[256] __initdata = ""; 90static char nfs_root_parms[256] __initdata = "";
91
92/* Text-based mount options passed to super.c */
93static char nfs_root_options[256] __initdata = "";
103 94
104/* Address of NFS server */ 95/* Address of NFS server */
105static __be32 servaddr __initdata = 0; 96static __be32 servaddr __initdata = htonl(INADDR_NONE);
106 97
107/* Name of directory to mount */ 98/* Name of directory to mount */
108static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, }; 99static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
109
110/* NFS-related data */
111static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
112static int nfs_port __initdata = 0; /* Port to connect to for NFS */
113static int mount_port __initdata = 0; /* Mount daemon port number */
114
115
116/***************************************************************************
117
118 Parsing of options
119
120 ***************************************************************************/
121
122enum {
123 /* Options that take integer arguments */
124 Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
125 Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
126 /* Options that take no arguments */
127 Opt_soft, Opt_hard, Opt_intr,
128 Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac,
129 Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
130 Opt_acl, Opt_noacl,
131 /* Error token */
132 Opt_err
133};
134
135static const match_table_t tokens __initconst = {
136 {Opt_port, "port=%u"},
137 {Opt_rsize, "rsize=%u"},
138 {Opt_wsize, "wsize=%u"},
139 {Opt_timeo, "timeo=%u"},
140 {Opt_retrans, "retrans=%u"},
141 {Opt_acregmin, "acregmin=%u"},
142 {Opt_acregmax, "acregmax=%u"},
143 {Opt_acdirmin, "acdirmin=%u"},
144 {Opt_acdirmax, "acdirmax=%u"},
145 {Opt_soft, "soft"},
146 {Opt_hard, "hard"},
147 {Opt_intr, "intr"},
148 {Opt_nointr, "nointr"},
149 {Opt_posix, "posix"},
150 {Opt_noposix, "noposix"},
151 {Opt_cto, "cto"},
152 {Opt_nocto, "nocto"},
153 {Opt_ac, "ac"},
154 {Opt_noac, "noac"},
155 {Opt_lock, "lock"},
156 {Opt_nolock, "nolock"},
157 {Opt_v2, "nfsvers=2"},
158 {Opt_v2, "v2"},
159 {Opt_v3, "nfsvers=3"},
160 {Opt_v3, "v3"},
161 {Opt_udp, "proto=udp"},
162 {Opt_udp, "udp"},
163 {Opt_tcp, "proto=tcp"},
164 {Opt_tcp, "tcp"},
165 {Opt_acl, "acl"},
166 {Opt_noacl, "noacl"},
167 {Opt_err, NULL}
168
169};
170 100
101/* server:export path string passed to super.c */
102static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
103
104#ifdef RPC_DEBUG
171/* 105/*
172 * Parse option string. 106 * When the "nfsrootdebug" kernel command line option is specified,
107 * enable debugging messages for NFSROOT.
173 */ 108 */
174 109static int __init nfs_root_debug(char *__unused)
175static int __init root_nfs_parse(char *name, char *buf)
176{ 110{
177 111 nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
178 char *p;
179 substring_t args[MAX_OPT_ARGS];
180 int option;
181
182 if (!name)
183 return 1;
184
185 /* Set the NFS remote path */
186 p = strsep(&name, ",");
187 if (p[0] != '\0' && strcmp(p, "default") != 0)
188 strlcpy(buf, p, NFS_MAXPATHLEN);
189
190 while ((p = strsep (&name, ",")) != NULL) {
191 int token;
192 if (!*p)
193 continue;
194 token = match_token(p, tokens, args);
195
196 /* %u tokens only. Beware if you add new tokens! */
197 if (token < Opt_soft && match_int(&args[0], &option))
198 return 0;
199 switch (token) {
200 case Opt_port:
201 nfs_port = option;
202 break;
203 case Opt_rsize:
204 nfs_data.rsize = option;
205 break;
206 case Opt_wsize:
207 nfs_data.wsize = option;
208 break;
209 case Opt_timeo:
210 nfs_data.timeo = option;
211 break;
212 case Opt_retrans:
213 nfs_data.retrans = option;
214 break;
215 case Opt_acregmin:
216 nfs_data.acregmin = option;
217 break;
218 case Opt_acregmax:
219 nfs_data.acregmax = option;
220 break;
221 case Opt_acdirmin:
222 nfs_data.acdirmin = option;
223 break;
224 case Opt_acdirmax:
225 nfs_data.acdirmax = option;
226 break;
227 case Opt_soft:
228 nfs_data.flags |= NFS_MOUNT_SOFT;
229 break;
230 case Opt_hard:
231 nfs_data.flags &= ~NFS_MOUNT_SOFT;
232 break;
233 case Opt_intr:
234 case Opt_nointr:
235 break;
236 case Opt_posix:
237 nfs_data.flags |= NFS_MOUNT_POSIX;
238 break;
239 case Opt_noposix:
240 nfs_data.flags &= ~NFS_MOUNT_POSIX;
241 break;
242 case Opt_cto:
243 nfs_data.flags &= ~NFS_MOUNT_NOCTO;
244 break;
245 case Opt_nocto:
246 nfs_data.flags |= NFS_MOUNT_NOCTO;
247 break;
248 case Opt_ac:
249 nfs_data.flags &= ~NFS_MOUNT_NOAC;
250 break;
251 case Opt_noac:
252 nfs_data.flags |= NFS_MOUNT_NOAC;
253 break;
254 case Opt_lock:
255 nfs_data.flags &= ~NFS_MOUNT_NONLM;
256 break;
257 case Opt_nolock:
258 nfs_data.flags |= NFS_MOUNT_NONLM;
259 break;
260 case Opt_v2:
261 nfs_data.flags &= ~NFS_MOUNT_VER3;
262 break;
263 case Opt_v3:
264 nfs_data.flags |= NFS_MOUNT_VER3;
265 break;
266 case Opt_udp:
267 nfs_data.flags &= ~NFS_MOUNT_TCP;
268 break;
269 case Opt_tcp:
270 nfs_data.flags |= NFS_MOUNT_TCP;
271 break;
272 case Opt_acl:
273 nfs_data.flags &= ~NFS_MOUNT_NOACL;
274 break;
275 case Opt_noacl:
276 nfs_data.flags |= NFS_MOUNT_NOACL;
277 break;
278 default:
279 printk(KERN_WARNING "Root-NFS: unknown "
280 "option: %s\n", p);
281 return 0;
282 }
283 }
284
285 return 1; 112 return 1;
286} 113}
287 114
115__setup("nfsrootdebug", nfs_root_debug);
116#endif
117
288/* 118/*
289 * Prepare the NFS data structure and parse all options. 119 * Parse NFS server and directory information passed on the kernel
120 * command line.
121 *
122 * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
123 *
124 * If there is a "%s" token in the <root-dir> string, it is replaced
125 * by the ASCII-representation of the client's IP address.
290 */ 126 */
291static int __init root_nfs_name(char *name) 127static int __init nfs_root_setup(char *line)
292{ 128{
293 static char buf[NFS_MAXPATHLEN] __initdata; 129 ROOT_DEV = Root_NFS;
294 char *cp; 130
295 131 if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
296 /* Set some default values */ 132 strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
297 memset(&nfs_data, 0, sizeof(nfs_data)); 133 } else {
298 nfs_port = -1; 134 size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
299 nfs_data.version = NFS_MOUNT_VERSION; 135 if (n >= sizeof(nfs_root_parms))
300 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ 136 line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
301 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; 137 sprintf(nfs_root_parms, NFS_ROOT, line);
302 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
303 nfs_data.acregmin = NFS_DEF_ACREGMIN;
304 nfs_data.acregmax = NFS_DEF_ACREGMAX;
305 nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
306 nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
307 strcpy(buf, NFS_ROOT);
308
309 /* Process options received from the remote server */
310 root_nfs_parse(root_server_path, buf);
311
312 /* Override them by options set on kernel command-line */
313 root_nfs_parse(name, buf);
314
315 cp = utsname()->nodename;
316 if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
317 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
318 return -1;
319 } 138 }
320 sprintf(nfs_export_path, buf, cp); 139
140 /*
141 * Extract the IP address of the NFS server containing our
142 * root file system, if one was specified.
143 *
144 * Note: root_nfs_parse_addr() removes the server-ip from
145 * nfs_root_parms, if it exists.
146 */
147 root_server_addr = root_nfs_parse_addr(nfs_root_parms);
321 148
322 return 1; 149 return 1;
323} 150}
324 151
152__setup("nfsroot=", nfs_root_setup);
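For illustration (server address and export path are hypothetical), the handler above accepts command lines such as:

	nfsroot=10.0.0.1:/srv/rootfs,tcp,rsize=4096
	nfsroot=/srv/%s,nolock

The first form names the server explicitly; in the second, the address comes from DHCP/BOOTP and the "%s" is later replaced by the ASCII form of the client's IP address, as the comment above describes.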
325 153
326/* 154static int __init root_nfs_copy(char *dest, const char *src,
327 * Get NFS server address. 155 const size_t destlen)
328 */
329static int __init root_nfs_addr(void)
330{ 156{
331 if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) { 157 if (strlcpy(dest, src, destlen) > destlen)
332 printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n");
333 return -1; 158 return -1;
334 } 159 return 0;
160}
335 161
336 snprintf(nfs_data.hostname, sizeof(nfs_data.hostname), 162static int __init root_nfs_cat(char *dest, const char *src,
337 "%pI4", &servaddr); 163 const size_t destlen)
164{
165 if (strlcat(dest, src, destlen) > destlen)
166 return -1;
338 return 0; 167 return 0;
339} 168}
340 169
341/* 170/*
342 * Tell the user what's going on. 171 * Parse out root export path and mount options from
172 * passed-in string @incoming.
173 *
174 * Copy the export path into @exppath.
343 */ 175 */
344#ifdef NFSROOT_DEBUG 176static int __init root_nfs_parse_options(char *incoming, char *exppath,
345static void __init root_nfs_print(void) 177 const size_t exppathlen)
346{ 178{
347 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", 179 char *p;
348 nfs_export_path, nfs_data.hostname);
349 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
350 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
351 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
352 nfs_data.acregmin, nfs_data.acregmax,
353 nfs_data.acdirmin, nfs_data.acdirmax);
354 printk(KERN_NOTICE "Root-NFS: nfsd port = %d, mountd port = %d, flags = %08x\n",
355 nfs_port, mount_port, nfs_data.flags);
356}
357#endif
358
359 180
360static int __init root_nfs_init(void) 181 /*
361{ 182 * Set the NFS remote path
362#ifdef NFSROOT_DEBUG 183 */
363 nfs_debug |= NFSDBG_ROOT; 184 p = strsep(&incoming, ",");
364#endif 185 if (*p != '\0' && strcmp(p, "default") != 0)
186 if (root_nfs_copy(exppath, p, exppathlen))
187 return -1;
365 188
366 /* 189 /*
367 * Decode the root directory path name and NFS options from 190 * @incoming now points to the rest of the string; if it
368 * the kernel command line. This has to go here in order to 191 * contains something, append it to our root options buffer
369 * be able to use the client IP address for the remote root
370 * directory (necessary for pure RARP booting).
371 */ 192 */
372 if (root_nfs_name(nfs_root_name) < 0 || 193 if (incoming != NULL && *incoming != '\0')
373 root_nfs_addr() < 0) 194 if (root_nfs_cat(nfs_root_options, incoming,
374 return -1; 195 sizeof(nfs_root_options)))
196 return -1;
375 197
376#ifdef NFSROOT_DEBUG 198 /*
377 root_nfs_print(); 199 * Possibly prepare for more options to be appended
378#endif 200 */
201 if (nfs_root_options[0] != '\0' &&
 202 nfs_root_options[strlen(nfs_root_options) - 1] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
379 206
380 return 0; 207 return 0;
381} 208}
382 209
383
384/* 210/*
385 * Parse NFS server and directory information passed on the kernel 211 * Decode the export directory path name and NFS options from
386 * command line. 212 * the kernel command line. This has to be done late in order to
213 * use a dynamically acquired client IP address for the remote
214 * root directory path.
215 *
216 * Returns zero if successful; otherwise -1 is returned.
387 */ 217 */
388static int __init nfs_root_setup(char *line) 218static int __init root_nfs_data(char *cmdline)
389{ 219{
390 ROOT_DEV = Root_NFS; 220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
391 if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { 221 int len, retval = -1;
392 strlcpy(nfs_root_name, line, sizeof(nfs_root_name)); 222 char *tmp = NULL;
393 } else { 223 const size_t tmplen = sizeof(nfs_export_path);
394 int n = strlen(line) + sizeof(NFS_ROOT) - 1; 224
395 if (n >= sizeof(nfs_root_name)) 225 tmp = kzalloc(tmplen, GFP_KERNEL);
396 line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0'; 226 if (tmp == NULL)
397 sprintf(nfs_root_name, NFS_ROOT, line); 227 goto out_nomem;
228 strcpy(tmp, NFS_ROOT);
229
230 if (root_server_path[0] != '\0') {
231 dprintk("Root-NFS: DHCPv4 option 17: %s\n",
232 root_server_path);
233 if (root_nfs_parse_options(root_server_path, tmp, tmplen))
234 goto out_optionstoolong;
398 } 235 }
399 root_server_addr = root_nfs_parse_addr(nfs_root_name);
400 return 1;
401}
402
403__setup("nfsroot=", nfs_root_setup);
404
405/***************************************************************************
406 236
407 Routines to actually mount the root directory 237 if (cmdline[0] != '\0') {
238 dprintk("Root-NFS: nfsroot=%s\n", cmdline);
239 if (root_nfs_parse_options(cmdline, tmp, tmplen))
240 goto out_optionstoolong;
241 }
408 242
409 ***************************************************************************/ 243 /*
244 * Append mandatory options for nfsroot so they override
245 * what has come before
246 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
248 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option,
250 sizeof(nfs_root_options)))
251 goto out_optionstoolong;
410 252
411/* 253 /*
412 * Construct sockaddr_in from address and port number. 254 * Set up nfs_root_device. For NFS mounts, this looks like
413 */ 255 *
414static inline void 256 * server:/path
415set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) 257 *
416{ 258 * At this point, utsname()->nodename contains our local
417 sin->sin_family = AF_INET; 259 * IP address or hostname, set by ipconfig. If "%s" exists
418 sin->sin_addr.s_addr = addr; 260 * in tmp, substitute the nodename, then shovel the whole
419 sin->sin_port = port; 261 * mess into nfs_root_device.
420} 262 */
263 len = snprintf(nfs_export_path, sizeof(nfs_export_path),
264 tmp, utsname()->nodename);
265 if (len > (int)sizeof(nfs_export_path))
266 goto out_devnametoolong;
267 len = snprintf(nfs_root_device, sizeof(nfs_root_device),
268 "%pI4:%s", &servaddr, nfs_export_path);
269 if (len > (int)sizeof(nfs_root_device))
270 goto out_devnametoolong;
421 271
422/* 272 retval = 0;
423 * Query server portmapper for the port of a daemon program.
424 */
425static int __init root_nfs_getport(int program, int version, int proto)
426{
427 struct sockaddr_in sin;
428 273
429 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n", 274out:
430 program, version, &servaddr); 275 kfree(tmp);
431 set_sockaddr(&sin, servaddr, 0); 276 return retval;
432 return rpcb_getport_sync(&sin, program, version, proto); 277out_nomem:
278 printk(KERN_ERR "Root-NFS: could not allocate memory\n");
279 goto out;
280out_optionstoolong:
281 printk(KERN_ERR "Root-NFS: mount options string too long\n");
282 goto out;
283out_devnametoolong:
284 printk(KERN_ERR "Root-NFS: root device name too long.\n");
285 goto out;
433} 286}
434 287
435 288/**
436/* 289 * nfs_root_data - Return prepared 'data' for NFSROOT mount
437 * Use portmapper to find mountd and nfsd port numbers if not overriden 290 * @root_device: OUT: address of string containing NFSROOT device
438 * by the user. Use defaults if portmapper is not available. 291 * @root_data: OUT: address of string containing NFSROOT mount options
439 * XXX: Is there any nfs server with no portmapper? 292 *
293 * Returns zero and sets @root_device and @root_data if successful,
294 * otherwise -1 is returned.
440 */ 295 */
441static int __init root_nfs_ports(void) 296int __init nfs_root_data(char **root_device, char **root_data)
442{ 297{
443 int port; 298 servaddr = root_server_addr;
444 int nfsd_ver, mountd_ver; 299 if (servaddr == htonl(INADDR_NONE)) {
445 int nfsd_port, mountd_port; 300 printk(KERN_ERR "Root-NFS: no NFS server address\n");
446 int proto; 301 return -1;
447
448 if (nfs_data.flags & NFS_MOUNT_VER3) {
449 nfsd_ver = NFS3_VERSION;
450 mountd_ver = NFS_MNT3_VERSION;
451 nfsd_port = NFS_PORT;
452 mountd_port = NFS_MNT_PORT;
453 } else {
454 nfsd_ver = NFS2_VERSION;
455 mountd_ver = NFS_MNT_VERSION;
456 nfsd_port = NFS_PORT;
457 mountd_port = NFS_MNT_PORT;
458 }
459
460 proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
461
462 if (nfs_port < 0) {
463 if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) {
464 printk(KERN_ERR "Root-NFS: Unable to get nfsd port "
465 "number from server, using default\n");
466 port = nfsd_port;
467 }
468 nfs_port = port;
469 dprintk("Root-NFS: Portmapper on server returned %d "
470 "as nfsd port\n", port);
471 } 302 }
472 303
473 if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { 304 if (root_nfs_data(nfs_root_parms) < 0)
474 printk(KERN_ERR "Root-NFS: Unable to get mountd port " 305 return -1;
475 "number from server, using default\n");
476 port = mountd_port;
477 }
478 mount_port = port;
479 dprintk("Root-NFS: mountd port is %d\n", port);
480 306
307 *root_device = nfs_root_device;
308 *root_data = nfs_root_options;
481 return 0; 309 return 0;
482} 310}
483
484
485/*
486 * Get a file handle from the server for the directory which is to be
487 * mounted.
488 */
489static int __init root_nfs_get_handle(void)
490{
491 struct sockaddr_in sin;
492 unsigned int auth_flav_len = 0;
493 struct nfs_mount_request request = {
494 .sap = (struct sockaddr *)&sin,
495 .salen = sizeof(sin),
496 .dirpath = nfs_export_path,
497 .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
498 NFS_MNT3_VERSION : NFS_MNT_VERSION,
499 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
500 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
501 .auth_flav_len = &auth_flav_len,
502 };
503 int status = -ENOMEM;
504
505 request.fh = nfs_alloc_fhandle();
506 if (!request.fh)
507 goto out;
508 set_sockaddr(&sin, servaddr, htons(mount_port));
509 status = nfs_mount(&request);
510 if (status < 0)
511 printk(KERN_ERR "Root-NFS: Server returned error %d "
512 "while mounting %s\n", status, nfs_export_path);
513 else {
514 nfs_data.root.size = request.fh->size;
515 memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
516 }
517 nfs_free_fhandle(request.fh);
518out:
519 return status;
520}
521
522/*
523 * Get the NFS port numbers and file handle, and return the prepared 'data'
524 * argument for mount() if everything went OK. Return NULL otherwise.
525 */
526void * __init nfs_root_data(void)
527{
528 if (root_nfs_init() < 0
529 || root_nfs_ports() < 0
530 || root_nfs_get_handle() < 0)
531 return NULL;
532 set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
533 return (void*)&nfs_data;
534}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a3654e57b58..137b549e63d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -65,6 +65,13 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
65 if (req == NULL) 65 if (req == NULL)
66 return ERR_PTR(-ENOMEM); 66 return ERR_PTR(-ENOMEM);
67 67
68 /* get lock context early so we can deal with alloc failures */
69 req->wb_lock_context = nfs_get_lock_context(ctx);
70 if (req->wb_lock_context == NULL) {
71 nfs_page_free(req);
72 return ERR_PTR(-ENOMEM);
73 }
74
68 /* Initialize the request struct. Initially, we assume a 75 /* Initialize the request struct. Initially, we assume a
69 * long write-back delay. This will be adjusted in 76 * long write-back delay. This will be adjusted in
70 * update_nfs_request below if the region is not locked. */ 77 * update_nfs_request below if the region is not locked. */
@@ -141,11 +148,16 @@ void nfs_clear_request(struct nfs_page *req)
141{ 148{
142 struct page *page = req->wb_page; 149 struct page *page = req->wb_page;
143 struct nfs_open_context *ctx = req->wb_context; 150 struct nfs_open_context *ctx = req->wb_context;
151 struct nfs_lock_context *l_ctx = req->wb_lock_context;
144 152
145 if (page != NULL) { 153 if (page != NULL) {
146 page_cache_release(page); 154 page_cache_release(page);
147 req->wb_page = NULL; 155 req->wb_page = NULL;
148 } 156 }
157 if (l_ctx != NULL) {
158 nfs_put_lock_context(l_ctx);
159 req->wb_lock_context = NULL;
160 }
149 if (ctx != NULL) { 161 if (ctx != NULL) {
150 put_nfs_open_context(ctx); 162 put_nfs_open_context(ctx);
151 req->wb_context = NULL; 163 req->wb_context = NULL;
@@ -235,7 +247,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
235{ 247{
236 if (req->wb_context->cred != prev->wb_context->cred) 248 if (req->wb_context->cred != prev->wb_context->cred)
237 return 0; 249 return 0;
238 if (req->wb_context->lockowner != prev->wb_context->lockowner) 250 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
239 return 0; 251 return 0;
240 if (req->wb_context->state != prev->wb_context->state) 252 if (req->wb_context->state != prev->wb_context->state)
241 return 0; 253 return 0;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 00000000000..db773428f95
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,783 @@
1/*
2 * pNFS functions to call and manage layout drivers.
3 *
4 * Copyright (c) 2002 [year of first publication]
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#include <linux/nfs_fs.h>
31#include "internal.h"
32#include "pnfs.h"
33
34#define NFSDBG_FACILITY NFSDBG_PNFS
35
36/* Locking:
37 *
38 * pnfs_spinlock:
39 * protects pnfs_modules_tbl.
40 */
41static DEFINE_SPINLOCK(pnfs_spinlock);
42
43/*
44 * pnfs_modules_tbl holds all pnfs modules
45 */
46static LIST_HEAD(pnfs_modules_tbl);
47
48/* Return the registered pnfs layout driver module matching given id */
49static struct pnfs_layoutdriver_type *
50find_pnfs_driver_locked(u32 id)
51{
52 struct pnfs_layoutdriver_type *local;
53
54 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
55 if (local->id == id)
56 goto out;
57 local = NULL;
58out:
59 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
60 return local;
61}
62
63static struct pnfs_layoutdriver_type *
64find_pnfs_driver(u32 id)
65{
66 struct pnfs_layoutdriver_type *local;
67
68 spin_lock(&pnfs_spinlock);
69 local = find_pnfs_driver_locked(id);
70 spin_unlock(&pnfs_spinlock);
71 return local;
72}
73
74void
75unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{
77 if (nfss->pnfs_curr_ld) {
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL;
82}
83
84/*
85 * Try to set the server's pnfs module to the pnfs layout type specified by id.
86 * Currently only one pNFS layout driver per filesystem is supported.
87 *
88 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
89 */
90void
91set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
92{
93 struct pnfs_layoutdriver_type *ld_type = NULL;
94
95 if (id == 0)
96 goto out_no_driver;
97 if (!(server->nfs_client->cl_exchange_flags &
98 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
99 printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
100 id, server->nfs_client->cl_exchange_flags);
101 goto out_no_driver;
102 }
103 ld_type = find_pnfs_driver(id);
104 if (!ld_type) {
105 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
106 ld_type = find_pnfs_driver(id);
107 if (!ld_type) {
108 dprintk("%s: No pNFS module found for %u.\n",
109 __func__, id);
110 goto out_no_driver;
111 }
112 }
113 if (!try_module_get(ld_type->owner)) {
114 dprintk("%s: Could not grab reference on module\n", __func__);
115 goto out_no_driver;
116 }
117 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) {
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return;
127
128out_no_driver:
129 dprintk("%s: Using NFSv4 I/O\n", __func__);
130 server->pnfs_curr_ld = NULL;
131}
132
133int
134pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
135{
136 int status = -EINVAL;
137 struct pnfs_layoutdriver_type *tmp;
138
139 if (ld_type->id == 0) {
140 printk(KERN_ERR "%s id 0 is reserved\n", __func__);
141 return status;
142 }
143 if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
144 printk(KERN_ERR "%s Layout driver must provide "
145 "alloc_lseg and free_lseg.\n", __func__);
146 return status;
147 }
148
149 spin_lock(&pnfs_spinlock);
150 tmp = find_pnfs_driver_locked(ld_type->id);
151 if (!tmp) {
152 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
153 status = 0;
154 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
155 ld_type->name);
156 } else {
157 printk(KERN_ERR "%s Module with id %d already loaded!\n",
158 __func__, ld_type->id);
159 }
160 spin_unlock(&pnfs_spinlock);
161
162 return status;
163}
164EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
165
166void
167pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
168{
169 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
170 spin_lock(&pnfs_spinlock);
171 list_del(&ld_type->pnfs_tblid);
172 spin_unlock(&pnfs_spinlock);
173}
174EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
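A minimal sketch of a layout driver registering itself, using only the pnfs_layoutdriver_type fields referenced in this file; the id value and all example_* callbacks are illustrative placeholders:

static struct pnfs_layoutdriver_type example_layoutdriver = {
	.id			= LAYOUT_NFSV4_1_FILES,	/* example id */
	.name			= "LAYOUT_EXAMPLE",
	.owner			= THIS_MODULE,
	.set_layoutdriver	= example_set_layoutdriver,
	.clear_layoutdriver	= example_clear_layoutdriver,
	.alloc_lseg		= example_alloc_lseg,	/* mandatory */
	.free_lseg		= example_free_lseg,	/* mandatory */
};

static int __init example_init(void)
{
	return pnfs_register_layoutdriver(&example_layoutdriver);
}

static void __exit example_exit(void)
{
	pnfs_unregister_layoutdriver(&example_layoutdriver);
}

If another driver already registered the same id, pnfs_register_layoutdriver() returns -EINVAL and the module load should fail.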
175
176/*
177 * pNFS client layout cache
178 */
179
180static void
181get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
182{
183 assert_spin_locked(&lo->inode->i_lock);
184 lo->refcount++;
185}
186
187static void
188put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
189{
190 assert_spin_locked(&lo->inode->i_lock);
191 BUG_ON(lo->refcount == 0);
192
193 lo->refcount--;
194 if (!lo->refcount) {
195 dprintk("%s: freeing layout cache %p\n", __func__, lo);
196 BUG_ON(!list_empty(&lo->layouts));
197 NFS_I(lo->inode)->layout = NULL;
198 kfree(lo);
199 }
200}
201
202void
203put_layout_hdr(struct inode *inode)
204{
205 spin_lock(&inode->i_lock);
206 put_layout_hdr_locked(NFS_I(inode)->layout);
207 spin_unlock(&inode->i_lock);
208}
209
210static void
211init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
212{
213 INIT_LIST_HEAD(&lseg->fi_list);
214 kref_init(&lseg->kref);
215 lseg->layout = lo;
216}
217
218/* Called without i_lock held, as the free_lseg call may sleep */
219static void
220destroy_lseg(struct kref *kref)
221{
222 struct pnfs_layout_segment *lseg =
223 container_of(kref, struct pnfs_layout_segment, kref);
224 struct inode *ino = lseg->layout->inode;
225
226 dprintk("--> %s\n", __func__);
227 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
228 /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
229 put_layout_hdr(ino);
230}
231
232static void
233put_lseg(struct pnfs_layout_segment *lseg)
234{
235 if (!lseg)
236 return;
237
238 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
239 atomic_read(&lseg->kref.refcount));
240 kref_put(&lseg->kref, destroy_lseg);
241}
242
243static void
244pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
245{
246 struct pnfs_layout_segment *lseg, *next;
247 struct nfs_client *clp;
248
249 dprintk("%s:Begin lo %p\n", __func__, lo);
250
251 assert_spin_locked(&lo->inode->i_lock);
252 list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
253 dprintk("%s: freeing lseg %p\n", __func__, lseg);
254 list_move(&lseg->fi_list, tmp_list);
255 }
256 clp = NFS_SERVER(lo->inode)->nfs_client;
257 spin_lock(&clp->cl_lock);
258 /* List does not take a reference, so no need for put here */
259 list_del_init(&lo->layouts);
260 spin_unlock(&clp->cl_lock);
261 write_seqlock(&lo->seqlock);
262 clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
263 write_sequnlock(&lo->seqlock);
264
265 dprintk("%s:Return\n", __func__);
266}
267
268static void
269pnfs_free_lseg_list(struct list_head *tmp_list)
270{
271 struct pnfs_layout_segment *lseg;
272
273 while (!list_empty(tmp_list)) {
274 lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
275 fi_list);
276 dprintk("%s calling put_lseg on %p\n", __func__, lseg);
277 list_del(&lseg->fi_list);
278 put_lseg(lseg);
279 }
280}
281
282void
283pnfs_destroy_layout(struct nfs_inode *nfsi)
284{
285 struct pnfs_layout_hdr *lo;
286 LIST_HEAD(tmp_list);
287
288 spin_lock(&nfsi->vfs_inode.i_lock);
289 lo = nfsi->layout;
290 if (lo) {
291 pnfs_clear_lseg_list(lo, &tmp_list);
292 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
293 put_layout_hdr_locked(lo);
294 }
295 spin_unlock(&nfsi->vfs_inode.i_lock);
296 pnfs_free_lseg_list(&tmp_list);
297}
298
299/*
 300 * Called by the state manager to remove all layouts established under an
301 * expired lease.
302 */
303void
304pnfs_destroy_all_layouts(struct nfs_client *clp)
305{
306 struct pnfs_layout_hdr *lo;
307 LIST_HEAD(tmp_list);
308
309 spin_lock(&clp->cl_lock);
310 list_splice_init(&clp->cl_layouts, &tmp_list);
311 spin_unlock(&clp->cl_lock);
312
313 while (!list_empty(&tmp_list)) {
314 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
315 layouts);
316 dprintk("%s freeing layout for inode %lu\n", __func__,
317 lo->inode->i_ino);
318 pnfs_destroy_layout(NFS_I(lo->inode));
319 }
320}
321
322/* update lo->stateid with new if is more recent
323 *
 324 * lo->stateid could be the open stateid, in which case we just use what is given.
325 */
326static void
327pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
328 const nfs4_stateid *new)
329{
330 nfs4_stateid *old = &lo->stateid;
331 bool overwrite = false;
332
333 write_seqlock(&lo->seqlock);
334 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
335 memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
336 overwrite = true;
337 else {
338 u32 oldseq, newseq;
339
340 oldseq = be32_to_cpu(old->stateid.seqid);
341 newseq = be32_to_cpu(new->stateid.seqid);
342 if ((int)(newseq - oldseq) > 0)
343 overwrite = true;
344 }
345 if (overwrite)
346 memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
347 write_sequnlock(&lo->seqlock);
348}
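The signed comparison tolerates seqid wraparound: for example, with oldseq = 0xffffffff and newseq = 0, (int)(newseq - oldseq) evaluates to 1, which is > 0, so the wrapped stateid is still recognized as more recent.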
349
350static void
351pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
352 struct nfs4_state *state)
353{
354 int seq;
355
356 dprintk("--> %s\n", __func__);
357 write_seqlock(&lo->seqlock);
358 do {
359 seq = read_seqbegin(&state->seqlock);
360 memcpy(lo->stateid.data, state->stateid.data,
361 sizeof(state->stateid.data));
362 } while (read_seqretry(&state->seqlock, seq));
363 set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
364 write_sequnlock(&lo->seqlock);
365 dprintk("<-- %s\n", __func__);
366}
367
368void
369pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
370 struct nfs4_state *open_state)
371{
372 int seq;
373
374 dprintk("--> %s\n", __func__);
375 do {
376 seq = read_seqbegin(&lo->seqlock);
377 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
378 /* This will trigger retry of the read */
379 pnfs_layout_from_open_stateid(lo, open_state);
380 } else
381 memcpy(dst->data, lo->stateid.data,
382 sizeof(lo->stateid.data));
383 } while (read_seqretry(&lo->seqlock, seq));
384 dprintk("<-- %s\n", __func__);
385}
386
387/*
 388 * Get layout from server.
 389 * For now, assume that whole-file layouts are requested.
 390 * arg->offset: 0
 391 * arg->length: all ones
 392 */
393static struct pnfs_layout_segment *
394send_layoutget(struct pnfs_layout_hdr *lo,
395 struct nfs_open_context *ctx,
396 u32 iomode)
397{
398 struct inode *ino = lo->inode;
399 struct nfs_server *server = NFS_SERVER(ino);
400 struct nfs4_layoutget *lgp;
401 struct pnfs_layout_segment *lseg = NULL;
402
403 dprintk("--> %s\n", __func__);
404
405 BUG_ON(ctx == NULL);
406 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
407 if (lgp == NULL) {
408 put_layout_hdr(lo->inode);
409 return NULL;
410 }
411 lgp->args.minlength = NFS4_MAX_UINT64;
412 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
413 lgp->args.range.iomode = iomode;
414 lgp->args.range.offset = 0;
415 lgp->args.range.length = NFS4_MAX_UINT64;
416 lgp->args.type = server->pnfs_curr_ld->id;
417 lgp->args.inode = ino;
418 lgp->args.ctx = get_nfs_open_context(ctx);
419 lgp->lsegpp = &lseg;
420
421 /* Synchronously retrieve layout information from server and
422 * store in lseg.
423 */
424 nfs4_proc_layoutget(lgp);
425 if (!lseg) {
426 /* remember that LAYOUTGET failed and suspend trying */
427 set_bit(lo_fail_bit(iomode), &lo->state);
428 }
429 return lseg;
430}
431
432/*
433 * Compare two layout segments for sorting into layout cache.
434 * We want to preferentially return RW over RO layouts, so ensure those
435 * are seen first.
436 */
437static s64
438cmp_layout(u32 iomode1, u32 iomode2)
439{
440 /* read > read/write */
441 return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
442}
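For example, when the list element is RW and the candidate is READ, cmp_layout(IOMODE_RW, IOMODE_READ) returns 1 - 0 = 1 > 0, so pnfs_insert_layout() keeps scanning and RW segments stay ahead of READ segments in the cache.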
443
444static void
445pnfs_insert_layout(struct pnfs_layout_hdr *lo,
446 struct pnfs_layout_segment *lseg)
447{
448 struct pnfs_layout_segment *lp;
449 int found = 0;
450
451 dprintk("%s:Begin\n", __func__);
452
453 assert_spin_locked(&lo->inode->i_lock);
454 if (list_empty(&lo->segs)) {
455 struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
456
457 spin_lock(&clp->cl_lock);
458 BUG_ON(!list_empty(&lo->layouts));
459 list_add_tail(&lo->layouts, &clp->cl_layouts);
460 spin_unlock(&clp->cl_lock);
461 }
462 list_for_each_entry(lp, &lo->segs, fi_list) {
463 if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
464 continue;
465 list_add_tail(&lseg->fi_list, &lp->fi_list);
466 dprintk("%s: inserted lseg %p "
467 "iomode %d offset %llu length %llu before "
468 "lp %p iomode %d offset %llu length %llu\n",
469 __func__, lseg, lseg->range.iomode,
470 lseg->range.offset, lseg->range.length,
471 lp, lp->range.iomode, lp->range.offset,
472 lp->range.length);
473 found = 1;
474 break;
475 }
476 if (!found) {
477 list_add_tail(&lseg->fi_list, &lo->segs);
478 dprintk("%s: inserted lseg %p "
479 "iomode %d offset %llu length %llu at tail\n",
480 __func__, lseg, lseg->range.iomode,
481 lseg->range.offset, lseg->range.length);
482 }
483 get_layout_hdr_locked(lo);
484
485 dprintk("%s:Return\n", __func__);
486}
487
488static struct pnfs_layout_hdr *
489alloc_init_layout_hdr(struct inode *ino)
490{
491 struct pnfs_layout_hdr *lo;
492
493 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
494 if (!lo)
495 return NULL;
496 lo->refcount = 1;
497 INIT_LIST_HEAD(&lo->layouts);
498 INIT_LIST_HEAD(&lo->segs);
499 seqlock_init(&lo->seqlock);
500 lo->inode = ino;
501 return lo;
502}
503
504static struct pnfs_layout_hdr *
505pnfs_find_alloc_layout(struct inode *ino)
506{
507 struct nfs_inode *nfsi = NFS_I(ino);
508 struct pnfs_layout_hdr *new = NULL;
509
510 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
511
512 assert_spin_locked(&ino->i_lock);
513 if (nfsi->layout)
514 return nfsi->layout;
515
516 spin_unlock(&ino->i_lock);
517 new = alloc_init_layout_hdr(ino);
518 spin_lock(&ino->i_lock);
519
520 if (likely(nfsi->layout == NULL)) /* Won the race? */
521 nfsi->layout = new;
522 else
523 kfree(new);
524 return nfsi->layout;
525}
526
527/*
528 * iomode matching rules:
529 * iomode lseg match
530 * ----- ----- -----
531 * ANY READ true
532 * ANY RW true
533 * RW READ false
534 * RW RW true
535 * READ READ true
536 * READ RW true
537 */
538static int
539is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
540{
541 return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
542}
543
544/*
545 * lookup range in layout
546 */
547static struct pnfs_layout_segment *
548pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
549{
550 struct pnfs_layout_segment *lseg, *ret = NULL;
551
552 dprintk("%s:Begin\n", __func__);
553
554 assert_spin_locked(&lo->inode->i_lock);
555 list_for_each_entry(lseg, &lo->segs, fi_list) {
556 if (is_matching_lseg(lseg, iomode)) {
557 ret = lseg;
558 break;
559 }
560 if (cmp_layout(iomode, lseg->range.iomode) > 0)
561 break;
562 }
563
564 dprintk("%s:Return lseg %p ref %d\n",
565 __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
566 return ret;
567}
568
569/*
570 * Layout segment is retrieved from the server if not cached.
571 * The appropriate layout segment is referenced and returned to the caller.
572 */
573struct pnfs_layout_segment *
574pnfs_update_layout(struct inode *ino,
575 struct nfs_open_context *ctx,
576 enum pnfs_iomode iomode)
577{
578 struct nfs_inode *nfsi = NFS_I(ino);
579 struct pnfs_layout_hdr *lo;
580 struct pnfs_layout_segment *lseg = NULL;
581
582 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
583 return NULL;
584 spin_lock(&ino->i_lock);
585 lo = pnfs_find_alloc_layout(ino);
586 if (lo == NULL) {
587 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
588 goto out_unlock;
589 }
590
591 /* Check to see if the layout for the given range already exists */
592 lseg = pnfs_has_layout(lo, iomode);
593 if (lseg) {
594 dprintk("%s: Using cached lseg %p for iomode %d)\n",
595 __func__, lseg, iomode);
596 goto out_unlock;
597 }
598
599 /* if LAYOUTGET already failed once we don't try again */
600 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
601 goto out_unlock;
602
603 get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
604 spin_unlock(&ino->i_lock);
605
606 lseg = send_layoutget(lo, ctx, iomode);
607out:
608 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
609 nfsi->layout->state, lseg);
610 return lseg;
611out_unlock:
612 spin_unlock(&ino->i_lock);
613 goto out;
614}
615
616int
617pnfs_layout_process(struct nfs4_layoutget *lgp)
618{
619 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
620 struct nfs4_layoutget_res *res = &lgp->res;
621 struct pnfs_layout_segment *lseg;
622 struct inode *ino = lo->inode;
623 int status = 0;
624
625 /* Inject layout blob into I/O device driver */
626 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
627 if (!lseg || IS_ERR(lseg)) {
628 if (!lseg)
629 status = -ENOMEM;
630 else
631 status = PTR_ERR(lseg);
632 dprintk("%s: Could not allocate layout: error %d\n",
633 __func__, status);
634 goto out;
635 }
636
637 spin_lock(&ino->i_lock);
638 init_lseg(lo, lseg);
639 lseg->range = res->range;
640 *lgp->lsegpp = lseg;
641 pnfs_insert_layout(lo, lseg);
642
643 /* Done processing layoutget. Set the layout stateid */
644 pnfs_set_layout_stateid(lo, &res->stateid);
645 spin_unlock(&ino->i_lock);
646out:
647 return status;
648}
649
650/*
651 * Device ID cache. Currently supports one layout type per struct nfs_client.
652 * Add layout type to the lookup key to expand to support multiple types.
653 */
654int
655pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
656 void (*free_callback)(struct pnfs_deviceid_node *))
657{
658 struct pnfs_deviceid_cache *c;
659
660 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
661 if (!c)
662 return -ENOMEM;
663 spin_lock(&clp->cl_lock);
664 if (clp->cl_devid_cache != NULL) {
665 atomic_inc(&clp->cl_devid_cache->dc_ref);
666 dprintk("%s [kref [%d]]\n", __func__,
667 atomic_read(&clp->cl_devid_cache->dc_ref));
668 kfree(c);
669 } else {
670 /* kzalloc initializes hlists */
671 spin_lock_init(&c->dc_lock);
672 atomic_set(&c->dc_ref, 1);
673 c->dc_free_callback = free_callback;
674 clp->cl_devid_cache = c;
675 dprintk("%s [new]\n", __func__);
676 }
677 spin_unlock(&clp->cl_lock);
678 return 0;
679}
680EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
681
682/*
683 * Called from pnfs_layoutdriver_type->free_lseg
684 * last layout segment reference frees deviceid
685 */
686void
687pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
688 struct pnfs_deviceid_node *devid)
689{
690 struct nfs4_deviceid *id = &devid->de_id;
691 struct pnfs_deviceid_node *d;
692 struct hlist_node *n;
693 long h = nfs4_deviceid_hash(id);
694
695 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
696 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
697 return;
698
699 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
700 if (!memcmp(&d->de_id, id, sizeof(*id))) {
701 hlist_del_rcu(&d->de_node);
702 spin_unlock(&c->dc_lock);
703 synchronize_rcu();
704 c->dc_free_callback(devid);
705 return;
706 }
707 spin_unlock(&c->dc_lock);
708 /* Why wasn't it found in the list? */
709 BUG();
710}
711EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
712
713/* Find and reference a deviceid */
714struct pnfs_deviceid_node *
715pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
716{
717 struct pnfs_deviceid_node *d;
718 struct hlist_node *n;
719 long hash = nfs4_deviceid_hash(id);
720
721 dprintk("--> %s hash %ld\n", __func__, hash);
722 rcu_read_lock();
723 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
724 if (!memcmp(&d->de_id, id, sizeof(*id))) {
725 if (!atomic_inc_not_zero(&d->de_ref)) {
726 goto fail;
727 } else {
728 rcu_read_unlock();
729 return d;
730 }
731 }
732 }
733fail:
734 rcu_read_unlock();
735 return NULL;
736}
737EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
738
739/*
740 * Add a deviceid to the cache.
741 * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
742 */
743struct pnfs_deviceid_node *
744pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
745{
746 struct pnfs_deviceid_node *d;
747 long hash = nfs4_deviceid_hash(&new->de_id);
748
749 dprintk("--> %s hash %ld\n", __func__, hash);
750 spin_lock(&c->dc_lock);
751 d = pnfs_find_get_deviceid(c, &new->de_id);
752 if (d) {
753 spin_unlock(&c->dc_lock);
754 dprintk("%s [discard]\n", __func__);
755 c->dc_free_callback(new);
756 return d;
757 }
758 INIT_HLIST_NODE(&new->de_node);
759 atomic_set(&new->de_ref, 1);
760 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
761 spin_unlock(&c->dc_lock);
762 dprintk("%s [new]\n", __func__);
763 return new;
764}
765EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
766
767void
768pnfs_put_deviceid_cache(struct nfs_client *clp)
769{
770 struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
771
772 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
773 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
774 int i;
775 /* Verify cache is empty */
776 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
777 BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
778 clp->cl_devid_cache = NULL;
779 spin_unlock(&clp->cl_lock);
780 kfree(local);
781 }
782}
783EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
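
The device ID cache above is exported for use by layout drivers. As a quick orientation, here is a minimal sketch of the expected call pattern; everything prefixed myld_ (including the wrapper struct) is invented for illustration, and only the pnfs_* calls and struct pnfs_deviceid_node come from this patch:

struct myld_deviceid {
	struct pnfs_deviceid_node node;	/* embedded cache node */
	/* driver-private device description would follow */
};

static void myld_free_deviceid(struct pnfs_deviceid_node *d)
{
	kfree(container_of(d, struct myld_deviceid, node));
}

static int myld_cache_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
{
	struct myld_deviceid *new;
	struct pnfs_deviceid_node *d;
	int err;

	/* takes a reference on an existing cache, or installs a fresh one */
	err = pnfs_alloc_init_deviceid_cache(clp, myld_free_deviceid);
	if (err)
		return err;

	d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
	if (d != NULL)
		return 0;	/* already cached; reference taken */

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (new == NULL)
		return -ENOMEM;
	new->node.de_id = *id;
	/* on a lost GETDEVICEINFO race this frees "new" and returns the winner */
	pnfs_add_deviceid(clp->cl_devid_cache, &new->node);
	return 0;
}

The matching teardown would call pnfs_put_deviceid() from the driver's free_lseg path, and pnfs_put_deviceid_cache() when the client goes away, mirroring the comments above.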
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 00000000000..e12367d5048
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,189 @@
1/*
2 * pNFS client data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H
32
33struct pnfs_layout_segment {
34 struct list_head fi_list;
35 struct pnfs_layout_range range;
36 struct kref kref;
37 struct pnfs_layout_hdr *layout;
38};
39
40#ifdef CONFIG_NFS_V4_1
41
42#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
43
44enum {
45	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed; stop trying */
46	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed; stop trying */
47 NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
48};
49
50/* Per-layout driver specific registration structure */
51struct pnfs_layoutdriver_type {
52 struct list_head pnfs_tblid;
53 const u32 id;
54 const char *name;
55 struct module *owner;
56 int (*set_layoutdriver) (struct nfs_server *);
57 int (*clear_layoutdriver) (struct nfs_server *);
58 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
59 void (*free_lseg) (struct pnfs_layout_segment *lseg);
60};
61
62struct pnfs_layout_hdr {
63 unsigned long refcount;
64 struct list_head layouts; /* other client layouts */
65 struct list_head segs; /* layout segments list */
66 seqlock_t seqlock; /* Protects the stateid */
67 nfs4_stateid stateid;
68 unsigned long state;
69 struct inode *inode;
70};
71
72struct pnfs_device {
73 struct nfs4_deviceid dev_id;
74 unsigned int layout_type;
75 unsigned int mincount;
76 struct page **pages;
77 void *area;
78 unsigned int pgbase;
79 unsigned int pglen;
80};
81
82/*
83 * Device ID RCU cache. A device ID is unique per client ID and layout type.
84 */
85#define NFS4_DEVICE_ID_HASH_BITS 5
86#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
87#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
88
89static inline u32
90nfs4_deviceid_hash(struct nfs4_deviceid *id)
91{
92 unsigned char *cptr = (unsigned char *)id->data;
93 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
94 u32 x = 0;
95
96 while (nbytes--) {
97 x *= 37;
98 x += *cptr++;
99 }
100 return x & NFS4_DEVICE_ID_HASH_MASK;
101}
102
103struct pnfs_deviceid_node {
104 struct hlist_node de_node;
105 struct nfs4_deviceid de_id;
106 atomic_t de_ref;
107};
108
109struct pnfs_deviceid_cache {
110 spinlock_t dc_lock;
111 atomic_t dc_ref;
112 void (*dc_free_callback)(struct pnfs_deviceid_node *);
113 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
114};
115
116extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
117 void (*free_callback)(struct pnfs_deviceid_node *));
118extern void pnfs_put_deviceid_cache(struct nfs_client *);
119extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
120 struct pnfs_deviceid_cache *,
121 struct nfs4_deviceid *);
122extern struct pnfs_deviceid_node *pnfs_add_deviceid(
123 struct pnfs_deviceid_cache *,
124 struct pnfs_deviceid_node *);
125extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
126 struct pnfs_deviceid_node *devid);
127
128extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
129extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
130
131/* nfs4proc.c */
132extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
133 struct pnfs_device *dev);
134extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
135
136/* pnfs.c */
137struct pnfs_layout_segment *
138pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
139 enum pnfs_iomode access_type);
140void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
141void unset_pnfs_layoutdriver(struct nfs_server *);
142int pnfs_layout_process(struct nfs4_layoutget *lgp);
143void pnfs_destroy_layout(struct nfs_inode *);
144void pnfs_destroy_all_layouts(struct nfs_client *);
145void put_layout_hdr(struct inode *inode);
146void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
147 struct nfs4_state *open_state);
148
149
150static inline int lo_fail_bit(u32 iomode)
151{
152 return iomode == IOMODE_RW ?
153 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
154}
155
156/* Return true if a layout driver is being used for this mountpoint */
157static inline int pnfs_enabled_sb(struct nfs_server *nfss)
158{
159 return nfss->pnfs_curr_ld != NULL;
160}
161
162#else /* CONFIG_NFS_V4_1 */
163
164static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
165{
166}
167
168static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
169{
170}
171
172static inline struct pnfs_layout_segment *
173pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
174 enum pnfs_iomode access_type)
175{
176 return NULL;
177}
178
179static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
180{
181}
182
183static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
184{
185}
186
187#endif /* CONFIG_NFS_V4_1 */
188
189#endif /* FS_NFS_PNFS_H */
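
For context, the registration half of the pnfs_layoutdriver_type interface declared above might be used by a driver module roughly as follows. This is a hedged sketch: all myld_* callbacks are hypothetical, and LAYOUT_NFSV4_1_FILES is assumed to be the protocol's layout-type constant; only the struct fields and the register/unregister calls come from this header:

static struct pnfs_layoutdriver_type myld_type = {
	.id			= LAYOUT_NFSV4_1_FILES,
	.name			= "myld",
	.owner			= THIS_MODULE,
	.set_layoutdriver	= myld_set_layoutdriver,
	.clear_layoutdriver	= myld_clear_layoutdriver,
	.alloc_lseg		= myld_alloc_lseg,
	.free_lseg		= myld_free_lseg,
};

static int __init myld_init(void)
{
	return pnfs_register_layoutdriver(&myld_type);
}

static void __exit myld_exit(void)
{
	pnfs_unregister_layoutdriver(&myld_type);
}

module_init(myld_init);
module_exit(myld_exit);

The LAYOUT_NFSV4_1_MODULE_PREFIX define above suggests such modules are meant to be auto-loadable by layout type, as "nfs-layouttype4-<id>".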
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 611bec22f55..58e7f84fc1f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)
 
 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-		int flags, struct nameidata *nd)
+		int flags, struct nfs_open_context *ctx)
 {
 	struct nfs_createdata *data;
 	struct rpc_message msg = {
@@ -365,17 +365,32 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	return 1;
 }
 
+static void
+nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
+}
+
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		     struct inode *new_dir)
+{
+	if (nfs_async_handle_expired_key(task))
+		return 0;
+	nfs_mark_for_revalidate(old_dir);
+	nfs_mark_for_revalidate(new_dir);
+	return 1;
+}
+
 static int
 nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		struct inode *new_dir, struct qstr *new_name)
 {
 	struct nfs_renameargs	arg = {
-		.fromfh		= NFS_FH(old_dir),
-		.fromname	= old_name->name,
-		.fromlen	= old_name->len,
-		.tofh		= NFS_FH(new_dir),
-		.toname		= new_name->name,
-		.tolen		= new_name->len
+		.old_dir	= NFS_FH(old_dir),
+		.old_name	= old_name,
+		.new_dir	= NFS_FH(new_dir),
+		.new_name	= new_name,
 	};
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_RENAME],
@@ -519,14 +534,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
  */
 static int
 nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-		 u64 cookie, struct page *page, unsigned int count, int plus)
+		 u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct inode		*dir = dentry->d_inode;
 	struct nfs_readdirargs	arg = {
 		.fh		= NFS_FH(dir),
 		.cookie		= cookie,
 		.count		= count,
-		.pages		= &page,
+		.pages		= pages,
 	};
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_READDIR],
@@ -705,6 +720,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.unlink_setup	= nfs_proc_unlink_setup,
 	.unlink_done	= nfs_proc_unlink_done,
 	.rename		= nfs_proc_rename,
+	.rename_setup	= nfs_proc_rename_setup,
+	.rename_done	= nfs_proc_rename_done,
 	.link		= nfs_proc_link,
 	.symlink	= nfs_proc_symlink,
 	.mkdir		= nfs_proc_mkdir,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6e2b06e6ca7..e4b62c6f5a6 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -25,6 +25,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -46,7 +47,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
 		p->npages = pagecount;
-		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
@@ -121,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	len = nfs_page_length(page);
 	if (len == 0)
 		return nfs_return_empty_page(page);
+	pnfs_update_layout(inode, ctx, IOMODE_READ);
 	new = nfs_create_request(ctx, inode, page, 0, len);
 	if (IS_ERR(new)) {
 		unlock_page(page);
@@ -190,6 +191,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	data->args.pages  = data->pagevec;
 	data->args.count  = count;
 	data->args.context = get_nfs_open_context(req->wb_context);
+	data->args.lock_context = req->wb_lock_context;
 
 	data->res.fattr   = &data->fattr;
 	data->res.count   = count;
@@ -410,7 +412,7 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
 
-	if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client,
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
 				&data->args.seq_args, &data->res.seq_res,
 				0, task))
 		return;
@@ -624,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 
+	pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
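
Note where the two pnfs_update_layout() call sites added above land: nfs_readpage_async() covers the single-page path, and nfs_readpages() runs once before pageio initialization. Both ignore the returned layout segment, so at this stage of the series the effect is only to fetch and cache the layout; the actual read I/O path is unchanged.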
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f9df16de4a5..0a42e8f4adc 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -100,6 +100,7 @@ enum {
 	Opt_addr, Opt_mountaddr, Opt_clientaddr,
 	Opt_lookupcache,
 	Opt_fscache_uniq,
+	Opt_local_lock,
 
 	/* Special mount options */
 	Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -171,6 +172,7 @@ static const match_table_t nfs_mount_option_tokens = {
 
 	{ Opt_lookupcache, "lookupcache=%s" },
 	{ Opt_fscache_uniq, "fsc=%s" },
+	{ Opt_local_lock, "local_lock=%s" },
 
 	{ Opt_err, NULL }
 };
@@ -236,14 +238,30 @@ static match_table_t nfs_lookupcache_tokens = {
 	{ Opt_lookupcache_err, NULL }
 };
 
+enum {
+	Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix,
+	Opt_local_lock_none,
+
+	Opt_local_lock_err
+};
+
+static match_table_t nfs_local_lock_tokens = {
+	{ Opt_local_lock_all, "all" },
+	{ Opt_local_lock_flock, "flock" },
+	{ Opt_local_lock_posix, "posix" },
+	{ Opt_local_lock_none, "none" },
+
+	{ Opt_local_lock_err, NULL }
+};
+
 
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
-static int nfs_xdev_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data);
 static void nfs_put_super(struct super_block *);
 static void nfs_kill_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
@@ -259,7 +277,7 @@ static struct file_system_type nfs_fs_type = {
 struct file_system_type nfs_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs",
-	.get_sb		= nfs_xdev_get_sb,
+	.mount		= nfs_xdev_mount,
 	.kill_sb	= nfs_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -270,7 +288,7 @@ static const struct super_operations nfs_sops = {
 	.write_inode	= nfs_write_inode,
 	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
-	.clear_inode	= nfs_clear_inode,
+	.evict_inode	= nfs_evict_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
 	.show_stats	= nfs_show_stats,
@@ -284,14 +302,14 @@ static int nfs4_try_mount(int flags, const char *dev_name,
 	struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
+static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static void nfs4_kill_super(struct super_block *sb);
 
 static struct file_system_type nfs4_fs_type = {
@@ -305,7 +323,7 @@ static struct file_system_type nfs4_fs_type = {
 static struct file_system_type nfs4_remote_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_remote_get_sb,
+	.mount		= nfs4_remote_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -313,7 +331,7 @@ static struct file_system_type nfs4_remote_fs_type = {
 struct file_system_type nfs4_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_xdev_get_sb,
+	.mount		= nfs4_xdev_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -321,7 +339,7 @@ struct file_system_type nfs4_xdev_fs_type = {
 static struct file_system_type nfs4_remote_referral_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_remote_referral_get_sb,
+	.mount		= nfs4_remote_referral_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -340,7 +358,7 @@ static const struct super_operations nfs4_sops = {
 	.write_inode	= nfs_write_inode,
 	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
-	.clear_inode	= nfs4_clear_inode,
+	.evict_inode	= nfs4_evict_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
 	.show_stats	= nfs_show_stats,
@@ -431,7 +449,15 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		goto out_err;
 
 	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+	if (unlikely(error == -ESTALE)) {
+		struct dentry *pd_dentry;
 
+		pd_dentry = dget_parent(dentry);
+		if (pd_dentry != NULL) {
+			nfs_zap_caches(pd_dentry->d_inode);
+			dput(pd_dentry);
+		}
+	}
 	nfs_free_fattr(res.fattr);
 	if (error < 0)
 		goto out_err;
@@ -546,6 +572,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 {
 	struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
 
+	if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
+		return;
+
 	switch (sap->sa_family) {
 	case AF_INET: {
 		struct sockaddr_in *sin = (struct sockaddr_in *)sap;
@@ -611,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	const struct proc_nfs_info *nfs_infop;
 	struct nfs_client *clp = nfss->nfs_client;
 	u32 version = clp->rpc_ops->version;
+	int local_flock, local_fcntl;
 
 	seq_printf(m, ",vers=%u", version);
 	seq_printf(m, ",rsize=%u", nfss->rsize);
@@ -652,6 +682,25 @@
 
 	if (nfss->options & NFS_OPTION_FSCACHE)
 		seq_printf(m, ",fsc");
+
+	if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
+		if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+			seq_printf(m, ",lookupcache=none");
+		else
+			seq_printf(m, ",lookupcache=pos");
+	}
+
+	local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+	local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+
+	if (!local_flock && !local_fcntl)
+		seq_printf(m, ",local_lock=none");
+	else if (local_flock && local_fcntl)
+		seq_printf(m, ",local_lock=all");
+	else if (local_flock)
+		seq_printf(m, ",local_lock=flock");
+	else
+		seq_printf(m, ",local_lock=posix");
 }
 
 /*
@@ -999,9 +1048,13 @@ static int nfs_parse_mount_options(char *raw,
 			break;
 		case Opt_lock:
 			mnt->flags &= ~NFS_MOUNT_NONLM;
+			mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
					NFS_MOUNT_LOCAL_FCNTL);
 			break;
 		case Opt_nolock:
 			mnt->flags |= NFS_MOUNT_NONLM;
+			mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
				       NFS_MOUNT_LOCAL_FCNTL);
 			break;
 		case Opt_v2:
 			mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1402,6 +1455,34 @@ static int nfs_parse_mount_options(char *raw,
 			mnt->fscache_uniq = string;
 			mnt->options |= NFS_OPTION_FSCACHE;
 			break;
+		case Opt_local_lock:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string, nfs_local_lock_tokens,
+					args);
+			kfree(string);
+			switch (token) {
+			case Opt_local_lock_all:
+				mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
						NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			case Opt_local_lock_flock:
+				mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
+				break;
+			case Opt_local_lock_posix:
+				mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
+				break;
+			case Opt_local_lock_none:
+				mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
						NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			default:
+				dfprintk(MOUNT, "NFS: invalid "
						"local_lock argument\n");
+				return 0;
+			};
+			break;
 
 		/*
 		 * Special options
@@ -1780,6 +1861,7 @@ static int nfs_validate_mount_data(void *options,
 		 * can deal with.
 		 */
 		args->flags		= data->flags & NFS_MOUNT_FLAGMASK;
+		args->flags		|= NFS_MOUNT_LEGACY_INTERFACE;
 		args->rsize		= data->rsize;
 		args->wsize		= data->wsize;
 		args->timeo		= data->timeo;
@@ -1806,6 +1888,12 @@ static int nfs_validate_mount_data(void *options,
 		if (!args->nfs_server.hostname)
 			goto out_nomem;
 
+		if (!(data->flags & NFS_MOUNT_NONLM))
+			args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
					 NFS_MOUNT_LOCAL_FCNTL);
+		else
+			args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
					NFS_MOUNT_LOCAL_FCNTL);
 		/*
 		 * The legacy version 6 binary mount data from userspace has a
 		 * field used only to transport selinux information into the
@@ -2309,9 +2397,9 @@ static void nfs_kill_super(struct super_block *s)
 /*
  * Clone an NFS2/3 server record on xdev traversal (FSID-change)
  */
-static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			   const char *dev_name, void *raw_data,
-			   struct vfsmount *mnt)
+static struct dentry *
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
+	       const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	struct super_block *s;
@@ -2323,7 +2411,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	};
 	int error;
 
-	dprintk("--> nfs_xdev_get_sb()\n");
+	dprintk("--> nfs_xdev_mount()\n");
 
 	/* create a new volume representation */
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2370,28 +2458,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
 
 	/* clone any lsm security options from the parent to the new sb */
 	security_sb_clone_mnt_opts(data->sb, s);
 
-	dprintk("<-- nfs_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs_xdev_mount() = 0\n");
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);
 
 error_splat_super:
 	if (server && !s->s_root)
 		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
 	deactivate_locked_super(s);
-	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }
 
 #ifdef CONFIG_NFS_V4
@@ -2422,7 +2508,8 @@ static void nfs4_fill_super(struct super_block *sb)
 
 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 {
-	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
+	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
+			 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
 }
 
 static int nfs4_validate_text_mount_data(void *options,
@@ -2560,8 +2647,9 @@ out_no_address:
 /*
  * Get the superblock for the NFS4 root partition
  */
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_mount(struct file_system_type *fs_type, int flags,
+		  const char *dev_name, void *raw_data)
 {
 	struct nfs_parsed_mount_data *data = raw_data;
 	struct super_block *s;
@@ -2625,15 +2713,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 		goto error_splat_root;
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
-	error = 0;
+
+	security_free_mnt_opts(&data->lsm_opts);
+	nfs_free_fhandle(mntfh);
+	return mntroot;
 
 out:
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
 	nfs_free_fhandle(mntfh);
-	return error;
+	return ERR_PTR(error);
 
 out_free:
 	nfs_free_server(server);
@@ -2879,9 +2968,9 @@ static void nfs4_kill_super(struct super_block *sb)
 /*
  * Clone an NFS4 server record on xdev traversal (FSID-change)
  */
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			    const char *dev_name, void *raw_data,
-			    struct vfsmount *mnt)
+static struct dentry *
+nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	struct super_block *s;
@@ -2893,7 +2982,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	};
 	int error;
 
-	dprintk("--> nfs4_xdev_get_sb()\n");
+	dprintk("--> nfs4_xdev_mount()\n");
 
 	/* create a new volume representation */
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2940,32 +3029,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
-	dprintk("<-- nfs4_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs4_xdev_mount() = 0\n");
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);
 
 error_splat_super:
 	if (server && !s->s_root)
 		bdi_unregister(&server->backing_dev_info);
error_splat_bdi:
 	deactivate_locked_super(s);
-	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }
 
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data,
-	struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
+			   const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	struct super_block *s;
@@ -3029,14 +3116,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	}
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
 	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = 0\n");
-	return 0;
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
@@ -3044,7 +3129,7 @@ out_err_noserver:
 	nfs_free_fhandle(mntfh);
 out_err_nofh:
 	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
-	return error;
+	return ERR_PTR(error);
 
 error_splat_super:
 	if (server && !s->s_root)
@@ -3053,7 +3138,7 @@ error_splat_bdi:
 	deactivate_locked_super(s);
 	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
-	return error;
+	return ERR_PTR(error);
 }
 
 /*
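
The super.c changes above wire the new local_lock option end to end: token definition and parsing, /proc/mounts display, and the legacy binary mount-data path. As an illustrative example (host and export path are placeholders), a v3 mount that keeps flock locks local while leaving POSIX locks to the NLM protocol could be:

	mount -t nfs -o vers=3,local_lock=flock server:/export /mnt

Note that nolock now additionally sets both local-lock flags (equivalent to local_lock=all), while lock clears them.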
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index ad4d2e787b2..978aaeb8a09 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = {
 		.extra1 = (int *)&nfs_set_port_min,
 		.extra2 = (int *)&nfs_set_port_max,
 	},
+#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
 	{
 		.procname = "idmap_cache_timeout",
 		.data = &nfs_idmap_cache_timeout,
@@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = {
 		.mode = 0644,
 		.proc_handler = proc_dointvec_jiffies,
 	},
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
 	{
 		.procname = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index a2242af6a17..7bdec853140 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,9 +13,12 @@
 #include <linux/nfs_fs.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/namei.h>
 
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"
 
 struct nfs_unlinkdata {
 	struct hlist_node list;
@@ -110,7 +113,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
 	struct nfs_unlinkdata *data = calldata;
 	struct nfs_server *server = NFS_SERVER(data->dir);
 
-	if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+	if (nfs4_setup_sequence(server, &data->args.seq_args,
 				&data->res.seq_res, 1, task))
 		return;
 	rpc_call_start(task);
@@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry)
  * @dir: parent directory of dentry
  * @dentry: dentry to unlink
  */
-int
+static int
 nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct nfs_unlinkdata *data;
@@ -259,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 		status = PTR_ERR(data->cred);
 		goto out_free;
 	}
-	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	data->res.dir_attr = &data->dir_attr;
 
 	status = -EBUSY;
@@ -303,3 +305,256 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
 	if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
 		nfs_free_unlinkdata(data);
 }
308
309/* Cancel a queued async unlink. Called when a sillyrename run fails. */
310static void
311nfs_cancel_async_unlink(struct dentry *dentry)
312{
313 spin_lock(&dentry->d_lock);
314 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
315 struct nfs_unlinkdata *data = dentry->d_fsdata;
316
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
318 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data);
320 return;
321 }
322 spin_unlock(&dentry->d_lock);
323}
324
325struct nfs_renamedata {
326 struct nfs_renameargs args;
327 struct nfs_renameres res;
328 struct rpc_cred *cred;
329 struct inode *old_dir;
330 struct dentry *old_dentry;
331 struct nfs_fattr old_fattr;
332 struct inode *new_dir;
333 struct dentry *new_dentry;
334 struct nfs_fattr new_fattr;
335};
336
337/**
338 * nfs_async_rename_done - Sillyrename post-processing
339 * @task: rpc_task of the sillyrename
340 * @calldata: nfs_renamedata for the sillyrename
341 *
342 * Do the directory attribute updates and the d_move
343 */
344static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
345{
346 struct nfs_renamedata *data = calldata;
347 struct inode *old_dir = data->old_dir;
348 struct inode *new_dir = data->new_dir;
349
350 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
351 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
352 return;
353 }
354
355 if (task->tk_status != 0) {
356 nfs_cancel_async_unlink(data->old_dentry);
357 return;
358 }
359
360 nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir));
361 d_move(data->old_dentry, data->new_dentry);
362}
363
364/**
365 * nfs_async_rename_release - Release the sillyrename data.
366 * @calldata: the struct nfs_renamedata to be released
367 */
368static void nfs_async_rename_release(void *calldata)
369{
370 struct nfs_renamedata *data = calldata;
371 struct super_block *sb = data->old_dir->i_sb;
372
373 if (data->old_dentry->d_inode)
374 nfs_mark_for_revalidate(data->old_dentry->d_inode);
375
376 dput(data->old_dentry);
377 dput(data->new_dentry);
378 iput(data->old_dir);
379 iput(data->new_dir);
380 nfs_sb_deactive(sb);
381 put_rpccred(data->cred);
382 kfree(data);
383}
384
385#if defined(CONFIG_NFS_V4_1)
386static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
387{
388 struct nfs_renamedata *data = calldata;
389 struct nfs_server *server = NFS_SERVER(data->old_dir);
390
391 if (nfs4_setup_sequence(server, &data->args.seq_args,
392 &data->res.seq_res, 1, task))
393 return;
394 rpc_call_start(task);
395}
396#endif /* CONFIG_NFS_V4_1 */
397
398static const struct rpc_call_ops nfs_rename_ops = {
399 .rpc_call_done = nfs_async_rename_done,
400 .rpc_release = nfs_async_rename_release,
401#if defined(CONFIG_NFS_V4_1)
402 .rpc_call_prepare = nfs_rename_prepare,
403#endif /* CONFIG_NFS_V4_1 */
404};
405
406/**
407 * nfs_async_rename - perform an asynchronous rename operation
408 * @old_dir: directory that currently holds the dentry to be renamed
409 * @new_dir: target directory for the rename
410 * @old_dentry: original dentry to be renamed
411 * @new_dentry: dentry to which the old_dentry should be renamed
412 *
413 * It's expected that valid references to the dentries and inodes are held
414 */
415static struct rpc_task *
416nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
417 struct dentry *old_dentry, struct dentry *new_dentry)
418{
419 struct nfs_renamedata *data;
420 struct rpc_message msg = { };
421 struct rpc_task_setup task_setup_data = {
422 .rpc_message = &msg,
423 .callback_ops = &nfs_rename_ops,
424 .workqueue = nfsiod_workqueue,
425 .rpc_client = NFS_CLIENT(old_dir),
426 .flags = RPC_TASK_ASYNC,
427 };
428
429 data = kzalloc(sizeof(*data), GFP_KERNEL);
430 if (data == NULL)
431 return ERR_PTR(-ENOMEM);
432	task_setup_data.callback_data = data;
433
434 data->cred = rpc_lookup_cred();
435 if (IS_ERR(data->cred)) {
436 struct rpc_task *task = ERR_CAST(data->cred);
437 kfree(data);
438 return task;
439 }
440
441 msg.rpc_argp = &data->args;
442 msg.rpc_resp = &data->res;
443 msg.rpc_cred = data->cred;
444
445 /* set up nfs_renamedata */
446 data->old_dir = old_dir;
447 ihold(old_dir);
448 data->new_dir = new_dir;
449 ihold(new_dir);
450 data->old_dentry = dget(old_dentry);
451 data->new_dentry = dget(new_dentry);
452 nfs_fattr_init(&data->old_fattr);
453 nfs_fattr_init(&data->new_fattr);
454
455 /* set up nfs_renameargs */
456 data->args.old_dir = NFS_FH(old_dir);
457 data->args.old_name = &old_dentry->d_name;
458 data->args.new_dir = NFS_FH(new_dir);
459 data->args.new_name = &new_dentry->d_name;
460
461 /* set up nfs_renameres */
462 data->res.old_fattr = &data->old_fattr;
463 data->res.new_fattr = &data->new_fattr;
464
465 nfs_sb_active(old_dir->i_sb);
466
467 NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
468
469 return rpc_run_task(&task_setup_data);
470}
471
472/**
473 * nfs_sillyrename - Perform a silly-rename of a dentry
474 * @dir: inode of directory that contains dentry
475 * @dentry: dentry to be sillyrenamed
476 *
477 * NFSv2/3 is stateless and the server doesn't know when the client is
478 * holding a file open. To prevent application problems when a file is
479 * unlinked while it's still open, the client performs a "silly-rename".
480 * That is, it renames the file to a hidden file in the same directory,
481 * and only performs the unlink once the last reference to it is put.
482 *
483 * The final cleanup is done during dentry_iput.
484 */
485int
486nfs_sillyrename(struct inode *dir, struct dentry *dentry)
487{
488 static unsigned int sillycounter;
489 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
490 const int countersize = sizeof(sillycounter)*2;
491 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
492 char silly[slen+1];
493 struct dentry *sdentry;
494 struct rpc_task *task;
495 int error = -EIO;
496
497 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
498 dentry->d_parent->d_name.name, dentry->d_name.name,
499 atomic_read(&dentry->d_count));
500 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
501
502 /*
503 * We don't allow a dentry to be silly-renamed twice.
504 */
505 error = -EBUSY;
506 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
507 goto out;
508
509 sprintf(silly, ".nfs%*.*Lx",
510 fileidsize, fileidsize,
511 (unsigned long long)NFS_FILEID(dentry->d_inode));
512
513 /* Return delegation in anticipation of the rename */
514 nfs_inode_return_delegation(dentry->d_inode);
515
516 sdentry = NULL;
517 do {
518 char *suffix = silly + slen - countersize;
519
520 dput(sdentry);
521 sillycounter++;
522 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
523
524 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
525 dentry->d_name.name, silly);
526
527 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
528 /*
529 * N.B. Better to return EBUSY here ... it could be
530 * dangerous to delete the file while it's in use.
531 */
532 if (IS_ERR(sdentry))
533 goto out;
534 } while (sdentry->d_inode != NULL); /* need negative lookup */
535
536 /* queue unlink first. Can't do this from rpc_release as it
537 * has to allocate memory
538 */
539 error = nfs_async_unlink(dir, dentry);
540 if (error)
541 goto out_dput;
542
543 /* run the rename task, undo unlink if it fails */
544 task = nfs_async_rename(dir, dir, dentry, sdentry);
545 if (IS_ERR(task)) {
546 error = -EBUSY;
547 nfs_cancel_async_unlink(dentry);
548 goto out_dput;
549 }
550
551 /* wait for the RPC task to complete, unless a SIGKILL intervenes */
552 error = rpc_wait_for_completion_task(task);
553 if (error == 0)
554 error = task->tk_status;
555 rpc_put_task(task);
556out_dput:
557 dput(sdentry);
558out:
559 return error;
560}
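
For concreteness, a user-space approximation of the silly name built by nfs_sillyrename() above; the hex-digit widths follow the same sizeof()*2 rule, and the fileid and counter values here are made up (the kernel's "%Lx" becomes "%llx" in standard C):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t fileid = 0x1234abcdULL;	/* stand-in for NFS_FILEID() */
	unsigned int counter = 7;		/* stand-in for sillycounter */
	const int fileidsize = sizeof(fileid) * 2;	/* 16 hex digits */
	const int countersize = sizeof(counter) * 2;	/* 8 hex digits */
	char silly[4 + 16 + 8 + 1];	/* ".nfs" + fileid + counter + NUL */

	sprintf(silly, ".nfs%*.*llx%*.*x",
		fileidsize, fileidsize, (unsigned long long)fileid,
		countersize, countersize, counter);
	puts(silly);	/* prints .nfs000000001234abcd00000007 */
	return 0;
}

The kernel code then loops, rewriting only the counter suffix, until lookup_one_len() returns a negative dentry for the candidate name.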
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f81bdd91c5..4c14c17a527 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
55 if (p) { 55 if (p) {
56 memset(p, 0, sizeof(*p)); 56 memset(p, 0, sizeof(*p));
57 INIT_LIST_HEAD(&p->pages); 57 INIT_LIST_HEAD(&p->pages);
58 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
59 } 58 }
60 return p; 59 return p;
61} 60}
@@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
75 memset(p, 0, sizeof(*p)); 74 memset(p, 0, sizeof(*p));
76 INIT_LIST_HEAD(&p->pages); 75 INIT_LIST_HEAD(&p->pages);
77 p->npages = pagecount; 76 p->npages = pagecount;
78 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
79 if (pagecount <= ARRAY_SIZE(p->page_array)) 77 if (pagecount <= ARRAY_SIZE(p->page_array))
80 p->pagevec = p->page_array; 78 p->pagevec = p->page_array;
81 else { 79 else {
@@ -292,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
292 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 290 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
293 291
294 nfs_pageio_cond_complete(pgio, page->index); 292 nfs_pageio_cond_complete(pgio, page->index);
295 ret = nfs_page_async_flush(pgio, page, 293 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
296 wbc->sync_mode == WB_SYNC_NONE ||
297 wbc->nonblocking != 0);
298 if (ret == -EAGAIN) { 294 if (ret == -EAGAIN) {
299 redirty_page_for_writepage(wbc, page); 295 redirty_page_for_writepage(wbc, page);
300 ret = 0; 296 ret = 0;
@@ -700,7 +696,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
700 req = nfs_page_find_request(page); 696 req = nfs_page_find_request(page);
701 if (req == NULL) 697 if (req == NULL)
702 return 0; 698 return 0;
703 do_flush = req->wb_page != page || req->wb_context != ctx; 699 do_flush = req->wb_page != page || req->wb_context != ctx ||
700 req->wb_lock_context->lockowner != current->files ||
701 req->wb_lock_context->pid != current->tgid;
704 nfs_release_request(req); 702 nfs_release_request(req);
705 if (!do_flush) 703 if (!do_flush)
706 return 0; 704 return 0;
@@ -824,6 +822,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
824 data->args.pages = data->pagevec; 822 data->args.pages = data->pagevec;
825 data->args.count = count; 823 data->args.count = count;
826 data->args.context = get_nfs_open_context(req->wb_context); 824 data->args.context = get_nfs_open_context(req->wb_context);
825 data->args.lock_context = req->wb_lock_context;
827 data->args.stable = NFS_UNSTABLE; 826 data->args.stable = NFS_UNSTABLE;
828 if (how & FLUSH_STABLE) { 827 if (how & FLUSH_STABLE) {
829 data->args.stable = NFS_DATA_SYNC; 828 data->args.stable = NFS_DATA_SYNC;
@@ -1047,9 +1046,9 @@ out:
1047void nfs_write_prepare(struct rpc_task *task, void *calldata) 1046void nfs_write_prepare(struct rpc_task *task, void *calldata)
1048{ 1047{
1049 struct nfs_write_data *data = calldata; 1048 struct nfs_write_data *data = calldata;
1050 struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
1051 1049
1052 if (nfs4_setup_sequence(clp, &data->args.seq_args, 1050 if (nfs4_setup_sequence(NFS_SERVER(data->inode),
1051 &data->args.seq_args,
1053 &data->res.seq_res, 1, task)) 1052 &data->res.seq_res, 1, task))
1054 return; 1053 return;
1055 rpc_call_start(task); 1054 rpc_call_start(task);
@@ -1430,15 +1429,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
1430 int flags = FLUSH_SYNC; 1429 int flags = FLUSH_SYNC;
1431 int ret = 0; 1430 int ret = 0;
1432 1431
1433 /* Don't commit yet if this is a non-blocking flush and there are 1432 if (wbc->sync_mode == WB_SYNC_NONE) {
1434 * lots of outstanding writes for this mapping. 1433 /* Don't commit yet if this is a non-blocking flush and there
1435 */ 1434 * are a lot of outstanding writes for this mapping.
1436 if (wbc->sync_mode == WB_SYNC_NONE && 1435 */
1437 nfsi->ncommit <= (nfsi->npages >> 1)) 1436 if (nfsi->ncommit <= (nfsi->npages >> 1))
1438 goto out_mark_dirty; 1437 goto out_mark_dirty;
1439 1438
1440 if (wbc->nonblocking || wbc->for_background) 1439 /* don't wait for the COMMIT response */
1441 flags = 0; 1440 flags = 0;
1441 }
1442
1442 ret = nfs_commit_inode(inode, flags); 1443 ret = nfs_commit_inode(inode, flags);
1443 if (ret >= 0) { 1444 if (ret >= 0) {
1444 if (wbc->sync_mode == WB_SYNC_NONE) { 1445 if (wbc->sync_mode == WB_SYNC_NONE) {
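The rewritten hunk above changes the shape of the heuristic, not its inputs: for a WB_SYNC_NONE flush the COMMIT is skipped while at most half of the inode's pages are awaiting commit, and when it is sent, FLUSH_SYNC is dropped so the caller never blocks on the reply. A compact restatement of that decision, using hypothetical stand-in types for the writeback control and per-inode counters:

#include <stdbool.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

struct wbc_sketch { enum sync_mode sync_mode; };
struct nfsi_sketch { unsigned long ncommit, npages; };

/*
 * Decide whether to send a COMMIT now; *wait is set when the caller
 * should block for the reply (the kernel's FLUSH_SYNC flag).
 */
static bool should_commit(const struct wbc_sketch *wbc,
			  const struct nfsi_sketch *nfsi, bool *wait)
{
	*wait = true;
	if (wbc->sync_mode == WB_SYNC_NONE) {
		/* Non-blocking flush: defer the COMMIT while at most
		 * half of the inode's pages are waiting for one. */
		if (nfsi->ncommit <= (nfsi->npages >> 1))
			return false;
		*wait = false;	/* send it, but don't wait for the reply */
	}
	return true;
}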
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 503b9da159a..18b3e8975fe 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -28,6 +28,18 @@ config NFSD
28 28
29 If unsure, say N. 29 If unsure, say N.
30 30
31config NFSD_DEPRECATED
32 bool "Include support for deprecated syscall interface to NFSD"
33 depends on NFSD
34 default y
35 help
36 The syscall interface to nfsd was obsoleted in 2.6.0 by a new
37 filesystem-based interface. The old interface is due for removal
38 in 2.6.40. If you wish to remove the interface before then,
39 say N.
40
41 If unsure, say Y.
42
31config NFSD_V2_ACL 43config NFSD_V2_ACL
32 bool 44 bool
33 depends on NFSD 45 depends on NFSD
@@ -69,7 +81,7 @@ config NFSD_V4
69 depends on NFSD && PROC_FS && EXPERIMENTAL 81 depends on NFSD && PROC_FS && EXPERIMENTAL
70 select NFSD_V3 82 select NFSD_V3
71 select FS_POSIX_ACL 83 select FS_POSIX_ACL
72 select RPCSEC_GSS_KRB5 84 select SUNRPC_GSS
73 help 85 help
74 This option enables support in your system's NFS server for 86 This option enables support in your system's NFS server for
75 version 4 of the NFS protocol (RFC 3530). 87 version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c2a4f71d87d..c0fcb7ab7f6 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -28,9 +28,6 @@
28typedef struct auth_domain svc_client; 28typedef struct auth_domain svc_client;
29typedef struct svc_export svc_export; 29typedef struct svc_export svc_export;
30 30
31static void exp_do_unexport(svc_export *unexp);
32static int exp_verify_string(char *cp, int max);
33
34/* 31/*
35 * We have two caches. 32 * We have two caches.
36 * One maps client+vfsmnt+dentry to export options - the export map 33 * One maps client+vfsmnt+dentry to export options - the export map
@@ -802,6 +799,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
802 return ek; 799 return ek;
803} 800}
804 801
802#ifdef CONFIG_NFSD_DEPRECATED
805static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, 803static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
806 struct svc_export *exp) 804 struct svc_export *exp)
807{ 805{
@@ -852,6 +850,7 @@ exp_get_fsid_key(svc_client *clp, int fsid)
852 850
853 return exp_find_key(clp, FSID_NUM, fsidv, NULL); 851 return exp_find_key(clp, FSID_NUM, fsidv, NULL);
854} 852}
853#endif
855 854
856static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, 855static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
857 struct cache_req *reqp) 856 struct cache_req *reqp)
@@ -893,6 +892,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
893 return exp; 892 return exp;
894} 893}
895 894
895#ifdef CONFIG_NFSD_DEPRECATED
896/* 896/*
897 * Hashtable locking. Write locks are placed only by user processes 897 * Hashtable locking. Write locks are placed only by user processes
898 * wanting to modify export information. 898 * wanting to modify export information.
@@ -925,6 +925,19 @@ exp_writeunlock(void)
925{ 925{
926 up_write(&hash_sem); 926 up_write(&hash_sem);
927} 927}
928#else
929
930/* hash_sem not needed once deprecated interface is removed */
931void exp_readlock(void) {}
932 static inline void exp_writelock(void) {}
933 void exp_readunlock(void) {}
934 static inline void exp_writeunlock(void) {}
935
936#endif
937
938#ifdef CONFIG_NFSD_DEPRECATED
939static void exp_do_unexport(svc_export *unexp);
940static int exp_verify_string(char *cp, int max);
928 941
929static void exp_fsid_unhash(struct svc_export *exp) 942static void exp_fsid_unhash(struct svc_export *exp)
930{ 943{
@@ -935,10 +948,9 @@ static void exp_fsid_unhash(struct svc_export *exp)
935 948
936 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); 949 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
937 if (!IS_ERR(ek)) { 950 if (!IS_ERR(ek)) {
938 ek->h.expiry_time = get_seconds()-1; 951 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
939 cache_put(&ek->h, &svc_expkey_cache); 952 cache_put(&ek->h, &svc_expkey_cache);
940 } 953 }
941 svc_expkey_cache.nextcheck = get_seconds();
942} 954}
943 955
944static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) 956static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
@@ -973,10 +985,9 @@ static void exp_unhash(struct svc_export *exp)
973 985
974 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); 986 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
975 if (!IS_ERR(ek)) { 987 if (!IS_ERR(ek)) {
976 ek->h.expiry_time = get_seconds()-1; 988 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
977 cache_put(&ek->h, &svc_expkey_cache); 989 cache_put(&ek->h, &svc_expkey_cache);
978 } 990 }
979 svc_expkey_cache.nextcheck = get_seconds();
980} 991}
981 992
982/* 993/*
@@ -1097,8 +1108,7 @@ out:
1097static void 1108static void
1098exp_do_unexport(svc_export *unexp) 1109exp_do_unexport(svc_export *unexp)
1099{ 1110{
1100 unexp->h.expiry_time = get_seconds()-1; 1111 sunrpc_invalidate(&unexp->h, &svc_export_cache);
1101 svc_export_cache.nextcheck = get_seconds();
1102 exp_unhash(unexp); 1112 exp_unhash(unexp);
1103 exp_fsid_unhash(unexp); 1113 exp_fsid_unhash(unexp);
1104} 1114}
@@ -1150,6 +1160,7 @@ out_unlock:
1150 exp_writeunlock(); 1160 exp_writeunlock();
1151 return err; 1161 return err;
1152} 1162}
1163#endif /* CONFIG_NFSD_DEPRECATED */
1153 1164
1154/* 1165/*
1155 * Obtain the root fh on behalf of a client. 1166 * Obtain the root fh on behalf of a client.
@@ -1459,25 +1470,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags)
1459 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); 1470 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
1460} 1471}
1461 1472
1473static bool secinfo_flags_equal(int f, int g)
1474{
1475 f &= NFSEXP_SECINFO_FLAGS;
1476 g &= NFSEXP_SECINFO_FLAGS;
1477 return f == g;
1478}
1479
1480static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
1481{
1482 int flags;
1483
1484 flags = (*fp)->flags;
1485 seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
1486 (*fp)++;
1487 while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
1488 seq_printf(m, ":%d", (*fp)->pseudoflavor);
1489 (*fp)++;
1490 }
1491 return flags;
1492}
1493
1462static void show_secinfo(struct seq_file *m, struct svc_export *exp) 1494static void show_secinfo(struct seq_file *m, struct svc_export *exp)
1463{ 1495{
1464 struct exp_flavor_info *f; 1496 struct exp_flavor_info *f;
1465 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; 1497 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
1466 int lastflags = 0, first = 0; 1498 int flags;
1467 1499
1468 if (exp->ex_nflavors == 0) 1500 if (exp->ex_nflavors == 0)
1469 return; 1501 return;
1470 for (f = exp->ex_flavors; f < end; f++) { 1502 f = exp->ex_flavors;
1471 if (first || f->flags != lastflags) { 1503 flags = show_secinfo_run(m, &f, end);
1472 if (!first) 1504 if (!secinfo_flags_equal(flags, exp->ex_flags))
1473 show_secinfo_flags(m, lastflags); 1505 show_secinfo_flags(m, flags);
1474 seq_printf(m, ",sec=%d", f->pseudoflavor); 1506 while (f != end) {
1475 lastflags = f->flags; 1507 flags = show_secinfo_run(m, &f, end);
1476 } else { 1508 show_secinfo_flags(m, flags);
1477 seq_printf(m, ":%d", f->pseudoflavor);
1478 }
1479 } 1509 }
1480 show_secinfo_flags(m, lastflags);
1481} 1510}
1482 1511
1483static void exp_flags(struct seq_file *m, int flag, int fsid, 1512static void exp_flags(struct seq_file *m, int flag, int fsid,
@@ -1532,6 +1561,7 @@ const struct seq_operations nfs_exports_op = {
1532 .show = e_show, 1561 .show = e_show,
1533}; 1562};
1534 1563
1564#ifdef CONFIG_NFSD_DEPRECATED
1535/* 1565/*
1536 * Add or modify a client. 1566 * Add or modify a client.
1537 * Change requests may involve the list of host addresses. The list of 1567 * Change requests may involve the list of host addresses. The list of
@@ -1563,7 +1593,7 @@ exp_addclient(struct nfsctl_client *ncp)
1563 /* Insert client into hashtable. */ 1593 /* Insert client into hashtable. */
1564 for (i = 0; i < ncp->cl_naddr; i++) { 1594 for (i = 0; i < ncp->cl_naddr; i++) {
1565 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); 1595 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
1566 auth_unix_add_addr(&addr6, dom); 1596 auth_unix_add_addr(&init_net, &addr6, dom);
1567 } 1597 }
1568 auth_unix_forget_old(dom); 1598 auth_unix_forget_old(dom);
1569 auth_domain_put(dom); 1599 auth_domain_put(dom);
@@ -1621,6 +1651,7 @@ exp_verify_string(char *cp, int max)
1621 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); 1651 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
1622 return 0; 1652 return 0;
1623} 1653}
1654#endif /* CONFIG_NFSD_DEPRECATED */
1624 1655
1625/* 1656/*
1626 * Initialize the exports module. 1657 * Initialize the exports module.
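The show_secinfo() rewrite above factors the loop into runs: show_secinfo_run() prints every consecutive pseudoflavor whose (masked) security flags agree, then hands the shared flags back so they are printed once per run, and the first run's flags are suppressed when they merely repeat the export's own flags. A self-contained sketch of the run-grouping idea, with a simplified flag mask standing in for NFSEXP_SECINFO_FLAGS:

#include <stdio.h>

#define SECINFO_FLAG_MASK 0x3	/* hypothetical, not the kernel mask */

struct flavor { int pseudoflavor; int flags; };

static int same_run(int f, int g)
{
	return (f & SECINFO_FLAG_MASK) == (g & SECINFO_FLAG_MASK);
}

/* Print one run of flavors sharing the same flags; return the flags. */
static int show_run(const struct flavor **fp, const struct flavor *end)
{
	int flags = (*fp)->flags;

	printf(",sec=%d", (*fp)->pseudoflavor);
	for ((*fp)++; *fp != end && same_run(flags, (*fp)->flags); (*fp)++)
		printf(":%d", (*fp)->pseudoflavor);
	return flags;
}

int main(void)
{
	struct flavor fl[] = { { 1, 0 }, { 2, 0 }, { 390003, 1 } };
	const struct flavor *f = fl, *end = fl + 3;

	while (f != end)
		printf(" flags=%#x", show_run(&f, end) & SECINFO_FLAG_MASK);
	putchar('\n');	/* prints: ,sec=1:2 flags=0,sec=390003 flags=0x1 */
	return 0;
}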
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 3d68f45a37b..5b7e3021e06 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -168,7 +168,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
168 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); 168 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
169 169
170 fh_copy(&resp->fh, &argp->fh); 170 fh_copy(&resp->fh, &argp->fh);
171 nfserr = nfsd_read(rqstp, &resp->fh, NULL, 171 nfserr = nfsd_read(rqstp, &resp->fh,
172 argp->offset, 172 argp->offset,
173 rqstp->rq_vec, argp->vlen, 173 rqstp->rq_vec, argp->vlen,
174 &resp->count); 174 &resp->count);
@@ -271,7 +271,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
271 fh_init(&resp->fh, NFS3_FHSIZE); 271 fh_init(&resp->fh, NFS3_FHSIZE);
272 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, 272 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
273 &argp->attrs, S_IFDIR, 0, &resp->fh); 273 &argp->attrs, S_IFDIR, 0, &resp->fh);
274 274 fh_unlock(&resp->dirfh);
275 RETURN_STATUS(nfserr); 275 RETURN_STATUS(nfserr);
276} 276}
277 277
@@ -327,7 +327,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
327 type = nfs3_ftypes[argp->ftype]; 327 type = nfs3_ftypes[argp->ftype];
328 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, 328 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
329 &argp->attrs, type, rdev, &resp->fh); 329 &argp->attrs, type, rdev, &resp->fh);
330 330 fh_unlock(&resp->dirfh);
331 RETURN_STATUS(nfserr); 331 RETURN_STATUS(nfserr);
332} 332}
333 333
@@ -348,6 +348,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
348 /* Unlink. -S_IFDIR means file must not be a directory */ 348 /* Unlink. -S_IFDIR means file must not be a directory */
349 fh_copy(&resp->fh, &argp->fh); 349 fh_copy(&resp->fh, &argp->fh);
350 nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); 350 nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len);
351 fh_unlock(&resp->fh);
351 RETURN_STATUS(nfserr); 352 RETURN_STATUS(nfserr);
352} 353}
353 354
@@ -367,6 +368,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
367 368
368 fh_copy(&resp->fh, &argp->fh); 369 fh_copy(&resp->fh, &argp->fh);
369 nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); 370 nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len);
371 fh_unlock(&resp->fh);
370 RETURN_STATUS(nfserr); 372 RETURN_STATUS(nfserr);
371} 373}
372 374
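Each hunk above adds an fh_unlock() after the operation: nfsd_create() and nfsd_unlink() take the parent directory's mutex via fh_lock() and can return with it still held, so the handler now drops it before the reply is encoded. fh_unlock() is safe to call unconditionally because the filehandle records whether it is locked; a userspace analogue of that idempotent-unlock pattern, with a hypothetical handle type:

#include <pthread.h>
#include <stdbool.h>

struct fh_sketch {
	pthread_mutex_t mutex;	/* stands in for the directory's i_mutex */
	bool locked;		/* the kernel's "is this fh locked" flag */
};

/* Unlock only if a prior fh_lock() left the handle locked. */
static void fh_unlock_sketch(struct fh_sketch *fh)
{
	if (fh->locked) {
		fh->locked = false;
		pthread_mutex_unlock(&fh->mutex);
	}
}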
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index eb78e7e2207..143da2eecd7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
41 41
42#define NFSPROC4_CB_NULL 0 42#define NFSPROC4_CB_NULL 0
43#define NFSPROC4_CB_COMPOUND 1 43#define NFSPROC4_CB_COMPOUND 1
44#define NFS4_STATEID_SIZE 16
45 44
46/* Index of predefined Linux callback client operations */ 45/* Index of predefined Linux callback client operations */
47 46
@@ -143,8 +142,6 @@ struct nfs4_cb_compound_hdr {
143 u32 minorversion; 142 u32 minorversion;
144 /* res */ 143 /* res */
145 int status; 144 int status;
146 u32 taglen;
147 char *tag;
148}; 145};
149 146
150static struct { 147static struct {
@@ -205,6 +202,16 @@ nfs_cb_stat_to_errno(int stat)
205 */ 202 */
206 203
207static void 204static void
205encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
206{
207 __be32 *p;
208
209 RESERVE_SPACE(sizeof(stateid_t));
210 WRITE32(sid->si_generation);
211 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
212}
213
214static void
208encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 215encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
209{ 216{
210 __be32 * p; 217 __be32 * p;
@@ -229,10 +236,10 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
229 __be32 *p; 236 __be32 *p;
230 int len = dp->dl_fh.fh_size; 237 int len = dp->dl_fh.fh_size;
231 238
232 RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); 239 RESERVE_SPACE(4);
233 WRITE32(OP_CB_RECALL); 240 WRITE32(OP_CB_RECALL);
234 WRITE32(dp->dl_stateid.si_generation); 241 encode_stateid(xdr, &dp->dl_stateid);
235 WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); 242 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
236 WRITE32(0); /* truncate optimization not implemented */ 243 WRITE32(0); /* truncate optimization not implemented */
237 WRITE32(len); 244 WRITE32(len);
238 WRITEMEM(&dp->dl_fh.fh_base, len); 245 WRITEMEM(&dp->dl_fh.fh_base, len);
@@ -240,10 +247,11 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
240} 247}
241 248
242static void 249static void
243encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, 250encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
244 struct nfs4_cb_compound_hdr *hdr) 251 struct nfs4_cb_compound_hdr *hdr)
245{ 252{
246 __be32 *p; 253 __be32 *p;
254 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
247 255
248 if (hdr->minorversion == 0) 256 if (hdr->minorversion == 0)
249 return; 257 return;
@@ -251,8 +259,8 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
251 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); 259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
252 260
253 WRITE32(OP_CB_SEQUENCE); 261 WRITE32(OP_CB_SEQUENCE);
254 WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); 262 WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
255 WRITE32(args->cbs_clp->cl_cb_seq_nr); 263 WRITE32(ses->se_cb_seq_nr);
256 WRITE32(0); /* slotid, always 0 */ 264 WRITE32(0); /* slotid, always 0 */
257 WRITE32(0); /* highest slotid always 0 */ 265 WRITE32(0); /* highest slotid always 0 */
258 WRITE32(0); /* cachethis always 0 */ 266 WRITE32(0); /* cachethis always 0 */
@@ -272,18 +280,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
272 280
273static int 281static int
274nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, 282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
275 struct nfs4_rpc_args *rpc_args) 283 struct nfsd4_callback *cb)
276{ 284{
277 struct xdr_stream xdr; 285 struct xdr_stream xdr;
278 struct nfs4_delegation *args = rpc_args->args_op; 286 struct nfs4_delegation *args = cb->cb_op;
279 struct nfs4_cb_compound_hdr hdr = { 287 struct nfs4_cb_compound_hdr hdr = {
280 .ident = args->dl_ident, 288 .ident = cb->cb_clp->cl_cb_ident,
281 .minorversion = rpc_args->args_seq.cbs_minorversion, 289 .minorversion = cb->cb_minorversion,
282 }; 290 };
283 291
284 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 292 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
285 encode_cb_compound_hdr(&xdr, &hdr); 293 encode_cb_compound_hdr(&xdr, &hdr);
286 encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); 294 encode_cb_sequence(&xdr, cb, &hdr);
287 encode_cb_recall(&xdr, args, &hdr); 295 encode_cb_recall(&xdr, args, &hdr);
288 encode_cb_nops(&hdr); 296 encode_cb_nops(&hdr);
289 return 0; 297 return 0;
@@ -293,13 +301,14 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
293static int 301static int
294decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ 302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
295 __be32 *p; 303 __be32 *p;
304 u32 taglen;
296 305
297 READ_BUF(8); 306 READ_BUF(8);
298 READ32(hdr->status); 307 READ32(hdr->status);
299 READ32(hdr->taglen); 308 /* We've got no use for the tag; ignore it: */
300 READ_BUF(hdr->taglen + 4); 309 READ32(taglen);
301 hdr->tag = (char *)p; 310 READ_BUF(taglen + 4);
302 p += XDR_QUADLEN(hdr->taglen); 311 p += XDR_QUADLEN(taglen);
303 READ32(hdr->nops); 312 READ32(hdr->nops);
304 return 0; 313 return 0;
305} 314}
@@ -330,15 +339,16 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
330 * with a single slot. 339 * with a single slot.
331 */ 340 */
332static int 341static int
333decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res, 342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
334 struct rpc_rqst *rqstp) 343 struct rpc_rqst *rqstp)
335{ 344{
345 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
336 struct nfs4_sessionid id; 346 struct nfs4_sessionid id;
337 int status; 347 int status;
338 u32 dummy; 348 u32 dummy;
339 __be32 *p; 349 __be32 *p;
340 350
341 if (res->cbs_minorversion == 0) 351 if (cb->cb_minorversion == 0)
342 return 0; 352 return 0;
343 353
344 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE); 354 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
@@ -354,13 +364,12 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
354 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); 364 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
355 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 365 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
356 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); 366 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
357 if (memcmp(id.data, res->cbs_clp->cl_sessionid.data, 367 if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
358 NFS4_MAX_SESSIONID_LEN)) {
359 dprintk("%s Invalid session id\n", __func__); 368 dprintk("%s Invalid session id\n", __func__);
360 goto out; 369 goto out;
361 } 370 }
362 READ32(dummy); 371 READ32(dummy);
363 if (dummy != res->cbs_clp->cl_cb_seq_nr) { 372 if (dummy != ses->se_cb_seq_nr) {
364 dprintk("%s Invalid sequence number\n", __func__); 373 dprintk("%s Invalid sequence number\n", __func__);
365 goto out; 374 goto out;
366 } 375 }
@@ -384,7 +393,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
384 393
385static int 394static int
386nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, 395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
387 struct nfsd4_cb_sequence *seq) 396 struct nfsd4_callback *cb)
388{ 397{
389 struct xdr_stream xdr; 398 struct xdr_stream xdr;
390 struct nfs4_cb_compound_hdr hdr; 399 struct nfs4_cb_compound_hdr hdr;
@@ -394,8 +403,8 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
394 status = decode_cb_compound_hdr(&xdr, &hdr); 403 status = decode_cb_compound_hdr(&xdr, &hdr);
395 if (status) 404 if (status)
396 goto out; 405 goto out;
397 if (seq) { 406 if (cb) {
398 status = decode_cb_sequence(&xdr, seq, rqstp); 407 status = decode_cb_sequence(&xdr, cb, rqstp);
399 if (status) 408 if (status)
400 goto out; 409 goto out;
401 } 410 }
@@ -464,30 +473,34 @@ static int max_cb_time(void)
464/* Reference counting, callback cleanup, etc., all look racy as heck. 473/* Reference counting, callback cleanup, etc., all look racy as heck.
465 * And why is cl_cb_set an atomic? */ 474 * And why is cl_cb_set an atomic? */
466 475
467int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
468{ 477{
469 struct rpc_timeout timeparms = { 478 struct rpc_timeout timeparms = {
470 .to_initval = max_cb_time(), 479 .to_initval = max_cb_time(),
471 .to_retries = 0, 480 .to_retries = 0,
472 }; 481 };
473 struct rpc_create_args args = { 482 struct rpc_create_args args = {
474 .protocol = XPRT_TRANSPORT_TCP, 483 .net = &init_net,
475 .address = (struct sockaddr *) &cb->cb_addr, 484 .address = (struct sockaddr *) &conn->cb_addr,
476 .addrsize = cb->cb_addrlen, 485 .addrsize = conn->cb_addrlen,
477 .timeout = &timeparms, 486 .timeout = &timeparms,
478 .program = &cb_program, 487 .program = &cb_program,
479 .prognumber = cb->cb_prog,
480 .version = 0, 488 .version = 0,
481 .authflavor = clp->cl_flavor, 489 .authflavor = clp->cl_flavor,
482 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 490 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
483 .client_name = clp->cl_principal,
484 }; 491 };
485 struct rpc_clnt *client; 492 struct rpc_clnt *client;
486 493
487 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 494 if (clp->cl_minorversion == 0) {
488 return -EINVAL; 495 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
489 if (cb->cb_minorversion) { 496 return -EINVAL;
490 args.bc_xprt = cb->cb_xprt; 497 args.client_name = clp->cl_principal;
498 args.prognumber = conn->cb_prog;
499 args.protocol = XPRT_TRANSPORT_TCP;
500 clp->cl_cb_ident = conn->cb_ident;
501 } else {
502 args.bc_xprt = conn->cb_xprt;
503 args.prognumber = clp->cl_cb_session->se_cb_prog;
491 args.protocol = XPRT_TRANSPORT_BC_TCP; 504 args.protocol = XPRT_TRANSPORT_BC_TCP;
492 } 505 }
493 /* Create RPC client */ 506 /* Create RPC client */
@@ -497,7 +510,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
497 PTR_ERR(client)); 510 PTR_ERR(client));
498 return PTR_ERR(client); 511 return PTR_ERR(client);
499 } 512 }
500 nfsd4_set_callback_client(clp, client); 513 clp->cl_cb_client = client;
501 return 0; 514 return 0;
502 515
503} 516}
@@ -510,7 +523,7 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
510 523
511static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 524static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
512{ 525{
513 struct nfs4_client *clp = calldata; 526 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
514 527
515 if (task->tk_status) 528 if (task->tk_status)
516 warn_no_callback_path(clp, task->tk_status); 529 warn_no_callback_path(clp, task->tk_status);
@@ -519,6 +532,8 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
519} 532}
520 533
521static const struct rpc_call_ops nfsd4_cb_probe_ops = { 534static const struct rpc_call_ops nfsd4_cb_probe_ops = {
535 /* XXX: release method to ensure we set the cb channel down if
536 * necessary on early failure? */
522 .rpc_call_done = nfsd4_cb_probe_done, 537 .rpc_call_done = nfsd4_cb_probe_done,
523}; 538};
524 539
@@ -534,38 +549,42 @@ int set_callback_cred(void)
534 return 0; 549 return 0;
535} 550}
536 551
552static struct workqueue_struct *callback_wq;
537 553
538void do_probe_callback(struct nfs4_client *clp) 554static void do_probe_callback(struct nfs4_client *clp)
539{ 555{
540 struct rpc_message msg = { 556 struct nfsd4_callback *cb = &clp->cl_cb_null;
541 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
542 .rpc_argp = clp,
543 .rpc_cred = callback_cred
544 };
545 int status;
546 557
547 status = rpc_call_async(clp->cl_cb_client, &msg, 558 cb->cb_op = NULL;
548 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 559 cb->cb_clp = clp;
549 &nfsd4_cb_probe_ops, (void *)clp); 560
550 if (status) 561 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
551 warn_no_callback_path(clp, status); 562 cb->cb_msg.rpc_argp = NULL;
563 cb->cb_msg.rpc_resp = NULL;
564 cb->cb_msg.rpc_cred = callback_cred;
565
566 cb->cb_ops = &nfsd4_cb_probe_ops;
567
568 queue_work(callback_wq, &cb->cb_work);
552} 569}
553 570
554/* 571/*
555 * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 572 * Poke the callback thread to process any updates to the callback
573 * parameters, and send a null probe.
556 */ 574 */
557void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 575void nfsd4_probe_callback(struct nfs4_client *clp)
558{ 576{
559 int status; 577 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
578 do_probe_callback(clp);
579}
560 580
581void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
582{
561 BUG_ON(atomic_read(&clp->cl_cb_set)); 583 BUG_ON(atomic_read(&clp->cl_cb_set));
562 584
563 status = setup_callback_client(clp, cb); 585 spin_lock(&clp->cl_lock);
564 if (status) { 586 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
565 warn_no_callback_path(clp, status); 587 spin_unlock(&clp->cl_lock);
566 return;
567 }
568 do_probe_callback(clp);
569} 588}
570 589
571/* 590/*
@@ -576,8 +595,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
576static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, 595static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
577 struct rpc_task *task) 596 struct rpc_task *task)
578{ 597{
579 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; 598 u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
580 u32 *ptr = (u32 *)clp->cl_sessionid.data;
581 int status = 0; 599 int status = 0;
582 600
583 dprintk("%s: %u:%u:%u:%u\n", __func__, 601 dprintk("%s: %u:%u:%u:%u\n", __func__,
@@ -589,14 +607,6 @@ static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
589 status = -EAGAIN; 607 status = -EAGAIN;
590 goto out; 608 goto out;
591 } 609 }
592
593 /*
594 * We'll need the clp during XDR encoding and decoding,
595 * and the sequence during decoding to verify the reply
596 */
597 args->args_seq.cbs_clp = clp;
598 task->tk_msg.rpc_resp = &args->args_seq;
599
600out: 610out:
601 dprintk("%s status=%d\n", __func__, status); 611 dprintk("%s status=%d\n", __func__, status);
602 return status; 612 return status;
@@ -608,13 +618,13 @@ out:
608 */ 618 */
609static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) 619static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
610{ 620{
611 struct nfs4_delegation *dp = calldata; 621 struct nfsd4_callback *cb = calldata;
622 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
612 struct nfs4_client *clp = dp->dl_client; 623 struct nfs4_client *clp = dp->dl_client;
613 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; 624 u32 minorversion = clp->cl_minorversion;
614 u32 minorversion = clp->cl_cb_conn.cb_minorversion;
615 int status = 0; 625 int status = 0;
616 626
617 args->args_seq.cbs_minorversion = minorversion; 627 cb->cb_minorversion = minorversion;
618 if (minorversion) { 628 if (minorversion) {
619 status = nfsd41_cb_setup_sequence(clp, task); 629 status = nfsd41_cb_setup_sequence(clp, task);
620 if (status) { 630 if (status) {
@@ -631,19 +641,20 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
631 641
632static void nfsd4_cb_done(struct rpc_task *task, void *calldata) 642static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
633{ 643{
634 struct nfs4_delegation *dp = calldata; 644 struct nfsd4_callback *cb = calldata;
645 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
635 struct nfs4_client *clp = dp->dl_client; 646 struct nfs4_client *clp = dp->dl_client;
636 647
637 dprintk("%s: minorversion=%d\n", __func__, 648 dprintk("%s: minorversion=%d\n", __func__,
638 clp->cl_cb_conn.cb_minorversion); 649 clp->cl_minorversion);
639 650
640 if (clp->cl_cb_conn.cb_minorversion) { 651 if (clp->cl_minorversion) {
641 /* No need for lock, access serialized in nfsd4_cb_prepare */ 652 /* No need for lock, access serialized in nfsd4_cb_prepare */
642 ++clp->cl_cb_seq_nr; 653 ++clp->cl_cb_session->se_cb_seq_nr;
643 clear_bit(0, &clp->cl_cb_slot_busy); 654 clear_bit(0, &clp->cl_cb_slot_busy);
644 rpc_wake_up_next(&clp->cl_cb_waitq); 655 rpc_wake_up_next(&clp->cl_cb_waitq);
645 dprintk("%s: freed slot, new seqid=%d\n", __func__, 656 dprintk("%s: freed slot, new seqid=%d\n", __func__,
646 clp->cl_cb_seq_nr); 657 clp->cl_cb_session->se_cb_seq_nr);
647 658
648 /* We're done looking into the sequence information */ 659 /* We're done looking into the sequence information */
649 task->tk_msg.rpc_resp = NULL; 660 task->tk_msg.rpc_resp = NULL;
@@ -653,7 +664,8 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
653 664
654static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 665static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
655{ 666{
656 struct nfs4_delegation *dp = calldata; 667 struct nfsd4_callback *cb = calldata;
668 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
657 struct nfs4_client *clp = dp->dl_client; 669 struct nfs4_client *clp = dp->dl_client;
658 struct rpc_clnt *current_rpc_client = clp->cl_cb_client; 670 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
659 671
@@ -667,28 +679,28 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
667 } 679 }
668 680
669 switch (task->tk_status) { 681 switch (task->tk_status) {
670 case -EIO: 682 case 0:
683 return;
684 case -EBADHANDLE:
685 case -NFS4ERR_BAD_STATEID:
686 /* Race: client probably got cb_recall
687 * before open reply granting delegation */
688 break;
689 default:
671 /* Network partition? */ 690 /* Network partition? */
672 atomic_set(&clp->cl_cb_set, 0); 691 atomic_set(&clp->cl_cb_set, 0);
673 warn_no_callback_path(clp, task->tk_status); 692 warn_no_callback_path(clp, task->tk_status);
674 if (current_rpc_client != task->tk_client) { 693 if (current_rpc_client != task->tk_client) {
675 /* queue a callback on the new connection: */ 694 /* queue a callback on the new connection: */
695 atomic_inc(&dp->dl_count);
676 nfsd4_cb_recall(dp); 696 nfsd4_cb_recall(dp);
677 return; 697 return;
678 } 698 }
679 case -EBADHANDLE:
680 case -NFS4ERR_BAD_STATEID:
681 /* Race: client probably got cb_recall
682 * before open reply granting delegation */
683 break;
684 default:
685 /* success, or error we can't handle */
686 return;
687 } 699 }
688 if (dp->dl_retries--) { 700 if (dp->dl_retries--) {
689 rpc_delay(task, 2*HZ); 701 rpc_delay(task, 2*HZ);
690 task->tk_status = 0; 702 task->tk_status = 0;
691 rpc_restart_call(task); 703 rpc_restart_call_prepare(task);
692 return; 704 return;
693 } else { 705 } else {
694 atomic_set(&clp->cl_cb_set, 0); 706 atomic_set(&clp->cl_cb_set, 0);
@@ -698,7 +710,8 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
698 710
699static void nfsd4_cb_recall_release(void *calldata) 711static void nfsd4_cb_recall_release(void *calldata)
700{ 712{
701 struct nfs4_delegation *dp = calldata; 713 struct nfsd4_callback *cb = calldata;
714 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
702 715
703 nfs4_put_delegation(dp); 716 nfs4_put_delegation(dp);
704} 717}
@@ -709,8 +722,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
709 .rpc_release = nfsd4_cb_recall_release, 722 .rpc_release = nfsd4_cb_recall_release,
710}; 723};
711 724
712static struct workqueue_struct *callback_wq;
713
714int nfsd4_create_callback_queue(void) 725int nfsd4_create_callback_queue(void)
715{ 726{
716 callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); 727 callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
@@ -725,59 +736,88 @@ void nfsd4_destroy_callback_queue(void)
725} 736}
726 737
727/* must be called under the state lock */ 738/* must be called under the state lock */
728void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) 739void nfsd4_shutdown_callback(struct nfs4_client *clp)
729{ 740{
730 struct rpc_clnt *old = clp->cl_cb_client; 741 set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
731
732 clp->cl_cb_client = new;
733 /* 742 /*
734 * After this, any work that saw the old value of cl_cb_client will 743 * Note this won't actually result in a null callback;
735 * be gone: 744 * instead, nfsd4_do_callback_rpc() will detect the killed
745 * client, destroy the rpc client, and stop:
736 */ 746 */
747 do_probe_callback(clp);
737 flush_workqueue(callback_wq); 748 flush_workqueue(callback_wq);
738 /* So we can safely shut it down: */
739 if (old)
740 rpc_shutdown_client(old);
741} 749}
742 750
743/* 751void nfsd4_release_cb(struct nfsd4_callback *cb)
744 * called with dp->dl_count inc'ed.
745 */
746static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
747{ 752{
748 struct nfs4_client *clp = dp->dl_client; 753 if (cb->cb_ops->rpc_release)
749 struct rpc_clnt *clnt = clp->cl_cb_client; 754 cb->cb_ops->rpc_release(cb);
750 struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; 755}
751 struct rpc_message msg = {
752 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
753 .rpc_cred = callback_cred
754 };
755 int status;
756 756
757 if (clnt == NULL) 757void nfsd4_process_cb_update(struct nfsd4_callback *cb)
758 return; /* Client is shutting down; give up. */ 758{
759 struct nfs4_cb_conn conn;
760 struct nfs4_client *clp = cb->cb_clp;
761 int err;
759 762
760 args->args_op = dp; 763 /*
761 msg.rpc_argp = args; 764 * This is either an update, or the client dying; in either case,
762 dp->dl_retries = 1; 765 * kill the old client:
763 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, 766 */
764 &nfsd4_cb_recall_ops, dp); 767 if (clp->cl_cb_client) {
765 if (status) 768 rpc_shutdown_client(clp->cl_cb_client);
766 nfs4_put_delegation(dp); 769 clp->cl_cb_client = NULL;
770 }
771 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
772 return;
773 spin_lock(&clp->cl_lock);
774 /*
775 * Only serialized callback code is allowed to clear these
776 * flags; main nfsd code can only set them:
777 */
778 BUG_ON(!clp->cl_cb_flags);
779 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
780 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
781 spin_unlock(&clp->cl_lock);
782
783 err = setup_callback_client(clp, &conn);
784 if (err)
785 warn_no_callback_path(clp, err);
767} 786}
768 787
769void nfsd4_do_callback_rpc(struct work_struct *w) 788void nfsd4_do_callback_rpc(struct work_struct *w)
770{ 789{
771 /* XXX: for now, just send off delegation recall. */ 790 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
772 /* In future, generalize to handle any sort of callback. */ 791 struct nfs4_client *clp = cb->cb_clp;
773 struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); 792 struct rpc_clnt *clnt;
774 struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
775 793
776 _nfsd4_cb_recall(dp); 794 if (clp->cl_cb_flags)
777} 795 nfsd4_process_cb_update(cb);
778 796
797 clnt = clp->cl_cb_client;
798 if (!clnt) {
799 /* Callback channel broken, or client killed; give up: */
800 nfsd4_release_cb(cb);
801 return;
802 }
803 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
804 cb->cb_ops, cb);
805}
779 806
780void nfsd4_cb_recall(struct nfs4_delegation *dp) 807void nfsd4_cb_recall(struct nfs4_delegation *dp)
781{ 808{
809 struct nfsd4_callback *cb = &dp->dl_recall;
810
811 dp->dl_retries = 1;
812 cb->cb_op = dp;
813 cb->cb_clp = dp->dl_client;
814 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
815 cb->cb_msg.rpc_argp = cb;
816 cb->cb_msg.rpc_resp = cb;
817 cb->cb_msg.rpc_cred = callback_cred;
818
819 cb->cb_ops = &nfsd4_cb_recall_ops;
820 dp->dl_retries = 1;
821
782 queue_work(callback_wq, &dp->dl_recall.cb_work); 822 queue_work(callback_wq, &dp->dl_recall.cb_work);
783} 823}
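The net effect of the callback rewrite above is a single funnel: every backchannel RPC is queued as a work item, and nfsd4_do_callback_rpc() first applies any pending connection update or kill (serialized on the workqueue, so only this path may clear the flag bits), then either gives up via the release method or fires the async RPC with the per-callback message and ops. The descriptor-plus-dispatcher shape, sketched in userspace with stubbed plumbing rather than the kernel API:

#include <stddef.h>

struct cb_desc;

struct cb_desc_ops {
	void (*done)(struct cb_desc *cb, int status);
	void (*release)(struct cb_desc *cb);
};

struct cb_desc {
	void *op;			/* e.g. the delegation to recall */
	const struct cb_desc_ops *ops;
	unsigned long update_flags;	/* the CB_UPDATE / KILL bits */
	int (*send_async)(struct cb_desc *cb);	/* NULL: channel gone */
};

/* Runs on a single-threaded queue, like nfsd4_do_callback_rpc(). */
static void do_callback(struct cb_desc *cb)
{
	if (cb->update_flags) {
		/* Serialized here: tear down and/or rebuild the
		 * transport before sending anything. */
	}
	if (!cb->send_async) {
		/* Channel broken or client killed: drop the callback. */
		if (cb->ops->release)
			cb->ops->release(cb);
		return;
	}
	cb->send_async(cb);	/* async; cb->ops->done runs on completion */
}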
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c78dbf49342..f0695e815f0 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -482,109 +482,26 @@ nfsd_idmap_shutdown(void)
482 cache_unregister(&nametoid_cache); 482 cache_unregister(&nametoid_cache);
483} 483}
484 484
485/*
486 * Deferred request handling
487 */
488
489struct idmap_defer_req {
490 struct cache_req req;
491 struct cache_deferred_req deferred_req;
492 wait_queue_head_t waitq;
493 atomic_t count;
494};
495
496static inline void
497put_mdr(struct idmap_defer_req *mdr)
498{
499 if (atomic_dec_and_test(&mdr->count))
500 kfree(mdr);
501}
502
503static inline void
504get_mdr(struct idmap_defer_req *mdr)
505{
506 atomic_inc(&mdr->count);
507}
508
509static void
510idmap_revisit(struct cache_deferred_req *dreq, int toomany)
511{
512 struct idmap_defer_req *mdr =
513 container_of(dreq, struct idmap_defer_req, deferred_req);
514
515 wake_up(&mdr->waitq);
516 put_mdr(mdr);
517}
518
519static struct cache_deferred_req *
520idmap_defer(struct cache_req *req)
521{
522 struct idmap_defer_req *mdr =
523 container_of(req, struct idmap_defer_req, req);
524
525 mdr->deferred_req.revisit = idmap_revisit;
526 get_mdr(mdr);
527 return (&mdr->deferred_req);
528}
529
530static inline int
531do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key,
532 struct cache_detail *detail, struct ent **item,
533 struct idmap_defer_req *mdr)
534{
535 *item = lookup_fn(key);
536 if (!*item)
537 return -ENOMEM;
538 return cache_check(detail, &(*item)->h, &mdr->req);
539}
540
541static inline int
542do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
543 struct ent *key, struct cache_detail *detail,
544 struct ent **item)
545{
546 int ret = -ENOMEM;
547
548 *item = lookup_fn(key);
549 if (!*item)
550 goto out_err;
551 ret = -ETIMEDOUT;
552 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
553 || (*item)->h.expiry_time < get_seconds()
554 || detail->flush_time > (*item)->h.last_refresh)
555 goto out_put;
556 ret = -ENOENT;
557 if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
558 goto out_put;
559 return 0;
560out_put:
561 cache_put(&(*item)->h, detail);
562out_err:
563 *item = NULL;
564 return ret;
565}
566
567static int 485static int
568idmap_lookup(struct svc_rqst *rqstp, 486idmap_lookup(struct svc_rqst *rqstp,
569 struct ent *(*lookup_fn)(struct ent *), struct ent *key, 487 struct ent *(*lookup_fn)(struct ent *), struct ent *key,
570 struct cache_detail *detail, struct ent **item) 488 struct cache_detail *detail, struct ent **item)
571{ 489{
572 struct idmap_defer_req *mdr;
573 int ret; 490 int ret;
574 491
575 mdr = kzalloc(sizeof(*mdr), GFP_KERNEL); 492 *item = lookup_fn(key);
576 if (!mdr) 493 if (!*item)
577 return -ENOMEM; 494 return -ENOMEM;
578 atomic_set(&mdr->count, 1); 495 retry:
579 init_waitqueue_head(&mdr->waitq); 496 ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
580 mdr->req.defer = idmap_defer; 497
581 ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr); 498 if (ret == -ETIMEDOUT) {
582 if (ret == -EAGAIN) { 499 struct ent *prev_item = *item;
583 wait_event_interruptible_timeout(mdr->waitq, 500 *item = lookup_fn(key);
584 test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ); 501 if (*item != prev_item)
585 ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item); 502 goto retry;
503 cache_put(&(*item)->h, detail);
586 } 504 }
587 put_mdr(mdr);
588 return ret; 505 return ret;
589} 506}
590 507
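The replacement idmap_lookup() above trades the hand-rolled deferral machinery for the generic one: cache_check() is handed the request's own cache handle, and an -ETIMEDOUT upcall is retried only as long as a fresh lookup returns a different cache entry. The bounded-retry shape, restated with hypothetical cache primitives (reference handling is simplified; the kernel's cache_check() has its own rules about when it drops a reference):

#include <errno.h>
#include <stddef.h>

struct ent;

extern struct ent *lookup(const struct ent *key);	/* takes a ref */
extern int check(struct ent *item);		/* 0, -ETIMEDOUT, ... */
extern void put(struct ent *item);		/* drops a ref */

static int lookup_retry(const struct ent *key, struct ent **item)
{
	int ret;

	*item = lookup(key);
	if (!*item)
		return -ENOMEM;
retry:
	ret = check(*item);
	if (ret == -ETIMEDOUT) {
		struct ent *prev = *item;

		*item = lookup(key);
		if (*item != prev)
			goto retry;	/* a newer entry appeared: retry */
		put(*item);		/* same stale entry: give up */
	}
	return ret;
}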
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 59ec449b0c7..0cdfd022bb7 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1031,8 +1031,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1031 resp->cstate.session = NULL; 1031 resp->cstate.session = NULL;
1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); 1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); 1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1034 /* Use the deferral mechanism only for NFSv4.0 compounds */ 1034 /*
1035 rqstp->rq_usedeferral = (args->minorversion == 0); 1035 * Don't use the deferral mechanism for NFSv4; compounds make it
1036 * too hard to avoid non-idempotency problems.
1037 */
1038 rqstp->rq_usedeferral = 0;
1036 1039
1037 /* 1040 /*
1038 * According to RFC3010, this takes precedence over all other errors. 1041 * According to RFC3010, this takes precedence over all other errors.
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4a273475877..f1e5ec6b510 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -33,7 +33,7 @@
33*/ 33*/
34 34
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/smp_lock.h> 36#include <linux/fs.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/swap.h> 39#include <linux/swap.h>
@@ -51,7 +51,6 @@ static time_t boot_time;
51static u32 current_ownerid = 1; 51static u32 current_ownerid = 1;
52static u32 current_fileid = 1; 52static u32 current_fileid = 1;
53static u32 current_delegid = 1; 53static u32 current_delegid = 1;
54static u32 nfs4_init;
55static stateid_t zerostateid; /* bits all 0 */ 54static stateid_t zerostateid; /* bits all 0 */
56static stateid_t onestateid; /* bits all 1 */ 55static stateid_t onestateid; /* bits all 1 */
57static u64 current_sessionid = 1; 56static u64 current_sessionid = 1;
@@ -163,14 +162,60 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
163static struct list_head file_hashtbl[FILE_HASH_SIZE]; 162static struct list_head file_hashtbl[FILE_HASH_SIZE];
164static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; 163static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
165 164
165static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
166{
167 BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
168 atomic_inc(&fp->fi_access[oflag]);
169}
170
171static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
172{
173 if (oflag == O_RDWR) {
174 __nfs4_file_get_access(fp, O_RDONLY);
175 __nfs4_file_get_access(fp, O_WRONLY);
176 } else
177 __nfs4_file_get_access(fp, oflag);
178}
179
180static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
181{
182 if (fp->fi_fds[oflag]) {
183 fput(fp->fi_fds[oflag]);
184 fp->fi_fds[oflag] = NULL;
185 }
186}
187
188static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
189{
190 if (atomic_dec_and_test(&fp->fi_access[oflag])) {
191 nfs4_file_put_fd(fp, O_RDWR);
192 nfs4_file_put_fd(fp, oflag);
193 }
194}
195
196static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
197{
198 if (oflag == O_RDWR) {
199 __nfs4_file_put_access(fp, O_RDONLY);
200 __nfs4_file_put_access(fp, O_WRONLY);
201 } else
202 __nfs4_file_put_access(fp, oflag);
203}
204
166static struct nfs4_delegation * 205static struct nfs4_delegation *
167alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) 206alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
168{ 207{
169 struct nfs4_delegation *dp; 208 struct nfs4_delegation *dp;
170 struct nfs4_file *fp = stp->st_file; 209 struct nfs4_file *fp = stp->st_file;
171 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
172 210
173 dprintk("NFSD alloc_init_deleg\n"); 211 dprintk("NFSD alloc_init_deleg\n");
212 /*
213 * Major work on the lease subsystem (for example, to support
214 * calbacks on stat) will be required before we can support
215 * write delegations properly.
216 */
217 if (type != NFS4_OPEN_DELEGATE_READ)
218 return NULL;
174 if (fp->fi_had_conflict) 219 if (fp->fi_had_conflict)
175 return NULL; 220 return NULL;
176 if (num_delegations > max_delegations) 221 if (num_delegations > max_delegations)
@@ -185,11 +230,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
185 dp->dl_client = clp; 230 dp->dl_client = clp;
186 get_nfs4_file(fp); 231 get_nfs4_file(fp);
187 dp->dl_file = fp; 232 dp->dl_file = fp;
233 nfs4_file_get_access(fp, O_RDONLY);
188 dp->dl_flock = NULL; 234 dp->dl_flock = NULL;
189 get_file(stp->st_vfs_file);
190 dp->dl_vfs_file = stp->st_vfs_file;
191 dp->dl_type = type; 235 dp->dl_type = type;
192 dp->dl_ident = cb->cb_ident;
193 dp->dl_stateid.si_boot = boot_time; 236 dp->dl_stateid.si_boot = boot_time;
194 dp->dl_stateid.si_stateownerid = current_delegid++; 237 dp->dl_stateid.si_stateownerid = current_delegid++;
195 dp->dl_stateid.si_fileid = 0; 238 dp->dl_stateid.si_fileid = 0;
@@ -222,15 +265,12 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
222static void 265static void
223nfs4_close_delegation(struct nfs4_delegation *dp) 266nfs4_close_delegation(struct nfs4_delegation *dp)
224{ 267{
225 struct file *filp = dp->dl_vfs_file; 268 struct file *filp = find_readable_file(dp->dl_file);
226 269
227 dprintk("NFSD: close_delegation dp %p\n",dp); 270 dprintk("NFSD: close_delegation dp %p\n",dp);
228 dp->dl_vfs_file = NULL;
229 /* The following nfsd_close may not actually close the file,
230 * but we want to remove the lease in any case. */
231 if (dp->dl_flock) 271 if (dp->dl_flock)
232 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 272 vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
233 nfsd_close(filp); 273 nfs4_file_put_access(dp->dl_file, O_RDONLY);
234} 274}
235 275
236/* Called under the state lock. */ 276/* Called under the state lock. */
@@ -302,8 +342,12 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
302 342
303static void release_lock_stateid(struct nfs4_stateid *stp) 343static void release_lock_stateid(struct nfs4_stateid *stp)
304{ 344{
345 struct file *file;
346
305 unhash_generic_stateid(stp); 347 unhash_generic_stateid(stp);
306 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner); 348 file = find_any_file(stp->st_file);
349 if (file)
350 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
307 free_generic_stateid(stp); 351 free_generic_stateid(stp);
308} 352}
309 353
@@ -341,11 +385,85 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp)
341 } 385 }
342} 386}
343 387
388/*
389 * We store the NONE, READ, WRITE, and BOTH bits separately in the
390 * st_{access,deny}_bmap field of the stateid, in order to track not
391 * only what share bits are currently in force, but also what
392 * combinations of share bits previous opens have used. This allows us
393 * to enforce the recommendation of rfc 3530 14.2.19 that the server
394 * return an error if the client attempts to downgrade to a combination
395 * of share bits not explicable by closing some of its previous opens.
396 *
397 * XXX: This enforcement is actually incomplete, since we don't keep
398 * track of access/deny bit combinations; so, e.g., we allow:
399 *
400 * OPEN allow read, deny write
401 * OPEN allow both, deny none
402 * DOWNGRADE allow read, deny none
403 *
404 * which we should reject.
405 */
406static void
407set_access(unsigned int *access, unsigned long bmap) {
408 int i;
409
410 *access = 0;
411 for (i = 1; i < 4; i++) {
412 if (test_bit(i, &bmap))
413 *access |= i;
414 }
415}
416
417static void
418set_deny(unsigned int *deny, unsigned long bmap) {
419 int i;
420
421 *deny = 0;
422 for (i = 0; i < 4; i++) {
423 if (test_bit(i, &bmap))
424 *deny |= i ;
425 }
426}
427
428static int
429test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
430 unsigned int access, deny;
431
432 set_access(&access, stp->st_access_bmap);
433 set_deny(&deny, stp->st_deny_bmap);
434 if ((access & open->op_share_deny) || (deny & open->op_share_access))
435 return 0;
436 return 1;
437}
438
439static int nfs4_access_to_omode(u32 access)
440{
441 switch (access & NFS4_SHARE_ACCESS_BOTH) {
442 case NFS4_SHARE_ACCESS_READ:
443 return O_RDONLY;
444 case NFS4_SHARE_ACCESS_WRITE:
445 return O_WRONLY;
446 case NFS4_SHARE_ACCESS_BOTH:
447 return O_RDWR;
448 }
449 BUG();
450}
451
452static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
453{
454 unsigned int access;
455
456 set_access(&access, stp->st_access_bmap);
457 return nfs4_access_to_omode(access);
458}
459
344static void release_open_stateid(struct nfs4_stateid *stp) 460static void release_open_stateid(struct nfs4_stateid *stp)
345{ 461{
462 int oflag = nfs4_access_bmap_to_omode(stp);
463
346 unhash_generic_stateid(stp); 464 unhash_generic_stateid(stp);
347 release_stateid_lockowners(stp); 465 release_stateid_lockowners(stp);
348 nfsd_close(stp->st_vfs_file); 466 nfs4_file_put_access(stp->st_file, oflag);
349 free_generic_stateid(stp); 467 free_generic_stateid(stp);
350} 468}
351 469
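Two mechanisms in the state-handling hunks above are worth seeing together: nfs4_access_to_omode() maps the share-access bitmap to an open mode, and the new per-file access counters treat one O_RDWR reference as a read reference plus a write reference, dropping the cached struct file when a counter reaches zero. A userspace sketch of both; put_fd() is a stub standing in for the fput()/fi_fds handling (the kernel also drops a cached O_RDWR file at that point), and the array indexing relies on O_RDONLY and O_WRONLY being 0 and 1:

#include <fcntl.h>
#include <stdio.h>

#define NFS4_SHARE_ACCESS_READ	1
#define NFS4_SHARE_ACCESS_WRITE	2
#define NFS4_SHARE_ACCESS_BOTH	3

struct file_sketch { int access[2]; };	/* indexed by O_RDONLY/O_WRONLY */

static int access_to_omode(unsigned int access)
{
	switch (access & NFS4_SHARE_ACCESS_BOTH) {
	case NFS4_SHARE_ACCESS_READ:  return O_RDONLY;
	case NFS4_SHARE_ACCESS_WRITE: return O_WRONLY;
	case NFS4_SHARE_ACCESS_BOTH:  return O_RDWR;
	}
	return -1;			/* the kernel BUG()s here */
}

static void put_fd(struct file_sketch *fp, int oflag)
{
	/* Stub: the kernel fput()s the cached file for this mode. */
	printf("drop cached file for omode %d\n", oflag);
}

static void get_access(struct file_sketch *fp, int oflag)
{
	if (oflag == O_RDWR) {		/* BOTH = one read + one write */
		fp->access[O_RDONLY]++;
		fp->access[O_WRONLY]++;
	} else
		fp->access[oflag]++;
}

static void put_access(struct file_sketch *fp, int oflag)
{
	if (oflag == O_RDWR) {
		put_access(fp, O_RDONLY);
		put_access(fp, O_WRONLY);
		return;
	}
	if (--fp->access[oflag] == 0)
		put_fd(fp, oflag);
}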
@@ -415,171 +533,258 @@ gen_sessionid(struct nfsd4_session *ses)
415 */ 533 */
416#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) 534#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
417 535
536static void
537free_session_slots(struct nfsd4_session *ses)
538{
539 int i;
540
541 for (i = 0; i < ses->se_fchannel.maxreqs; i++)
542 kfree(ses->se_slots[i]);
543}
544
418/* 545/*
419 * Give the client the number of ca_maxresponsesize_cached slots it 546 * We don't actually need to cache the rpc and session headers, so we
420 * requests, of size bounded by NFSD_SLOT_CACHE_SIZE, 547 * can allocate a little less for each slot:
421 * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more 548 */
422 * than NFSD_MAX_SLOTS_PER_SESSION. 549static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
423 * 550{
424 * If we run out of reserved DRC memory we should (up to a point) 551 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
552}
553
554static int nfsd4_sanitize_slot_size(u32 size)
555{
556 size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
557 size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
558
559 return size;
560}
561
562/*
563 * XXX: If we run out of reserved DRC memory we could (up to a point)
425 * re-negotiate active sessions and reduce their slot usage to make 564 * re-negotiate active sessions and reduce their slot usage to make
426 * rooom for new connections. For now we just fail the create session. 565 * rooom for new connections. For now we just fail the create session.
427 */ 566 */
428static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) 567static int nfsd4_get_drc_mem(int slotsize, u32 num)
429{ 568{
430 int mem, size = fchan->maxresp_cached; 569 int avail;
431 570
432 if (fchan->maxreqs < 1) 571 num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
433 return nfserr_inval;
434 572
435 if (size < NFSD_MIN_HDR_SEQ_SZ) 573 spin_lock(&nfsd_drc_lock);
436 size = NFSD_MIN_HDR_SEQ_SZ; 574 avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
437 size -= NFSD_MIN_HDR_SEQ_SZ; 575 nfsd_drc_max_mem - nfsd_drc_mem_used);
438 if (size > NFSD_SLOT_CACHE_SIZE) 576 num = min_t(int, num, avail / slotsize);
439 size = NFSD_SLOT_CACHE_SIZE; 577 nfsd_drc_mem_used += num * slotsize;
440 578 spin_unlock(&nfsd_drc_lock);
441 /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */ 579
442 mem = fchan->maxreqs * size; 580 return num;
443 if (mem > NFSD_MAX_MEM_PER_SESSION) { 581}
444 fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
445 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
446 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
447 mem = fchan->maxreqs * size;
448 }
449 582
583static void nfsd4_put_drc_mem(int slotsize, int num)
584{
450 spin_lock(&nfsd_drc_lock); 585 spin_lock(&nfsd_drc_lock);
451 /* bound the total session drc memory usage */ 586 nfsd_drc_mem_used -= slotsize * num;
452 if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
453 fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
454 mem = fchan->maxreqs * size;
455 }
456 nfsd_drc_mem_used += mem;
457 spin_unlock(&nfsd_drc_lock); 587 spin_unlock(&nfsd_drc_lock);
588}
458 589
459 if (fchan->maxreqs == 0) 590static struct nfsd4_session *alloc_session(int slotsize, int numslots)
460 return nfserr_serverfault; 591{
592 struct nfsd4_session *new;
593 int mem, i;
461 594
462 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; 595 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
463 return 0; 596 + sizeof(struct nfsd4_session) > PAGE_SIZE);
597 mem = numslots * sizeof(struct nfsd4_slot *);
598
599 new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
600 if (!new)
601 return NULL;
602 /* allocate each struct nfsd4_slot and data cache in one piece */
603 for (i = 0; i < numslots; i++) {
604 mem = sizeof(struct nfsd4_slot) + slotsize;
605 new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
+		if (!new->se_slots[i])
+			goto out_free;
+	}
+	return new;
+out_free:
+	while (i--)
+		kfree(new->se_slots[i]);
+	kfree(new);
+	return NULL;
 }
 
-/*
- * fchan holds the client values on input, and the server values on output
- * sv_max_mesg is the maximum payload plus one page for overhead.
- */
-static int init_forechannel_attrs(struct svc_rqst *rqstp,
-				  struct nfsd4_channel_attrs *session_fchan,
-				  struct nfsd4_channel_attrs *fchan)
+static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
 {
-	int status = 0;
-	__u32 maxcount = nfsd_serv->sv_max_mesg;
+	u32 maxrpc = nfsd_serv->sv_max_mesg;
 
-	/* headerpadsz set to zero in encode routine */
+	new->maxreqs = numslots;
+	new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
+	new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
+	new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
+	new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+}
 
-	/* Use the client's max request and max response size if possible */
-	if (fchan->maxreq_sz > maxcount)
-		fchan->maxreq_sz = maxcount;
-	session_fchan->maxreq_sz = fchan->maxreq_sz;
+static void free_conn(struct nfsd4_conn *c)
+{
+	svc_xprt_put(c->cn_xprt);
+	kfree(c);
+}
 
-	if (fchan->maxresp_sz > maxcount)
-		fchan->maxresp_sz = maxcount;
-	session_fchan->maxresp_sz = fchan->maxresp_sz;
+static void nfsd4_conn_lost(struct svc_xpt_user *u)
+{
+	struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
+	struct nfs4_client *clp = c->cn_session->se_client;
 
-	/* Use the client's maxops if possible */
-	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
-		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
-	session_fchan->maxops = fchan->maxops;
+	spin_lock(&clp->cl_lock);
+	if (!list_empty(&c->cn_persession)) {
+		list_del(&c->cn_persession);
+		free_conn(c);
+	}
+	spin_unlock(&clp->cl_lock);
+}
 
-	/* FIXME: Error means no more DRC pages so the server should
-	 * recover pages from existing sessions. For now fail session
-	 * creation.
-	 */
-	status = set_forechannel_drc_size(fchan);
+static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
+{
+	struct nfsd4_conn *conn;
 
-	session_fchan->maxresp_cached = fchan->maxresp_cached;
-	session_fchan->maxreqs = fchan->maxreqs;
+	conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
+	if (!conn)
+		return NULL;
+	svc_xprt_get(rqstp->rq_xprt);
+	conn->cn_xprt = rqstp->rq_xprt;
+	conn->cn_flags = flags;
+	INIT_LIST_HEAD(&conn->cn_xpt_user.list);
+	return conn;
+}
 
-	dprintk("%s status %d\n", __func__, status);
-	return status;
+static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
+{
+	conn->cn_session = ses;
+	list_add(&conn->cn_persession, &ses->se_conns);
 }
 
-static void
-free_session_slots(struct nfsd4_session *ses)
+static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
 {
-	int i;
+	struct nfs4_client *clp = ses->se_client;
 
-	for (i = 0; i < ses->se_fchannel.maxreqs; i++)
-		kfree(ses->se_slots[i]);
+	spin_lock(&clp->cl_lock);
+	__nfsd4_hash_conn(conn, ses);
+	spin_unlock(&clp->cl_lock);
 }
 
-/*
- * We don't actually need to cache the rpc and session headers, so we
- * can allocate a little less for each slot:
- */
-static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
+static void nfsd4_register_conn(struct nfsd4_conn *conn)
 {
-	return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+	conn->cn_xpt_user.callback = nfsd4_conn_lost;
+	register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
 }
 
-static int
-alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
-		   struct nfsd4_create_session *cses)
+static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
 {
-	struct nfsd4_session *new, tmp;
-	struct nfsd4_slot *sp;
-	int idx, slotsize, cachesize, i;
-	int status;
+	struct nfsd4_conn *conn;
+	u32 flags = NFS4_CDFC4_FORE;
 
-	memset(&tmp, 0, sizeof(tmp));
+	if (ses->se_flags & SESSION4_BACK_CHAN)
+		flags |= NFS4_CDFC4_BACK;
+	conn = alloc_conn(rqstp, flags);
+	if (!conn)
+		return nfserr_jukebox;
+	nfsd4_hash_conn(conn, ses);
+	nfsd4_register_conn(conn);
+	return nfs_ok;
+}
 
-	/* FIXME: For now, we just accept the client back channel attributes. */
-	tmp.se_bchannel = cses->back_channel;
-	status = init_forechannel_attrs(rqstp, &tmp.se_fchannel,
-					&cses->fore_channel);
-	if (status)
-		goto out;
+static void nfsd4_del_conns(struct nfsd4_session *s)
+{
+	struct nfs4_client *clp = s->se_client;
+	struct nfsd4_conn *c;
 
-	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
-		     + sizeof(struct nfsd4_session) > PAGE_SIZE);
+	spin_lock(&clp->cl_lock);
+	while (!list_empty(&s->se_conns)) {
+		c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
+		list_del_init(&c->cn_persession);
+		spin_unlock(&clp->cl_lock);
 
-	status = nfserr_serverfault;
-	/* allocate struct nfsd4_session and slot table pointers in one piece */
-	slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
-	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
-	if (!new)
-		goto out;
+		unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
+		free_conn(c);
 
-	memcpy(new, &tmp, sizeof(*new));
+		spin_lock(&clp->cl_lock);
+	}
+	spin_unlock(&clp->cl_lock);
+}
 
-	/* allocate each struct nfsd4_slot and data cache in one piece */
-	cachesize = slot_bytes(&new->se_fchannel);
-	for (i = 0; i < new->se_fchannel.maxreqs; i++) {
-		sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
-		if (!sp)
-			goto out_free;
-		new->se_slots[i] = sp;
+void free_session(struct kref *kref)
+{
+	struct nfsd4_session *ses;
+	int mem;
+
+	ses = container_of(kref, struct nfsd4_session, se_ref);
+	nfsd4_del_conns(ses);
+	spin_lock(&nfsd_drc_lock);
+	mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
+	nfsd_drc_mem_used -= mem;
+	spin_unlock(&nfsd_drc_lock);
+	free_session_slots(ses);
+	kfree(ses);
+}
+
+static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+{
+	struct nfsd4_session *new;
+	struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
+	int numslots, slotsize;
+	int status;
+	int idx;
+
+	/*
+	 * Note decreasing slot size below client's request may
+	 * make it difficult for client to function correctly, whereas
+	 * decreasing the number of slots will (just?) affect
+	 * performance.  When short on memory we therefore prefer to
+	 * decrease number of slots instead of their size.
+	 */
+	slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
+	numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
+
+	new = alloc_session(slotsize, numslots);
+	if (!new) {
+		nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
+		return NULL;
 	}
+	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
 
 	new->se_client = clp;
 	gen_sessionid(new);
-	idx = hash_sessionid(&new->se_sessionid);
-	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
-	       NFS4_MAX_SESSIONID_LEN);
 
+	INIT_LIST_HEAD(&new->se_conns);
+
+	new->se_cb_seq_nr = 1;
 	new->se_flags = cses->flags;
+	new->se_cb_prog = cses->callback_prog;
 	kref_init(&new->se_ref);
+	idx = hash_sessionid(&new->se_sessionid);
 	spin_lock(&client_lock);
 	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
 	spin_unlock(&client_lock);
 
-	status = nfs_ok;
-out:
-	return status;
-out_free:
-	free_session_slots(new);
-	kfree(new);
-	goto out;
+	status = nfsd4_new_conn(rqstp, new);
+	/* whoops: benny points out, status is ignored!  (err, or bogus) */
+	if (status) {
+		free_session(&new->se_ref);
+		return NULL;
+	}
+	if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
+		struct sockaddr *sa = svc_addr(rqstp);
+
+		clp->cl_cb_session = new;
+		clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+		svc_xprt_get(rqstp->rq_xprt);
+		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
+		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+		nfsd4_probe_callback(clp);
+	}
+	return new;
 }
 
 /* caller must hold client_lock */
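The comment in the new alloc_init_session captures the sizing policy: when DRC memory is short, cut the number of slots, never the negotiated slot size. Below is a minimal userspace sketch of that negotiation; the constants and helper names are illustrative stand-ins, not the kernel's actual accounting.

#include <stdio.h>

/* Illustrative stand-ins; the kernel derives these from its DRC bookkeeping. */
#define HDR_SEQ_SZ   200     /* per-reply header overhead we need not cache */
#define SLOT_CEILING 2048    /* assumed upper bound on one slot's cache */
#define DRC_BUDGET   65536   /* assumed memory still available for caching */

static unsigned int sanitize_slot_size(unsigned int maxresp_cached)
{
        unsigned int want = maxresp_cached - HDR_SEQ_SZ;
        return want > SLOT_CEILING ? SLOT_CEILING : want;
}

/* Prefer shrinking the slot count, never the per-slot size. */
static unsigned int get_drc_mem(unsigned int slotsize, unsigned int wanted)
{
        unsigned int affordable = DRC_BUDGET / slotsize;
        return wanted < affordable ? wanted : affordable;
}

int main(void)
{
        unsigned int slotsize = sanitize_slot_size(4096);
        unsigned int numslots = get_drc_mem(slotsize, 64);
        printf("slotsize=%u numslots=%u\n", slotsize, numslots);
        return 0;
}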
@@ -591,10 +796,8 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 
 	dump_sessionid(__func__, sessionid);
 	idx = hash_sessionid(sessionid);
-	dprintk("%s: idx is %d\n", __func__, idx);
 	/* Search in the appropriate list */
 	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
-		dump_sessionid("list traversal", &elem->se_sessionid);
 		if (!memcmp(elem->se_sessionid.data, sessionid->data,
 			    NFS4_MAX_SESSIONID_LEN)) {
 			return elem;
@@ -613,21 +816,6 @@ unhash_session(struct nfsd4_session *ses)
 	list_del(&ses->se_perclnt);
 }
 
-void
-free_session(struct kref *kref)
-{
-	struct nfsd4_session *ses;
-	int mem;
-
-	ses = container_of(kref, struct nfsd4_session, se_ref);
-	spin_lock(&nfsd_drc_lock);
-	mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
-	nfsd_drc_mem_used -= mem;
-	spin_unlock(&nfsd_drc_lock);
-	free_session_slots(ses);
-	kfree(ses);
-}
-
 /* must be called under the client_lock */
 static inline void
 renew_client_locked(struct nfs4_client *clp)
@@ -694,6 +882,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static inline void
 free_client(struct nfs4_client *clp)
 {
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				se_perclnt);
+		list_del(&ses->se_perclnt);
+		nfsd4_put_session(ses);
+	}
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -714,22 +909,18 @@ release_session_client(struct nfsd4_session *session)
 	} else
 		renew_client_locked(clp);
 	spin_unlock(&client_lock);
-	nfsd4_put_session(session);
 }
 
 /* must be called under the client_lock */
 static inline void
 unhash_client_locked(struct nfs4_client *clp)
 {
+	struct nfsd4_session *ses;
+
 	mark_client_expired(clp);
 	list_del(&clp->cl_lru);
-	while (!list_empty(&clp->cl_sessions)) {
-		struct nfsd4_session *ses;
-		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-				se_perclnt);
-		unhash_session(ses);
-		nfsd4_put_session(ses);
-	}
+	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
+		list_del_init(&ses->se_hash);
 }
 
 static void
@@ -758,7 +949,7 @@ expire_client(struct nfs4_client *clp)
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
 		release_openowner(sop);
 	}
-	nfsd4_set_callback_client(clp, NULL);
+	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
 	list_del(&clp->cl_idhash);
@@ -843,6 +1034,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	if (clp == NULL)
 		return NULL;
 
+	INIT_LIST_HEAD(&clp->cl_sessions);
+
 	princ = svc_gss_principal(rqstp);
 	if (princ) {
 		clp->cl_principal = kstrdup(princ, GFP_KERNEL);
@@ -859,8 +1052,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
-	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	spin_lock_init(&clp->cl_lock);
+	INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
 	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -869,7 +1063,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	clp->cl_flavor = rqstp->rq_flavor;
 	copy_cred(&clp->cl_cred, &rqstp->rq_cred);
 	gen_confirm(clp);
-
+	clp->cl_cb_session = NULL;
 	return clp;
 }
@@ -981,7 +1175,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 {
-	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
+	struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
 	unsigned short expected_family;
 
 	/* Currently, we only support tcp and tcp6 for the callback channel */
@@ -994,24 +1188,23 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 	else
 		goto out_err;
 
-	cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
+	conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
 					se->se_callback_addr_len,
-					(struct sockaddr *)&cb->cb_addr,
-					sizeof(cb->cb_addr));
+					(struct sockaddr *)&conn->cb_addr,
+					sizeof(conn->cb_addr));
 
-	if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
+	if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
 		goto out_err;
 
-	if (cb->cb_addr.ss_family == AF_INET6)
-		((struct sockaddr_in6 *)&cb->cb_addr)->sin6_scope_id = scopeid;
+	if (conn->cb_addr.ss_family == AF_INET6)
+		((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
 
-	cb->cb_minorversion = 0;
-	cb->cb_prog = se->se_callback_prog;
-	cb->cb_ident = se->se_callback_ident;
+	conn->cb_prog = se->se_callback_prog;
+	conn->cb_ident = se->se_callback_ident;
 	return;
 out_err:
-	cb->cb_addr.ss_family = AF_UNSPEC;
-	cb->cb_addrlen = 0;
+	conn->cb_addr.ss_family = AF_UNSPEC;
+	conn->cb_addrlen = 0;
 	dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
 		"will not receive delegations\n",
 		clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -1220,7 +1413,7 @@ out_new:
 	/* Normal case */
 	new = create_client(exid->clname, dname, rqstp, &verf);
 	if (new == NULL) {
-		status = nfserr_serverfault;
+		status = nfserr_jukebox;
 		goto out;
 	}
 
@@ -1298,7 +1491,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 {
 	struct sockaddr *sa = svc_addr(rqstp);
 	struct nfs4_client *conf, *unconf;
+	struct nfsd4_session *new;
 	struct nfsd4_clid_slot *cs_slot = NULL;
+	bool confirm_me = false;
 	int status = 0;
 
 	nfs4_lock_state();
@@ -1321,7 +1516,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 				cs_slot->sl_seqid, cr_ses->seqid);
 			goto out;
 		}
-		cs_slot->sl_seqid++;
 	} else if (unconf) {
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
 		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1334,25 +1528,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		if (status) {
 			/* an unconfirmed replay returns misordered */
 			status = nfserr_seq_misordered;
-			goto out_cache;
+			goto out;
 		}
 
-		cs_slot->sl_seqid++; /* from 0 to 1 */
-		move_to_confirmed(unconf);
-
-		if (cr_ses->flags & SESSION4_BACK_CHAN) {
-			unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
-			svc_xprt_get(rqstp->rq_xprt);
-			rpc_copy_addr(
-				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
-				sa);
-			unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-			unconf->cl_cb_conn.cb_minorversion =
-				cstate->minorversion;
-			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
-			unconf->cl_cb_seq_nr = 1;
-			nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
-		}
+		confirm_me = true;
 		conf = unconf;
 	} else {
 		status = nfserr_stale_clientid;
@@ -1360,22 +1539,30 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	}
 
 	/*
+	 * XXX: we should probably set this at creation time, and check
+	 * for consistent minorversion use throughout:
+	 */
+	conf->cl_minorversion = 1;
+	/*
 	 * We do not support RDMA or persistent sessions
 	 */
 	cr_ses->flags &= ~SESSION4_PERSIST;
 	cr_ses->flags &= ~SESSION4_RDMA;
 
-	status = alloc_init_session(rqstp, conf, cr_ses);
-	if (status)
+	status = nfserr_jukebox;
+	new = alloc_init_session(rqstp, conf, cr_ses);
+	if (!new)
 		goto out;
-
-	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+	status = nfs_ok;
+	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
+	cs_slot->sl_seqid++;
 	cr_ses->seqid = cs_slot->sl_seqid;
 
-out_cache:
 	/* cache solo and embedded create sessions under the state lock */
 	nfsd4_cache_create_session(cr_ses, cs_slot, status);
+	if (confirm_me)
+		move_to_confirmed(conf);
 out:
 	nfs4_unlock_state();
 	dprintk("%s returns %d\n", __func__, ntohl(status));
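The reordering above means cs_slot->sl_seqid is only bumped once the session actually exists, so a failed CREATE_SESSION can be retried with the same seqid. A rough sketch of the slot check this supports (simplified: the real slot also caches the reply to serve on replay):

#include <stdio.h>

enum { SLOT_OK, SLOT_REPLAY, SLOT_MISORDERED };

/* One create-session slot: remembers the last seqid it executed. */
struct clid_slot { unsigned int sl_seqid; };

static int check_slot_seqid(const struct clid_slot *slot, unsigned int seqid)
{
        if (seqid == slot->sl_seqid)
                return SLOT_REPLAY;       /* duplicate: serve cached reply */
        if (seqid == slot->sl_seqid + 1)
                return SLOT_OK;           /* the expected next request */
        return SLOT_MISORDERED;           /* anything else is a protocol error */
}

int main(void)
{
        struct clid_slot slot = { .sl_seqid = 0 };
        if (check_slot_seqid(&slot, 1) == SLOT_OK)
                slot.sl_seqid++;          /* bump only after the op succeeds */
        printf("now at seqid %u\n", slot.sl_seqid);
        return 0;
}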
@@ -1429,8 +1616,11 @@ nfsd4_destroy_session(struct svc_rqst *r,
 
 	nfs4_lock_state();
 	/* wait for callbacks */
-	nfsd4_set_callback_client(ses->se_client, NULL);
+	nfsd4_shutdown_callback(ses->se_client);
 	nfs4_unlock_state();
+
+	nfsd4_del_conns(ses);
+
 	nfsd4_put_session(ses);
 	status = nfs_ok;
 out:
@@ -1438,6 +1628,36 @@ out:
 	return status;
 }
 
+static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
+{
+	struct nfsd4_conn *c;
+
+	list_for_each_entry(c, &s->se_conns, cn_persession) {
+		if (c->cn_xprt == xpt) {
+			return c;
+		}
+	}
+	return NULL;
+}
+
+static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd4_conn *c;
+
+	spin_lock(&clp->cl_lock);
+	c = __nfsd4_find_conn(new->cn_xprt, ses);
+	if (c) {
+		spin_unlock(&clp->cl_lock);
+		free_conn(new);
+		return;
+	}
+	__nfsd4_hash_conn(new, ses);
+	spin_unlock(&clp->cl_lock);
+	nfsd4_register_conn(new);
+	return;
+}
+
 __be32
 nfsd4_sequence(struct svc_rqst *rqstp,
 	       struct nfsd4_compound_state *cstate,
@@ -1446,11 +1666,20 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfsd4_session *session;
 	struct nfsd4_slot *slot;
+	struct nfsd4_conn *conn;
 	int status;
 
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
 
+	/*
+	 * Will be either used or freed by nfsd4_sequence_check_conn
+	 * below.
+	 */
+	conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
+	if (!conn)
+		return nfserr_jukebox;
+
 	spin_lock(&client_lock);
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid);
@@ -1482,6 +1711,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (status)
 		goto out;
 
+	nfsd4_sequence_check_conn(conn, session);
+	conn = NULL;
+
 	/* Success! bump slot seqid */
 	slot->sl_inuse = true;
 	slot->sl_seqid = seq->seqid;
@@ -1496,6 +1728,7 @@ out:
 		nfsd4_get_session(cstate->session);
 		atomic_inc(&session->se_client->cl_refcount);
 	}
+	kfree(conn);
 	spin_unlock(&client_lock);
 	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
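nfsd4_sequence preallocates the connection before taking client_lock, because the allocation may sleep; nfsd4_sequence_check_conn then either adopts it or frees it, and setting conn to NULL makes the trailing kfree(conn) cover every early-exit path at once. A sketch of that shape, with a pthread mutex standing in for the spinlock and simplified stand-in types:

#include <pthread.h>
#include <stdlib.h>

struct conn { int fd; };                   /* stand-in for nfsd4_conn */

static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;
static struct conn *session_conn;          /* stand-in for the per-session list */

/* Adopt the preallocated conn, or discard it if one is already hashed. */
static void sequence_check_conn(struct conn *c)
{
        pthread_mutex_lock(&client_lock);
        if (session_conn) {
                pthread_mutex_unlock(&client_lock);
                free(c);
                return;
        }
        session_conn = c;
        pthread_mutex_unlock(&client_lock);
}

static int sequence(int fd)
{
        /* Allocation may block, so do it before taking the lock. */
        struct conn *c = malloc(sizeof(*c));
        if (!c)
                return -1;                 /* nfserr_jukebox in the patch */
        c->fd = fd;

        /* ... validate session, slot, and seqid under the lock ... */

        sequence_check_conn(c);            /* consumes c on every path */
        c = NULL;
        free(c);                           /* mirrors kfree(conn): a no-op once consumed */
        return 0;
}

int main(void) { return sequence(3); }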
@@ -1630,6 +1863,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 		gen_clid(new);
 	}
+	/*
+	 * XXX: we should probably set this at creation time, and check
+	 * for consistent minorversion use throughout:
+	 */
+	new->cl_minorversion = 0;
 	gen_callback(new, setclid, rpc_get_scope_id(sa));
 	add_to_unconfirmed(new, strhashval);
 	setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
@@ -1690,7 +1928,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			status = nfserr_clid_inuse;
 		else {
 			atomic_set(&conf->cl_cb_set, 0);
-			nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
+			nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+			nfsd4_probe_callback(conf);
 			expire_client(unconf);
 			status = nfs_ok;
 
@@ -1724,7 +1963,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
-			nfsd4_probe_callback(conf, &conf->cl_cb_conn);
+			nfsd4_probe_callback(conf);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1760,6 +1999,8 @@ alloc_init_file(struct inode *ino)
 	fp->fi_inode = igrab(ino);
 	fp->fi_id = current_fileid++;
 	fp->fi_had_conflict = false;
+	memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+	memset(fp->fi_access, 0, sizeof(fp->fi_access));
 	spin_lock(&recall_lock);
 	list_add(&fp->fi_hash, &file_hashtbl[hashval]);
 	spin_unlock(&recall_lock);
@@ -1971,57 +2212,6 @@ static inline int deny_valid(u32 x)
 }
 
 /*
- * We store the NONE, READ, WRITE, and BOTH bits separately in the
- * st_{access,deny}_bmap field of the stateid, in order to track not
- * only what share bits are currently in force, but also what
- * combinations of share bits previous opens have used.  This allows us
- * to enforce the recommendation of rfc 3530 14.2.19 that the server
- * return an error if the client attempt to downgrade to a combination
- * of share bits not explicable by closing some of its previous opens.
- *
- * XXX: This enforcement is actually incomplete, since we don't keep
- * track of access/deny bit combinations; so, e.g., we allow:
- *
- *	OPEN allow read, deny write
- *	OPEN allow both, deny none
- *	DOWNGRADE allow read, deny none
- *
- * which we should reject.
- */
-static void
-set_access(unsigned int *access, unsigned long bmap) {
-	int i;
-
-	*access = 0;
-	for (i = 1; i < 4; i++) {
-		if (test_bit(i, &bmap))
-			*access |= i;
-	}
-}
-
-static void
-set_deny(unsigned int *deny, unsigned long bmap) {
-	int i;
-
-	*deny = 0;
-	for (i = 0; i < 4; i++) {
-		if (test_bit(i, &bmap))
-			*deny |= i;
-	}
-}
-
-static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
-	unsigned int access, deny;
-
-	set_access(&access, stp->st_access_bmap);
-	set_deny(&deny, stp->st_deny_bmap);
-	if ((access & open->op_share_deny) || (deny & open->op_share_access))
-		return 0;
-	return 1;
-}
-
-/*
  * Called to check deny when READ with all zero stateid or
  * WRITE with all zero or all one stateid
  */
@@ -2052,14 +2242,12 @@ out:
 }
 
 static inline void
-nfs4_file_downgrade(struct file *filp, unsigned int share_access)
+nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
 {
-	if (share_access & NFS4_SHARE_ACCESS_WRITE) {
-		drop_file_write_access(filp);
-		spin_lock(&filp->f_lock);
-		filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
-		spin_unlock(&filp->f_lock);
-	}
+	if (share_access & NFS4_SHARE_ACCESS_WRITE)
+		nfs4_file_put_access(fp, O_WRONLY);
+	if (share_access & NFS4_SHARE_ACCESS_READ)
+		nfs4_file_put_access(fp, O_RDONLY);
 }
 
 /*
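The deleted comment explains the share-reservation rule the helpers enforced: an OPEN must be refused if its access mode intersects an existing deny, or its deny mode an existing access (note set_access is still called later in this diff, in nfsd4_open_downgrade, so the helper evidently survives in some form outside this hunk). A compact sketch of that check, with assumed bit values:

#include <stdio.h>

#define SHARE_ACCESS_READ  1
#define SHARE_ACCESS_WRITE 2

/* Existing opens, collapsed to two bitmasks as set_access/set_deny did. */
struct share_state { unsigned int access; unsigned int deny; };

/* Returns 1 if a new open is compatible with what is already in force. */
static int test_share(const struct share_state *st,
                      unsigned int new_access, unsigned int new_deny)
{
        if (st->access & new_deny)     /* someone holds what we would deny */
                return 0;
        if (st->deny & new_access)     /* someone denies what we request */
                return 0;
        return 1;
}

int main(void)
{
        struct share_state st = { SHARE_ACCESS_READ, SHARE_ACCESS_WRITE };
        printf("%d\n", test_share(&st, SHARE_ACCESS_WRITE, 0)); /* 0: denied */
        printf("%d\n", test_share(&st, SHARE_ACCESS_READ, 0));  /* 1: allowed */
        return 0;
}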
@@ -2122,22 +2310,6 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
 }
 
 /*
- * Set the delegation file_lock back pointer.
- *
- * Called from setlease() with lock_kernel() held.
- */
-static
-void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
-{
-	struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner;
-
-	dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp);
-	if (!dp)
-		return;
-	dp->dl_flock = new;
-}
-
-/*
  * Called from setlease() with lock_kernel() held
  */
 static
@@ -2167,7 +2339,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 static const struct lock_manager_operations nfsd_lease_mng_ops = {
 	.fl_break = nfsd_break_deleg_cb,
 	.fl_release_private = nfsd_release_deleg_cb,
-	.fl_copy_lock = nfsd_copy_lock_deleg_cb,
 	.fl_mylease = nfsd_same_client_deleg_cb,
 	.fl_change = nfsd_change_deleg_cb,
 };
@@ -2255,6 +2426,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 	return NULL;
 }
 
+int share_access_to_flags(u32 share_access)
+{
+	share_access &= ~NFS4_SHARE_WANT_MASK;
+
+	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+}
+
 static __be32
 nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 		struct nfs4_delegation **dp)
@@ -2265,8 +2443,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 	*dp = find_delegation_file(fp, &open->op_delegate_stateid);
 	if (*dp == NULL)
 		goto out;
-	flags = open->op_share_access == NFS4_SHARE_ACCESS_READ ?
-						RD_STATE : WR_STATE;
+	flags = share_access_to_flags(open->op_share_access);
 	status = nfs4_check_delegmode(*dp, flags);
 	if (status)
 		*dp = NULL;
@@ -2308,30 +2485,53 @@ nfs4_alloc_stateid(void)
 	return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
 }
 
+static inline int nfs4_access_to_access(u32 nfs4_access)
+{
+	int flags = 0;
+
+	if (nfs4_access & NFS4_SHARE_ACCESS_READ)
+		flags |= NFSD_MAY_READ;
+	if (nfs4_access & NFS4_SHARE_ACCESS_WRITE)
+		flags |= NFSD_MAY_WRITE;
+	return flags;
+}
+
+static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
+*fp, struct svc_fh *cur_fh, u32 nfs4_access)
+{
+	__be32 status;
+	int oflag = nfs4_access_to_omode(nfs4_access);
+	int access = nfs4_access_to_access(nfs4_access);
+
+	if (!fp->fi_fds[oflag]) {
+		status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
+			&fp->fi_fds[oflag]);
+		if (status == nfserr_dropit)
+			status = nfserr_jukebox;
+		if (status)
+			return status;
+	}
+	nfs4_file_get_access(fp, oflag);
+
+	return nfs_ok;
+}
+
 static __be32
 nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
-		struct nfs4_delegation *dp,
-		struct svc_fh *cur_fh, int flags)
+		struct nfs4_file *fp, struct svc_fh *cur_fh,
+		struct nfsd4_open *open)
 {
 	struct nfs4_stateid *stp;
+	__be32 status;
 
 	stp = nfs4_alloc_stateid();
 	if (stp == NULL)
 		return nfserr_resource;
 
-	if (dp) {
-		get_file(dp->dl_vfs_file);
-		stp->st_vfs_file = dp->dl_vfs_file;
-	} else {
-		__be32 status;
-		status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
-				&stp->st_vfs_file);
-		if (status) {
-			if (status == nfserr_dropit)
-				status = nfserr_jukebox;
-			kmem_cache_free(stateid_slab, stp);
-			return status;
-		}
+	status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access);
+	if (status) {
+		kmem_cache_free(stateid_slab, stp);
+		return status;
 	}
 	*stpp = stp;
 	return 0;
@@ -2353,35 +2553,28 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 }
 
 static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
 {
-	struct file *filp = stp->st_vfs_file;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	unsigned int share_access, new_writer;
+	u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+	bool new_access;
 	__be32 status;
 
-	set_access(&share_access, stp->st_access_bmap);
-	new_writer = (~share_access) & open->op_share_access
-			& NFS4_SHARE_ACCESS_WRITE;
-
-	if (new_writer) {
-		int err = get_write_access(inode);
-		if (err)
-			return nfserrno(err);
-		err = mnt_want_write(cur_fh->fh_export->ex_path.mnt);
-		if (err)
-			return nfserrno(err);
-		file_take_write(filp);
+	new_access = !test_bit(op_share_access, &stp->st_access_bmap);
+	if (new_access) {
+		status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
+		if (status)
+			return status;
 	}
 	status = nfsd4_truncate(rqstp, cur_fh, open);
 	if (status) {
-		if (new_writer)
-			put_write_access(inode);
+		if (new_access) {
+			int oflag = nfs4_access_to_omode(new_access);
+			nfs4_file_put_access(fp, oflag);
+		}
 		return status;
 	}
 	/* remember the open */
-	filp->f_mode |= open->op_share_access;
-	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(op_share_access, &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 
 	return nfs_ok;
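nfs4_get_vfs_file keys the new shared-descriptor cache (fi_fds) by open mode, via nfs4_access_to_omode, whose body is not shown in this excerpt. A sketch of the mapping it presumably implements; the share-bit values are assumptions written out as literals. (Note, in passing, that the error path of nfs4_upgrade_open above feeds the boolean new_access into this helper rather than the share bits, which reads oddly.)

#include <fcntl.h>
#include <stdio.h>

#define NFS4_SHARE_ACCESS_READ  1   /* assumed bit values */
#define NFS4_SHARE_ACCESS_BOTH  3   /* READ | WRITE */

/* Share-access bits -> O_RDONLY (0) / O_WRONLY (1) / O_RDWR (2),
 * which conveniently also index a three-entry descriptor cache. */
static int access_to_omode(unsigned int access)
{
        switch (access & NFS4_SHARE_ACCESS_BOTH) {
        case NFS4_SHARE_ACCESS_READ:
                return O_RDONLY;
        case NFS4_SHARE_ACCESS_BOTH:
                return O_RDWR;
        default:
                return O_WRONLY;    /* WRITE-only */
        }
}

int main(void)
{
        printf("%d %d\n", access_to_omode(1), access_to_omode(3));
        return 0;
}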
@@ -2404,7 +2597,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
 	int cb_up = atomic_read(&sop->so_client->cl_cb_set);
-	struct file_lock fl, *flp = &fl;
+	struct file_lock *fl;
 	int status, flag = 0;
 
 	flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2438,20 +2631,28 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 		flag = NFS4_OPEN_DELEGATE_NONE;
 		goto out;
 	}
-	locks_init_lock(&fl);
-	fl.fl_lmops = &nfsd_lease_mng_ops;
-	fl.fl_flags = FL_LEASE;
-	fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
-	fl.fl_end = OFFSET_MAX;
-	fl.fl_owner = (fl_owner_t)dp;
-	fl.fl_file = stp->st_vfs_file;
-	fl.fl_pid = current->tgid;
+	status = -ENOMEM;
+	fl = locks_alloc_lock();
+	if (!fl)
+		goto out;
+	locks_init_lock(fl);
+	fl->fl_lmops = &nfsd_lease_mng_ops;
+	fl->fl_flags = FL_LEASE;
+	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_owner = (fl_owner_t)dp;
+	fl->fl_file = find_readable_file(stp->st_file);
+	BUG_ON(!fl->fl_file);
+	fl->fl_pid = current->tgid;
+	dp->dl_flock = fl;
 
 	/* vfs_setlease checks to see if delegation should be handed out.
 	 * the lock_manager callbacks fl_mylease and fl_change are used
 	 */
-	if ((status = vfs_setlease(stp->st_vfs_file, fl.fl_type, &flp))) {
+	if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
+		dp->dl_flock = NULL;
+		locks_free_lock(fl);
 		unhash_delegation(dp);
 		flag = NFS4_OPEN_DELEGATE_NONE;
 		goto out;
@@ -2514,18 +2715,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 */
 	if (stp) {
 		/* Stateid was found, this is an OPEN upgrade */
-		status = nfs4_upgrade_open(rqstp, current_fh, stp, open);
+		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
 		if (status)
 			goto out;
 		update_stateid(&stp->st_stateid);
 	} else {
-		/* Stateid was not found, this is a new OPEN */
-		int flags = 0;
-		if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-			flags |= NFSD_MAY_READ;
-		if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-			flags |= NFSD_MAY_WRITE;
-		status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
+		status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
 		if (status)
 			goto out;
 		init_stateid(stp, fp, open);
@@ -2727,7 +2922,7 @@ search_close_lru(u32 st_id, int flags)
 static inline int
 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
 {
-	return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode;
+	return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
 }
 
 static int
@@ -2760,6 +2955,9 @@ __be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
 {
 	__be32 status = nfserr_openmode;
 
+	/* For lock stateid's, we test the parent open, not the lock: */
+	if (stp->st_openstp)
+		stp = stp->st_openstp;
 	if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
 		goto out;
 	if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
@@ -2858,7 +3056,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	if (STALE_STATEID(stateid))
 		goto out;
 
-	status = nfserr_bad_stateid;
+	/*
+	 * We assume that any stateid that has the current boot time,
+	 * but that we can't find, is expired:
+	 */
+	status = nfserr_expired;
 	if (is_delegation_stateid(stateid)) {
 		dp = find_delegation_stateid(ino, stateid);
 		if (!dp)
@@ -2872,11 +3074,13 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 			goto out;
 		renew_client(dp->dl_client);
 		if (filpp)
-			*filpp = dp->dl_vfs_file;
+			*filpp = find_readable_file(dp->dl_file);
+		BUG_ON(!*filpp);
 	} else { /* open or lock stateid */
 		stp = find_stateid(stateid, flags);
 		if (!stp)
 			goto out;
+		status = nfserr_bad_stateid;
 		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
@@ -2889,8 +3093,12 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		renew_client(stp->st_stateowner->so_client);
-		if (filpp)
-			*filpp = stp->st_vfs_file;
+		if (filpp) {
+			if (flags & RD_STATE)
+				*filpp = find_readable_file(stp->st_file);
+			else
+				*filpp = find_writeable_file(stp->st_file);
+		}
 	}
 	status = nfs_ok;
 out:
@@ -2947,8 +3155,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 		 * a replayed close:
 		 */
 		sop = search_close_lru(stateid->si_stateownerid, flags);
+		/* It's not stale; let's assume it's expired: */
 		if (sop == NULL)
-			return nfserr_bad_stateid;
+			return nfserr_expired;
 		*sopp = sop;
 		goto check_replay;
 	}
@@ -3126,8 +3335,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		goto out;
 	}
 	set_access(&share_access, stp->st_access_bmap);
-	nfs4_file_downgrade(stp->st_vfs_file,
-			    share_access & ~od->od_share_access);
+	nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access);
 
 	reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap);
 	reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
@@ -3214,6 +3422,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	status = nfserr_bad_stateid;
 	if (!is_delegation_stateid(stateid))
 		goto out;
+	status = nfserr_expired;
 	dp = find_delegation_stateid(inode, stateid);
 	if (!dp)
 		goto out;
@@ -3346,11 +3555,9 @@ static inline void
 nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 {
 	struct nfs4_stateowner *sop;
-	unsigned int hval;
 
 	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
 		sop = (struct nfs4_stateowner *) fl->fl_owner;
-		hval = lockownerid_hashval(sop->so_id);
 		kref_get(&sop->so_ref);
 		deny->ld_sop = sop;
 		deny->ld_clientid = sop->so_client->cl_clientid;
@@ -3446,8 +3653,6 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
-	stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? */
-	stp->st_access_bmap = open_stp->st_access_bmap;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
 
@@ -3472,7 +3677,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_stateowner *open_sop = NULL;
 	struct nfs4_stateowner *lock_sop = NULL;
 	struct nfs4_stateid *lock_stp;
-	struct file *filp;
+	struct nfs4_file *fp;
+	struct file *filp = NULL;
 	struct file_lock file_lock;
 	struct file_lock conflock;
 	__be32 status = 0;
@@ -3502,7 +3708,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 * lock stateid.
 		 */
 		struct nfs4_stateid *open_stp = NULL;
-		struct nfs4_file *fp;
 
 		status = nfserr_stale_clientid;
 		if (!nfsd4_has_session(cstate) &&
@@ -3545,9 +3750,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		lock_sop = lock->lk_replay_owner;
+		fp = lock_stp->st_file;
 	}
 	/* lock->lk_replay_owner and lock_stp have been created or found */
-	filp = lock_stp->st_vfs_file;
 
 	status = nfserr_grace;
 	if (locks_in_grace() && !lock->lk_reclaim)
@@ -3560,11 +3765,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (lock->lk_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
+			if (find_readable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
+				filp = find_readable_file(lock_stp->st_file);
+			}
 			file_lock.fl_type = F_RDLCK;
 			cmd = F_SETLK;
 		break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
+			if (find_writeable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
+				filp = find_writeable_file(lock_stp->st_file);
+			}
 			file_lock.fl_type = F_WRLCK;
 			cmd = F_SETLK;
 		break;
@@ -3572,6 +3785,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			status = nfserr_inval;
 		goto out;
 	}
+	if (!filp) {
+		status = nfserr_openmode;
+		goto out;
+	}
 	file_lock.fl_owner = (fl_owner_t)lock_sop;
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_file = filp;
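The LOCK changes above make the lock stateid borrow an open descriptor of the right mode from the shared nfs4_file instead of carrying its own st_vfs_file: read locks want a readable open, write locks a writable one, and a missing descriptor now fails with nfserr_openmode. A sketch of the selection, with plain file descriptors standing in for the kernel's struct file cache:

#include <stdio.h>

enum lock_type { READ_LT, READW_LT, WRITE_LT, WRITEW_LT };
struct nfs_file { int readable_fd; int writable_fd; };  /* -1 if not open */

/* Pick the cached descriptor a lock of this type requires, or -1. */
static int file_for_lock(const struct nfs_file *f, enum lock_type t)
{
        switch (t) {
        case READ_LT:
        case READW_LT:
                return f->readable_fd;
        case WRITE_LT:
        case WRITEW_LT:
                return f->writable_fd;
        }
        return -1;
}

int main(void)
{
        struct nfs_file f = { .readable_fd = 3, .writable_fd = -1 };
        if (file_for_lock(&f, WRITE_LT) < 0)
                puts("no writable open: would return nfserr_openmode");
        return 0;
}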
@@ -3740,7 +3957,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&locku->lu_stateowner, &stp, NULL)))
 		goto out;
 
-	filp = stp->st_vfs_file;
+	filp = find_any_file(stp->st_file);
+	if (!filp) {
+		status = nfserr_lock_range;
+		goto out;
+	}
 	BUG_ON(!filp);
 	locks_init_lock(&file_lock);
 	file_lock.fl_type = F_UNLCK;
@@ -3787,13 +4008,13 @@ out_nfserr:
  *	0: no locks held by lockowner
  */
 static int
-check_for_locks(struct file *filp, struct nfs4_stateowner *lowner)
+check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
 {
 	struct file_lock **flpp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = filp->fi_inode;
 	int status = 0;
 
-	lock_kernel();
+	lock_flocks();
 	for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
 		if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
 			status = 1;
@@ -3801,7 +4022,7 @@ check_for_locks(struct file *filp, struct nfs4_stateowner *lowner)
 		}
 	}
 out:
-	unlock_kernel();
+	unlock_flocks();
 	return status;
 }
 
@@ -3841,7 +4062,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 			continue;
 		list_for_each_entry(stp, &sop->so_stateids,
 				st_perstateowner) {
-			if (check_for_locks(stp->st_vfs_file, sop))
+			if (check_for_locks(stp->st_file, sop))
 				goto out;
 			/* Note: so_perclient unused for lockowners,
 			 * so it's OK to fool with here. */
@@ -4066,16 +4287,8 @@ out_free_laundry:
 int
 nfs4_state_start(void)
 {
-	int ret;
-
-	if (nfs4_init)
-		return 0;
 	nfsd4_load_reboot_recovery_data();
-	ret = __nfs4_state_start();
-	if (ret)
-		return ret;
-	nfs4_init = 1;
-	return 0;
+	return __nfs4_state_start();
 }
 
 static void
@@ -4110,7 +4323,6 @@ __nfs4_state_shutdown(void)
 	}
 
 	nfsd4_shutdown_recdir();
-	nfs4_init = 0;
 }
 
 void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index ac17a708023..f35a94a0402 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1756,6 +1756,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	struct nfs4_acl *acl = NULL;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
+	struct path path = {
+		.mnt	= exp->ex_path.mnt,
+		.dentry	= dentry,
+	};
 
 	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
 	BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -1776,7 +1780,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			FATTR4_WORD0_MAXNAME)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		err = vfs_statfs(dentry, &statfs);
+		err = vfs_statfs(&path, &statfs);
 		if (err)
 			goto out_nfserr;
 	}
@@ -1801,19 +1805,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			goto out_nfserr;
 		}
 	}
-	if ((buflen -= 16) < 0)
-		goto out_resource;
 
-	if (unlikely(bmval2)) {
+	if (bmval2) {
+		if ((buflen -= 16) < 0)
+			goto out_resource;
 		WRITE32(3);
 		WRITE32(bmval0);
 		WRITE32(bmval1);
 		WRITE32(bmval2);
-	} else if (likely(bmval1)) {
+	} else if (bmval1) {
+		if ((buflen -= 12) < 0)
+			goto out_resource;
 		WRITE32(2);
 		WRITE32(bmval0);
 		WRITE32(bmval1);
 	} else {
+		if ((buflen -= 8) < 0)
+			goto out_resource;
 		WRITE32(1);
 		WRITE32(bmval0);
 	}
@@ -1824,15 +1832,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 		u32 word1 = nfsd_suppattrs1(minorversion);
 		u32 word2 = nfsd_suppattrs2(minorversion);
 
-		if ((buflen -= 12) < 0)
-			goto out_resource;
 		if (!aclsupport)
 			word0 &= ~FATTR4_WORD0_ACL;
 		if (!word2) {
+			if ((buflen -= 12) < 0)
+				goto out_resource;
 			WRITE32(2);
 			WRITE32(word0);
 			WRITE32(word1);
 		} else {
+			if ((buflen -= 16) < 0)
+				goto out_resource;
 			WRITE32(3);
 			WRITE32(word0);
 			WRITE32(word1);
@@ -2630,7 +2640,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 	}
 	read->rd_vlen = v;
 
-	nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp,
+	nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
 			read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
 			&maxcount);
 
@@ -3325,6 +3335,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 	}
 	/* Renew the clientid on success and on replay */
 	release_session_client(cs->session);
+	nfsd4_put_session(cs->session);
 	}
 	return 1;
 }
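The bitmap-encoding fix above moves each buflen check next to the WRITE32 calls it covers, so the reservation matches what each arm actually emits (16, 12, or 8 bytes) instead of always charging 16. The pattern, sketched as a stand-alone encoder; the types and names are illustrative, not nfsd's XDR machinery:

#include <stdint.h>
#include <stdio.h>

struct xdr { uint8_t *p; int buflen; };

/* Reserve n bytes up front; fail before emitting anything. */
static int reserve(struct xdr *x, int n)
{
        if ((x->buflen -= n) < 0)
                return -1;            /* the caller turns this into out_resource */
        return 0;
}

static void write32(struct xdr *x, uint32_t v)
{
        *x->p++ = v >> 24; *x->p++ = v >> 16; *x->p++ = v >> 8; *x->p++ = v;
}

/* Encode a bitmap of 1..3 words, reserving exactly what each arm emits. */
static int encode_bitmap(struct xdr *x, uint32_t b0, uint32_t b1, uint32_t b2)
{
        if (b2) {
                if (reserve(x, 16)) return -1;
                write32(x, 3); write32(x, b0); write32(x, b1); write32(x, b2);
        } else if (b1) {
                if (reserve(x, 12)) return -1;
                write32(x, 2); write32(x, b0); write32(x, b1);
        } else {
                if (reserve(x, 8)) return -1;
                write32(x, 1); write32(x, b0);
        }
        return 0;
}

int main(void)
{
        uint8_t buf[64];
        struct xdr x = { buf, sizeof(buf) };
        return encode_bitmap(&x, 0x1, 0x2, 0);
}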
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 508941c23af..4514ebbee4d 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -22,6 +22,7 @@
  */
 enum {
 	NFSD_Root = 1,
+#ifdef CONFIG_NFSD_DEPRECATED
 	NFSD_Svc,
 	NFSD_Add,
 	NFSD_Del,
@@ -29,6 +30,7 @@ enum {
 	NFSD_Unexport,
 	NFSD_Getfd,
 	NFSD_Getfs,
+#endif
 	NFSD_List,
 	NFSD_Export_features,
 	NFSD_Fh,
@@ -54,6 +56,7 @@ enum {
 /*
  * write() for these nodes.
  */
+#ifdef CONFIG_NFSD_DEPRECATED
 static ssize_t write_svc(struct file *file, char *buf, size_t size);
 static ssize_t write_add(struct file *file, char *buf, size_t size);
 static ssize_t write_del(struct file *file, char *buf, size_t size);
@@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size);
 static ssize_t write_unexport(struct file *file, char *buf, size_t size);
 static ssize_t write_getfd(struct file *file, char *buf, size_t size);
 static ssize_t write_getfs(struct file *file, char *buf, size_t size);
+#endif
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+#ifdef CONFIG_NFSD_DEPRECATED
 	[NFSD_Svc] = write_svc,
 	[NFSD_Add] = write_add,
 	[NFSD_Del] = write_del,
@@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Unexport] = write_unexport,
 	[NFSD_Getfd] = write_getfd,
 	[NFSD_Getfs] = write_getfs,
+#endif
 	[NFSD_Fh] = write_filehandle,
 	[NFSD_FO_UnlockIP] = write_unlock_ip,
 	[NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -121,6 +127,14 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
 
 static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
 {
+	static int warned;
+	if (file->f_dentry->d_name.name[0] == '.' && !warned) {
+		printk(KERN_INFO
+		       "Warning: \"%s\" uses deprecated NFSD interface: %s."
+		       " This will be removed in 2.6.40\n",
+		       current->comm, file->f_dentry->d_name.name);
+		warned = 1;
+	}
 	if (! file->private_data) {
 		/* An attempt to read a transaction file without writing
 		 * causes a 0-byte write so that the file can return
@@ -137,6 +151,7 @@ static const struct file_operations transaction_ops = {
 	.write		= nfsctl_transaction_write,
 	.read		= nfsctl_transaction_read,
 	.release	= simple_transaction_release,
+	.llseek		= default_llseek,
 };
 
 static int exports_open(struct inode *inode, struct file *file)
@@ -186,6 +201,7 @@ static const struct file_operations pool_stats_operations = {
  * payload - write methods
  */
 
+#ifdef CONFIG_NFSD_DEPRECATED
 /**
  * write_svc - Start kernel's NFSD server
  *
@@ -401,7 +417,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 
 	ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
 
-	clp = auth_unix_lookup(&in6);
+	clp = auth_unix_lookup(&init_net, &in6);
 	if (!clp)
 		err = -EPERM;
 	else {
@@ -464,7 +480,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 
 	ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
 
-	clp = auth_unix_lookup(&in6);
+	clp = auth_unix_lookup(&init_net, &in6);
 	if (!clp)
 		err = -EPERM;
 	else {
@@ -481,6 +497,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
  out:
 	return err;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 
 /**
  * write_unlock_ip - Release all locks used by a client
@@ -949,15 +966,12 @@ static ssize_t __write_ports_addfd(char *buf)
 	if (err != 0)
 		return err;
 
-	err = lockd_up();
-	if (err != 0)
-		goto out;
-
 	err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
-	if (err < 0)
-		lockd_down();
+	if (err < 0) {
+		svc_destroy(nfsd_serv);
+		return err;
+	}
 
-out:
 	/* Decrease the count, but don't shut down the service */
 	nfsd_serv->sv_nrthreads--;
 	return err;
@@ -978,9 +992,6 @@ static ssize_t __write_ports_delfd(char *buf)
 	if (nfsd_serv != NULL)
 		len = svc_sock_names(nfsd_serv, buf,
 				     SIMPLE_TRANSACTION_LIMIT, toclose);
-	if (len >= 0)
-		lockd_down();
-
 	kfree(toclose);
 	return len;
 }
@@ -1005,15 +1016,18 @@ static ssize_t __write_ports_addxprt(char *buf)
 	if (err != 0)
 		return err;
 
-	err = svc_create_xprt(nfsd_serv, transport,
+	err = svc_create_xprt(nfsd_serv, transport, &init_net,
 				PF_INET, port, SVC_SOCK_ANONYMOUS);
 	if (err < 0)
 		goto out_err;
 
-	err = svc_create_xprt(nfsd_serv, transport,
+	err = svc_create_xprt(nfsd_serv, transport, &init_net,
 				PF_INET6, port, SVC_SOCK_ANONYMOUS);
 	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_close;
+
+	/* Decrease the count, but don't shut down the service */
+	nfsd_serv->sv_nrthreads--;
 	return 0;
out_close:
 	xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
@@ -1022,8 +1036,7 @@ out_close:
 		svc_xprt_put(xprt);
 	}
 out_err:
-	/* Decrease the count, but don't shut down the service */
-	nfsd_serv->sv_nrthreads--;
+	svc_destroy(nfsd_serv);
 	return err;
 }
 
@@ -1194,7 +1207,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 		bsize = NFSSVC_MAXBLKSIZE;
 	bsize &= ~(1024-1);
 	mutex_lock(&nfsd_mutex);
-	if (nfsd_serv && nfsd_serv->sv_nrthreads) {
+	if (nfsd_serv) {
 		mutex_unlock(&nfsd_mutex);
 		return -EBUSY;
 	}
1200 } 1213 }
@@ -1310,6 +1323,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
1310 return -EINVAL; 1323 return -EINVAL;
1311 1324
1312 status = nfs4_reset_recoverydir(recdir); 1325 status = nfs4_reset_recoverydir(recdir);
1326 if (status)
1327 return status;
1313 } 1328 }
1314 1329
1315 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", 1330 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
@@ -1357,6 +1372,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
1357static int nfsd_fill_super(struct super_block * sb, void * data, int silent) 1372static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1358{ 1373{
1359 static struct tree_descr nfsd_files[] = { 1374 static struct tree_descr nfsd_files[] = {
1375#ifdef CONFIG_NFSD_DEPRECATED
1360 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, 1376 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
1361 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, 1377 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
1362 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, 1378 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
@@ -1364,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1364 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, 1380 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
1365 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, 1381 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
1366 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, 1382 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
1383#endif
1367 [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, 1384 [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
1368 [NFSD_Export_features] = {"export_features", 1385 [NFSD_Export_features] = {"export_features",
1369 &export_features_operations, S_IRUGO}, 1386 &export_features_operations, S_IRUGO},
@@ -1388,16 +1405,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1388 return simple_fill_super(sb, 0x6e667364, nfsd_files); 1405 return simple_fill_super(sb, 0x6e667364, nfsd_files);
1389} 1406}
1390 1407
1391static int nfsd_get_sb(struct file_system_type *fs_type, 1408static struct dentry *nfsd_mount(struct file_system_type *fs_type,
1392 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1409 int flags, const char *dev_name, void *data)
1393{ 1410{
1394 return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt); 1411 return mount_single(fs_type, flags, data, nfsd_fill_super);
1395} 1412}
1396 1413
1397static struct file_system_type nfsd_fs_type = { 1414static struct file_system_type nfsd_fs_type = {
1398 .owner = THIS_MODULE, 1415 .owner = THIS_MODULE,
1399 .name = "nfsd", 1416 .name = "nfsd",
1400 .get_sb = nfsd_get_sb, 1417 .mount = nfsd_mount,
1401 .kill_sb = kill_litter_super, 1418 .kill_sb = kill_litter_super,
1402}; 1419};
1403 1420
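The last hunk above is part of the tree-wide switch from ->get_sb to ->mount: instead of filling in a caller-supplied vfsmount, a filesystem now returns its root dentry and lets the VFS build the mount itself. A minimal before/after sketch of the same conversion for a hypothetical examplefs (illustrative only; examplefs_fill_super is an assumed helper):

/* Before: fill in the vfsmount handed down by the VFS. */
static int examplefs_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_single(fs_type, flags, data, examplefs_fill_super, mnt);
}

/* After: return the root dentry; the VFS constructs the mount. */
static struct dentry *examplefs_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return mount_single(fs_type, flags, data, examplefs_fill_super);
}
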
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 72377761270..6b641cf2c19 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -153,6 +153,7 @@ void nfsd_lockd_shutdown(void);
 #define nfserr_bad_seqid	cpu_to_be32(NFSERR_BAD_SEQID)
 #define nfserr_symlink		cpu_to_be32(NFSERR_SYMLINK)
 #define nfserr_not_same		cpu_to_be32(NFSERR_NOT_SAME)
+#define nfserr_lock_range	cpu_to_be32(NFSERR_LOCK_RANGE)
 #define nfserr_restorefh	cpu_to_be32(NFSERR_RESTOREFH)
 #define nfserr_attrnotsupp	cpu_to_be32(NFSERR_ATTRNOTSUPP)
 #define nfserr_bad_xdr		cpu_to_be32(NFSERR_BAD_XDR)
@@ -248,7 +249,7 @@ extern time_t nfsd4_grace;
 #define COMPOUND_SLACK_SPACE		140	/* OP_GETFH */
 #define COMPOUND_ERR_SLACK_SPACE	12	/* OP_SETATTR */
 
-#define NFSD_LAUNDROMAT_MINTIMEOUT	10	/* seconds */
+#define NFSD_LAUNDROMAT_MINTIMEOUT	1	/* seconds */
 
 /*
  * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a420..c16f8d8331b 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
 static inline void
 fh_unlock(struct svc_fh *fhp)
 {
-	BUG_ON(!fhp->fh_dentry);
-
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
 		mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a047ad6111e..08e17264784 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -144,7 +144,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 	svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
 
 	resp->count = argp->count;
-	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
+	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
 				  argp->offset,
 				  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
@@ -290,7 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 	 * gospel of sun micro
 	 */
 	if (type != S_IFREG) {
-		int is_borc = 0;
 		if (type != S_IFBLK && type != S_IFCHR) {
 			rdev = 0;
 		} else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) {
@@ -298,7 +297,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 			type = S_IFIFO;
 		} else {
 			/* Okay, char or block special */
-			is_borc = 1;
 			if (!rdev)
 				rdev = wanted;
 		}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 06b2a26edfe..2bae1d86f5f 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -16,6 +16,7 @@
 #include <linux/lockd/bind.h>
 #include <linux/nfsacl.h>
 #include <linux/seq_file.h>
+#include <net/net_namespace.h>
 #include "nfsd.h"
 #include "cache.h"
 #include "vfs.h"
@@ -180,15 +181,80 @@ int nfsd_nrthreads(void)
 	return rv;
 }
 
+static int nfsd_init_socks(int port)
+{
+	int error;
+	if (!list_empty(&nfsd_serv->sv_permsocks))
+		return 0;
+
+	error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	return 0;
+}
+
+static bool nfsd_up = false;
+
+static int nfsd_startup(unsigned short port, int nrservs)
+{
+	int ret;
+
+	if (nfsd_up)
+		return 0;
+	/*
+	 * Readahead param cache - will no-op if it already exists.
+	 * (Note therefore results will be suboptimal if number of
+	 * threads is modified after nfsd start.)
+	 */
+	ret = nfsd_racache_init(2*nrservs);
+	if (ret)
+		return ret;
+	ret = nfsd_init_socks(port);
+	if (ret)
+		goto out_racache;
+	ret = lockd_up();
+	if (ret)
+		goto out_racache;
+	ret = nfs4_state_start();
+	if (ret)
+		goto out_lockd;
+	nfsd_up = true;
+	return 0;
+out_lockd:
+	lockd_down();
+out_racache:
+	nfsd_racache_shutdown();
+	return ret;
+}
+
+static void nfsd_shutdown(void)
+{
+	/*
+	 * write_ports can create the server without actually starting
+	 * any threads--if we get shut down before any threads are
+	 * started, then nfsd_last_thread will be run before any of this
+	 * other initialization has been done.
+	 */
+	if (!nfsd_up)
+		return;
+	nfs4_state_shutdown();
+	lockd_down();
+	nfsd_racache_shutdown();
+	nfsd_up = false;
+}
+
 static void nfsd_last_thread(struct svc_serv *serv)
 {
 	/* When last nfsd thread exits we need to do some clean-up */
-	struct svc_xprt *xprt;
-	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
-		lockd_down();
 	nfsd_serv = NULL;
-	nfsd_racache_shutdown();
-	nfs4_state_shutdown();
+	nfsd_shutdown();
 
 	printk(KERN_WARNING "nfsd: last server has exited, flushing export "
 		"cache\n");
@@ -263,45 +329,18 @@ int nfsd_create_serv(void)
 		    nfsd_max_blksize >= 8*1024*2)
 			nfsd_max_blksize /= 2;
 	}
+	nfsd_reset_versions();
 
 	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
 				      nfsd_last_thread, nfsd, THIS_MODULE);
 	if (nfsd_serv == NULL)
-		err = -ENOMEM;
-	else
-		set_max_drc();
+		return -ENOMEM;
 
+	set_max_drc();
 	do_gettimeofday(&nfssvc_boot);		/* record boot time */
 	return err;
 }
 
-static int nfsd_init_socks(int port)
-{
-	int error;
-	if (!list_empty(&nfsd_serv->sv_permsocks))
-		return 0;
-
-	error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
-					SVC_SOCK_DEFAULTS);
-	if (error < 0)
-		return error;
-
-	error = lockd_up();
-	if (error < 0)
-		return error;
-
-	error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
-					SVC_SOCK_DEFAULTS);
-	if (error < 0)
-		return error;
-
-	error = lockd_up();
-	if (error < 0)
-		return error;
-
-	return 0;
-}
-
 int nfsd_nrpools(void)
 {
 	if (nfsd_serv == NULL)
@@ -376,10 +415,16 @@ int nfsd_set_nrthreads(int n, int *nthreads)
 	return err;
 }
 
+/*
+ * Adjust the number of threads and return the new number of threads.
+ * This is also the function that starts the server if necessary, if
+ * this is the first time nrservs is nonzero.
+ */
 int
 nfsd_svc(unsigned short port, int nrservs)
 {
 	int	error;
+	bool	nfsd_up_before;
 
 	mutex_lock(&nfsd_mutex);
 	dprintk("nfsd: creating service\n");
@@ -391,34 +436,29 @@ nfsd_svc(unsigned short port, int nrservs)
 	if (nrservs == 0 && nfsd_serv == NULL)
 		goto out;
 
-	/* Readahead param cache - will no-op if it already exists */
-	error = nfsd_racache_init(2*nrservs);
-	if (error<0)
-		goto out;
-	error = nfs4_state_start();
+	error = nfsd_create_serv();
 	if (error)
 		goto out;
 
-	nfsd_reset_versions();
-
-	error = nfsd_create_serv();
+	nfsd_up_before = nfsd_up;
 
+	error = nfsd_startup(port, nrservs);
 	if (error)
-		goto out;
-	error = nfsd_init_socks(port);
-	if (error)
-		goto failure;
-
+		goto out_destroy;
 	error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
-	if (error == 0)
-		/* We are holding a reference to nfsd_serv which
-		 * we don't want to count in the return value,
-		 * so subtract 1
-		 */
-		error = nfsd_serv->sv_nrthreads - 1;
- failure:
+	if (error)
+		goto out_shutdown;
+	/* We are holding a reference to nfsd_serv which
+	 * we don't want to count in the return value,
+	 * so subtract 1
+	 */
+	error = nfsd_serv->sv_nrthreads - 1;
+out_shutdown:
+	if (error < 0 && !nfsd_up_before)
+		nfsd_shutdown();
+out_destroy:
 	svc_destroy(nfsd_serv);		/* Release server */
- out:
+out:
 	mutex_unlock(&nfsd_mutex);
 	return error;
 }
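
The nfssvc.c hunks above gather every piece of one-time server state (readahead cache, sockets, lockd, NFSv4 state) behind a single nfsd_up flag, with nfsd_startup()/nfsd_shutdown() as the only entry points; nfsd_svc() records whether the server was already up so a failed thread-count change tears down only what it brought up. The goto-unwind idiom nfsd_startup() uses generalizes; a compact sketch of the same pattern with hypothetical init_*/teardown_* names:

static int bring_up(void)
{
	int ret;

	ret = init_a();
	if (ret)
		return ret;		/* nothing to unwind yet */
	ret = init_b();
	if (ret)
		goto out_a;
	ret = init_c();
	if (ret)
		goto out_b;
	return 0;
out_b:
	teardown_b();
out_a:
	teardown_a();			/* unwind in strict reverse order of setup */
	return ret;
}
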
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 006c84230c7..39adc27b068 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
 #ifndef _NFSD4_STATE_H
 #define _NFSD4_STATE_H
 
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/nfsd/nfsfh.h>
 #include "nfsfh.h"
 
@@ -64,19 +65,12 @@ typedef struct {
 	(s)->si_fileid, \
 	(s)->si_generation
 
-struct nfsd4_cb_sequence {
-	/* args/res */
-	u32			cbs_minorversion;
-	struct nfs4_client	*cbs_clp;
-};
-
-struct nfs4_rpc_args {
-	void				*args_op;
-	struct nfsd4_cb_sequence	args_seq;
-};
-
 struct nfsd4_callback {
-	struct nfs4_rpc_args cb_args;
+	void *cb_op;
+	struct nfs4_client *cb_clp;
+	u32 cb_minorversion;
+	struct rpc_message cb_msg;
+	const struct rpc_call_ops *cb_ops;
 	struct work_struct cb_work;
 };
 
@@ -88,11 +82,9 @@ struct nfs4_delegation {
 	struct nfs4_client	*dl_client;
 	struct nfs4_file	*dl_file;
 	struct file_lock	*dl_flock;
-	struct file		*dl_vfs_file;
 	u32			dl_type;
 	time_t			dl_time;
 /* For recall: */
-	u32			dl_ident;
 	stateid_t		dl_stateid;
 	struct knfsd_fh		dl_fh;
 	int			dl_retries;
@@ -104,8 +96,8 @@ struct nfs4_cb_conn {
 	/* SETCLIENTID info */
 	struct sockaddr_storage	cb_addr;
 	size_t			cb_addrlen;
-	u32			cb_prog;
-	u32			cb_minorversion;
+	u32			cb_prog; /* used only in 4.0 case;
+				    per-session otherwise */
 	u32			cb_ident;	/* minorversion 0 only */
 	struct svc_xprt		*cb_xprt;	/* minorversion 1 only */
 };
@@ -161,6 +153,15 @@ struct nfsd4_clid_slot {
 	struct nfsd4_create_session	sl_cr_ses;
 };
 
+struct nfsd4_conn {
+	struct list_head cn_persession;
+	struct svc_xprt *cn_xprt;
+	struct svc_xpt_user cn_xpt_user;
+	struct nfsd4_session *cn_session;
+/* CDFC4_FORE, CDFC4_BACK: */
+	unsigned char cn_flags;
+};
+
 struct nfsd4_session {
 	struct kref		se_ref;
 	struct list_head	se_hash;	/* hash by sessionid */
@@ -170,6 +171,9 @@ struct nfsd4_session {
 	struct nfs4_sessionid	se_sessionid;
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_channel_attrs se_bchannel;
+	struct list_head	se_conns;
+	u32			se_cb_prog;
+	u32			se_cb_seq_nr;
 	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
 };
 
@@ -222,24 +226,32 @@ struct nfs4_client {
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
 	u32			cl_firststate;	/* recovery dir creation */
+	u32			cl_minorversion;
 
 	/* for v4.0 and v4.1 callbacks: */
 	struct nfs4_cb_conn	cl_cb_conn;
+#define NFSD4_CLIENT_CB_UPDATE	1
+#define NFSD4_CLIENT_KILL	2
+	unsigned long		cl_cb_flags;
 	struct rpc_clnt		*cl_cb_client;
+	u32			cl_cb_ident;
 	atomic_t		cl_cb_set;
+	struct nfsd4_callback	cl_cb_null;
+	struct nfsd4_session	*cl_cb_session;
+
+	/* for all client information that callback code might need: */
+	spinlock_t		cl_lock;
 
 	/* for nfs41 */
 	struct list_head	cl_sessions;
 	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
-	struct nfs4_sessionid	cl_sessionid;
 	/* number of rpc's in progress over an associated session: */
 	atomic_t		cl_refcount;
 
 	/* for nfs41 callbacks */
 	/* We currently support a single back channel with a single slot */
 	unsigned long		cl_cb_slot_busy;
-	u32			cl_cb_seq_nr;
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 					/* wait here for slots */
 };
@@ -342,12 +354,50 @@ struct nfs4_file {
 	struct list_head	fi_hash;	/* hash by "struct inode *" */
 	struct list_head	fi_stateids;
 	struct list_head	fi_delegations;
+	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
+	struct file *		fi_fds[3];
+	/* One each for O_RDONLY, O_WRONLY: */
+	atomic_t		fi_access[2];
+	/*
+	 * Each open stateid contributes 1 to either fi_readers or
+	 * fi_writers, or both, depending on the open mode. A
+	 * delegation also takes an fi_readers reference. Lock
+	 * stateid's take none.
+	 */
+	atomic_t		fi_readers;
+	atomic_t		fi_writers;
 	struct inode		*fi_inode;
 	u32			fi_id;		/* used with stateowner->so_id
						 * for stateid_hashtbl hash */
 	bool			fi_had_conflict;
 };
 
+/* XXX: for first cut may fall back on returning file that doesn't work
+ * at all? */
+static inline struct file *find_writeable_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_WRONLY])
+		return f->fi_fds[O_WRONLY];
+	return f->fi_fds[O_RDWR];
+}
+
+static inline struct file *find_readable_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_RDONLY])
+		return f->fi_fds[O_RDONLY];
+	return f->fi_fds[O_RDWR];
+}
+
+static inline struct file *find_any_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_RDWR])
+		return f->fi_fds[O_RDWR];
+	else if (f->fi_fds[O_WRONLY])
+		return f->fi_fds[O_WRONLY];
+	else
+		return f->fi_fds[O_RDONLY];
+}
+
 /*
 * nfs4_stateid can either be an open stateid or (eventually) a lock stateid
 *
@@ -373,7 +423,6 @@ struct nfs4_stateid {
 	struct nfs4_stateowner * st_stateowner;
 	struct nfs4_file * st_file;
 	stateid_t st_stateid;
-	struct file * st_vfs_file;
 	unsigned long st_access_bmap;
 	unsigned long st_deny_bmap;
 	struct nfs4_stateid * st_openstp;
@@ -404,12 +453,13 @@ extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
-extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
-extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
+extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
 extern void nfsd4_init_recdir(char *recdir_name);
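
The state.h hunks above replace the per-stateid st_vfs_file with a per-file cache: nfs4_file now holds at most one struct file per open mode in fi_fds[], and the find_*_file() helpers prefer an exact-mode file before falling back to the O_RDWR one. A hedged sketch of how a read path might consume the helpers (the wrapper below is hypothetical, not from the patch):

/* Hypothetical: serve a READ against v4 open state without reopening. */
static __be32 read_via_cached_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
				   struct nfs4_stateid *stp, loff_t offset,
				   struct kvec *vec, int vlen,
				   unsigned long *count)
{
	struct file *filp = find_readable_file(stp->st_file);

	if (!filp)	/* no read-capable open is cached for this file */
		return nfserr_openmode;
	return nfsd_read_file(rqstp, fhp, filp, offset, vec, vlen, count);
}
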
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3c111120b61..184938fcff0 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp)
 {
 	struct inode *inode = fhp->fh_dentry->d_inode;
 	const struct export_operations *export_ops = inode->i_sb->s_export_op;
-	int error = 0;
 
 	if (!EX_ISSYNC(fhp->fh_export))
 		return 0;
 
-	if (export_ops->commit_metadata) {
-		error = export_ops->commit_metadata(inode);
-	} else {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL,
-			.nr_to_write = 0, /* metadata only */
-		};
-
-		error = sync_inode(inode, &wbc);
-	}
-
-	return error;
+	if (export_ops->commit_metadata)
+		return export_ops->commit_metadata(inode);
+	return sync_inode_metadata(inode, 1);
 }
 
 /*
@@ -604,7 +594,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
 	return error;
 }
 
-#endif /* defined(CONFIG_NFS_V4) */
+#endif /* defined(CONFIG_NFSD_V4) */
 
 #ifdef CONFIG_NFSD_V3
 /*
@@ -903,7 +893,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	      loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
 	struct inode *inode;
-	struct raparms *ra;
 	mm_segment_t oldfs;
 	__be32 err;
 	int host_err;
@@ -914,12 +903,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
 		goto out;
 
-	/* Get readahead parameters */
-	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-
-	if (ra && ra->p_set)
-		file->f_ra = ra->p_ra;
-
 	if (file->f_op->splice_read && rqstp->rq_splice_ok) {
 		struct splice_desc sd = {
 			.len		= 0,
@@ -937,21 +920,11 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		set_fs(oldfs);
 	}
 
-	/* Write back readahead params */
-	if (ra) {
-		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-		spin_lock(&rab->pb_lock);
-		ra->p_ra = file->f_ra;
-		ra->p_set = 1;
-		ra->p_count--;
-		spin_unlock(&rab->pb_lock);
-	}
-
 	if (host_err >= 0) {
 		nfsdstats.io_read += host_err;
 		*count = host_err;
 		err = 0;
-		fsnotify_access(file->f_path.dentry);
+		fsnotify_access(file);
 	} else
 		err = nfserrno(host_err);
 out:
@@ -1062,7 +1035,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		goto out_nfserr;
 	*cnt = host_err;
 	nfsdstats.io_write += host_err;
-	fsnotify_modify(file->f_path.dentry);
+	fsnotify_modify(file);
 
 	/* clear setuid/setgid flag after write */
 	if (inode->i_mode & (S_ISUID | S_ISGID))
@@ -1086,8 +1059,45 @@ out:
 * on entry. On return, *count contains the number of bytes actually read.
 * N.B. After this call fhp needs an fh_put
 */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+	loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+	struct file *file;
+	struct inode *inode;
+	struct raparms *ra;
+	__be32 err;
+
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	if (err)
+		return err;
+
+	inode = file->f_path.dentry->d_inode;
+
+	/* Get readahead parameters */
+	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
+
+	if (ra && ra->p_set)
+		file->f_ra = ra->p_ra;
+
+	err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
+
+	/* Write back readahead params */
+	if (ra) {
+		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+		spin_lock(&rab->pb_lock);
+		ra->p_ra = file->f_ra;
+		ra->p_set = 1;
+		ra->p_count--;
+		spin_unlock(&rab->pb_lock);
+	}
+
+	nfsd_close(file);
+	return err;
+}
+
+/* As above, but use the provided file descriptor. */
 __be32
-nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
+nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		loff_t offset, struct kvec *vec, int vlen,
 		unsigned long *count)
 {
@@ -1099,13 +1109,8 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		if (err)
 			goto out;
 		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-	} else {
-		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
-		if (err)
-			goto out;
-		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-		nfsd_close(file);
-	}
+	} else /* Note file may still be NULL in NFSv4 special stateid case: */
+		err = nfsd_read(rqstp, fhp, offset, vec, vlen, count);
 out:
 	return err;
 }
@@ -1631,7 +1636,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 				char *name, int len, struct svc_fh *tfhp)
 {
 	struct dentry	*ddir, *dnew, *dold;
-	struct inode	*dirp, *dest;
+	struct inode	*dirp;
 	__be32		err;
 	int		host_err;
 
@@ -1659,7 +1664,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 		goto out_nfserr;
 
 	dold = tfhp->fh_dentry;
-	dest = dold->d_inode;
 
 	host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
 	if (host_err) {
@@ -2019,9 +2023,17 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-	__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
-	if (!err && vfs_statfs(fhp->fh_dentry,stat))
-		err = nfserr_io;
+	__be32 err;
+
+	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
+	if (!err) {
+		struct path path = {
+			.mnt	= fhp->fh_export->ex_path.mnt,
+			.dentry	= fhp->fh_dentry,
+		};
+		if (vfs_statfs(&path, stat))
+			err = nfserr_io;
+	}
 	return err;
 }
 
@@ -2038,7 +2050,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 					struct dentry *dentry, int acc)
 {
 	struct inode	*inode = dentry->d_inode;
-	struct path	path;
 	int		err;
 
 	if (acc == NFSD_MAY_NOP)
@@ -2111,15 +2122,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
 	    acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
 		err = inode_permission(inode, MAY_EXEC);
-	if (err)
-		goto nfsd_out;
 
-	/* Do integrity (permission) checking now, but defer incrementing
-	 * IMA counts to the actual file open.
-	 */
-	path.mnt = exp->ex_path.mnt;
-	path.dentry = dentry;
-nfsd_out:
 	return err? nfserrno(err) : 0;
 }
 
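The vfs.c hunks above split the read path in two: nfsd_read() now always opens (and closes) the file itself and owns the readahead-parameter save/restore, while nfsd_read_file() serves callers that already hold a struct file, such as an NFSv4 open stateid. A sketch of the two calling conventions after the split (variable names illustrative):

/* v2/v3 style: no file in hand; open/read/close is internal. */
err = nfsd_read(rqstp, fhp, offset, rqstp->rq_vec, vlen, &count);

/* v4 style: reuse the file cached on the open stateid. */
err = nfsd_read_file(rqstp, fhp, filp, offset, rqstp->rq_vec, vlen, &count);
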
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 217a62c2a35..9a370a5e36b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -64,7 +64,9 @@ __be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_open(struct svc_rqst *, struct svc_fh *, int,
 				int, struct file **);
 void		nfsd_close(struct file *);
-__be32		nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *,
+__be32		nfsd_read(struct svc_rqst *, struct svc_fh *,
+				loff_t, struct kvec *, int, unsigned long *);
+__be32		nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *,
 				loff_t, struct kvec *, int, unsigned long *);
 __be32 		nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
 				loff_t, struct kvec *,int, unsigned long *, int *);
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index df3e62c1ddc..85c98737a14 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
 nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
 	btnode.o bmap.o btree.o direct.o dat.o recovery.o \
 	the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
-	ifile.o alloc.o gcinode.o ioctl.o gcdat.o
+	ifile.o alloc.o gcinode.o ioctl.o
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index effdbdbe6c1..8b782b062ba 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -26,6 +26,8 @@
 #include "nilfs.h"
 #include "bmap.h"
 #include "sb.h"
+#include "btree.h"
+#include "direct.h"
 #include "btnode.h"
 #include "mdt.h"
 #include "dat.h"
@@ -531,18 +533,20 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
 	nilfs_btree_init_gc(bmap);
 }
 
-void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+void nilfs_bmap_save(const struct nilfs_bmap *bmap,
+		     struct nilfs_bmap_store *store)
 {
-	memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
-	init_rwsem(&gcbmap->b_sem);
-	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
-	gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
+	memcpy(store->data, bmap->b_u.u_data, sizeof(store->data));
+	store->last_allocated_key = bmap->b_last_allocated_key;
+	store->last_allocated_ptr = bmap->b_last_allocated_ptr;
+	store->state = bmap->b_state;
 }
 
-void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+void nilfs_bmap_restore(struct nilfs_bmap *bmap,
+			const struct nilfs_bmap_store *store)
 {
-	memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
-	init_rwsem(&bmap->b_sem);
-	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
-	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+	memcpy(bmap->b_u.u_data, store->data, sizeof(store->data));
+	bmap->b_last_allocated_key = store->last_allocated_key;
+	bmap->b_last_allocated_ptr = store->last_allocated_ptr;
+	bmap->b_state = store->state;
 }
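
The bmap.c hunks above drop the gcdat trick of memcpy()ing one live bmap over another (rwsem and lockdep state included) in favor of an explicit snapshot: nilfs_bmap_save() copies only the on-disk data and allocation hints into a plain nilfs_bmap_store, and nilfs_bmap_restore() puts them back. A hedged sketch of the intended round trip (the GC step in the middle is hypothetical):

struct nilfs_bmap_store store;
int err;

nilfs_bmap_save(bmap, &store);		/* snapshot data + allocation hints */
err = rewrite_dat_for_gc(bmap);		/* hypothetical GC rewrite step */
if (err)
	nilfs_bmap_restore(bmap, &store);	/* roll back to the snapshot */
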
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 9980d7dbab9..bde1c0aa2e1 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -32,11 +32,6 @@
 
 #define NILFS_BMAP_INVALID_PTR	0
 
-#define nilfs_bmap_dkey_to_key(dkey)	le64_to_cpu(dkey)
-#define nilfs_bmap_key_to_dkey(key)	cpu_to_le64(key)
-#define nilfs_bmap_dptr_to_ptr(dptr)	le64_to_cpu(dptr)
-#define nilfs_bmap_ptr_to_dptr(ptr)	cpu_to_le64(ptr)
-
 #define nilfs_bmap_keydiff_abs(diff)	((diff) < 0 ? -(diff) : (diff))
 
 
@@ -71,7 +66,7 @@ struct nilfs_bmap_operations {
 	int (*bop_delete)(struct nilfs_bmap *, __u64);
 	void (*bop_clear)(struct nilfs_bmap *);
 
-	int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
+	int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *);
 	void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
 					 struct list_head *);
 
@@ -110,6 +105,7 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
 * @b_last_allocated_ptr: last allocated ptr for data block
 * @b_ptr_type: pointer type
 * @b_state: state
+ * @b_nchildren_per_block: maximum number of child nodes for non-root nodes
 */
 struct nilfs_bmap {
 	union {
@@ -123,6 +119,7 @@ struct nilfs_bmap {
 	__u64 b_last_allocated_ptr;
 	int b_ptr_type;
 	int b_state;
+	__u16 b_nchildren_per_block;
 };
 
 /* pointer type */
@@ -138,6 +135,12 @@ struct nilfs_bmap {
 /* state */
 #define NILFS_BMAP_DIRTY	0x00000001
 
+struct nilfs_bmap_store {
+	__le64 data[NILFS_BMAP_SIZE / sizeof(__le64)];
+	__u64 last_allocated_key;
+	__u64 last_allocated_ptr;
+	int state;
+};
 
 int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
@@ -156,9 +159,9 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
 int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
 
 void nilfs_bmap_init_gc(struct nilfs_bmap *);
-void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
-void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 
+void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
+void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
 
 static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
 				    __u64 *ptr)
@@ -224,6 +227,13 @@ static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
 	nilfs_dat_abort_end(dat, &req->bpr_req);
 }
 
+static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key,
+					   __u64 ptr)
+{
+	bmap->b_last_allocated_key = key;
+	bmap->b_last_allocated_ptr = ptr;
+}
+
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 			      const struct buffer_head *);
 
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
deleted file mode 100644
index d41509bff47..00000000000
--- a/fs/nilfs2/bmap_union.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * bmap_union.h - NILFS block mapping.
- *
- * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
- */
-
-#ifndef _NILFS_BMAP_UNION_H
-#define _NILFS_BMAP_UNION_H
-
-#include "bmap.h"
-#include "direct.h"
-#include "btree.h"
-
-/**
- * nilfs_bmap_union -
- * @bi_bmap: bmap structure
- * @bi_btree: direct map structure
- * @bi_direct: B-tree structure
- */
-union nilfs_bmap_union {
-	struct nilfs_bmap bi_bmap;
-	struct nilfs_direct bi_direct;
-	struct nilfs_btree bi_btree;
-};
-
-#endif	/* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 447ce47a330..5115814cb74 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -37,15 +37,7 @@
 
 void nilfs_btnode_cache_init_once(struct address_space *btnc)
 {
-	memset(btnc, 0, sizeof(*btnc));
-	INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
-	spin_lock_init(&btnc->tree_lock);
-	INIT_LIST_HEAD(&btnc->private_list);
-	spin_lock_init(&btnc->private_lock);
-
-	spin_lock_init(&btnc->i_mmap_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
-	INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
+	nilfs_mapping_init_once(btnc);
 }
 
 static const struct address_space_operations def_btnode_aops = {
@@ -55,12 +47,7 @@ static const struct address_space_operations def_btnode_aops = {
 void nilfs_btnode_cache_init(struct address_space *btnc,
 			     struct backing_dev_info *bdi)
 {
-	btnc->host = NULL;  /* can safely set to host inode ? */
-	btnc->flags = 0;
-	mapping_set_gfp_mask(btnc, GFP_NOFS);
-	btnc->assoc_mapping = NULL;
-	btnc->backing_dev_info = bdi;
-	btnc->a_ops = &def_btnode_aops;
+	nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
 }
 
 void nilfs_btnode_cache_clear(struct address_space *btnc)
@@ -96,10 +83,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
 }
 
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
-			      sector_t pblocknr, struct buffer_head **pbh)
+			      sector_t pblocknr, int mode,
+			      struct buffer_head **pbh, sector_t *submit_ptr)
 {
 	struct buffer_head *bh;
 	struct inode *inode = NILFS_BTNC_I(btnc);
+	struct page *page;
 	int err;
 
 	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
@@ -107,6 +96,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 		return -ENOMEM;
 
 	err = -EEXIST; /* internal code */
+	page = bh->b_page;
 
 	if (buffer_uptodate(bh) || buffer_dirty(bh))
 		goto found;
@@ -125,7 +115,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 			}
 		}
 	}
-	lock_buffer(bh);
+
+	if (mode == READA) {
+		if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
+			err = -EBUSY; /* internal code */
+			brelse(bh);
+			goto out_locked;
+		}
+	} else { /* mode == READ */
+		lock_buffer(bh);
+	}
 	if (buffer_uptodate(bh)) {
 		unlock_buffer(bh);
 		err = -EEXIST; /* internal code */
@@ -136,15 +135,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 	bh->b_blocknr = pblocknr; /* set block address for read */
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(READ, bh);
+	submit_bh(mode, bh);
 	bh->b_blocknr = blocknr; /* set back to the given block address */
+	*submit_ptr = pblocknr;
 	err = 0;
 found:
 	*pbh = bh;
 
 out_locked:
-	unlock_page(bh->b_page);
-	page_cache_release(bh->b_page);
+	unlock_page(page);
+	page_cache_release(page);
 	return err;
 }
 
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 07da83f0771..79037494f1e 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -42,8 +42,8 @@ void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
 					      __u64 blocknr);
-int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
-			      struct buffer_head **);
+int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
+			      struct buffer_head **, sector_t *);
 void nilfs_btnode_delete(struct buffer_head *);
 int nilfs_btnode_prepare_change_key(struct address_space *,
 				    struct nilfs_btnode_chkey_ctxt *);
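
With the btnode.c/btnode.h hunks above, nilfs_btnode_submit_block() takes a request mode and a submit_ptr cursor: in READA mode it only issues the I/O when the physical block is contiguous with the previous submission (*submit_ptr + 1) and the buffer can be trylocked, returning -EBUSY otherwise. A hedged sketch of a caller driving readahead with the new interface (the block range is hypothetical):

sector_t submit_ptr = 0;
struct buffer_head *bh;
__u64 blocknr;
int err;

for (blocknr = start; blocknr < start + nblocks; blocknr++) {
	err = nilfs_btnode_submit_block(btnc, blocknr, 0, READA,
					&bh, &submit_ptr);
	if (!err || err == -EEXIST)
		brelse(bh);	/* submitted (or already uptodate); drop ref */
	/* -EBUSY: not contiguous with the last submission; skip quietly */
}
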
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b27a342c5af..300c2bc00c3 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -66,30 +66,10 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path)
66/* 66/*
67 * B-tree node operations 67 * B-tree node operations
68 */ 68 */
69static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr, 69static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
70 struct buffer_head **bhp)
71{
72 struct address_space *btnc =
73 &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
74 int err;
75
76 err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
77 if (err)
78 return err == -EEXIST ? 0 : err;
79
80 wait_on_buffer(*bhp);
81 if (!buffer_uptodate(*bhp)) {
82 brelse(*bhp);
83 return -EIO;
84 }
85 return 0;
86}
87
88static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
89 __u64 ptr, struct buffer_head **bhp) 70 __u64 ptr, struct buffer_head **bhp)
90{ 71{
91 struct address_space *btnc = 72 struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
92 &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
93 struct buffer_head *bh; 73 struct buffer_head *bh;
94 74
95 bh = nilfs_btnode_create_block(btnc, ptr); 75 bh = nilfs_btnode_create_block(btnc, ptr);
@@ -101,71 +81,55 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
101 return 0; 81 return 0;
102} 82}
103 83
104static inline int 84static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
105nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
106{ 85{
107 return node->bn_flags; 86 return node->bn_flags;
108} 87}
109 88
110static inline void 89static void
111nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) 90nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
112{ 91{
113 node->bn_flags = flags; 92 node->bn_flags = flags;
114} 93}
115 94
116static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) 95static int nilfs_btree_node_root(const struct nilfs_btree_node *node)
117{ 96{
118 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; 97 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
119} 98}
120 99
121static inline int 100static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
122nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
123{ 101{
124 return node->bn_level; 102 return node->bn_level;
125} 103}
126 104
127static inline void 105static void
128nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) 106nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
129{ 107{
130 node->bn_level = level; 108 node->bn_level = level;
131} 109}
132 110
133static inline int 111static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
134nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
135{ 112{
136 return le16_to_cpu(node->bn_nchildren); 113 return le16_to_cpu(node->bn_nchildren);
137} 114}
138 115
139static inline void 116static void
140nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) 117nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
141{ 118{
142 node->bn_nchildren = cpu_to_le16(nchildren); 119 node->bn_nchildren = cpu_to_le16(nchildren);
143} 120}
144 121
145static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) 122static int nilfs_btree_node_size(const struct nilfs_bmap *btree)
146{ 123{
147 return 1 << btree->bt_bmap.b_inode->i_blkbits; 124 return 1 << btree->b_inode->i_blkbits;
148} 125}
149 126
150static inline int 127static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree)
151nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
152 const struct nilfs_btree *btree)
153{ 128{
154 return nilfs_btree_node_root(node) ? 129 return btree->b_nchildren_per_block;
155 NILFS_BTREE_ROOT_NCHILDREN_MIN :
156 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
157} 130}
158 131
159static inline int 132static __le64 *
160nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
161 const struct nilfs_btree *btree)
162{
163 return nilfs_btree_node_root(node) ?
164 NILFS_BTREE_ROOT_NCHILDREN_MAX :
165 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
166}
167
168static inline __le64 *
169nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) 133nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
170{ 134{
171 return (__le64 *)((char *)(node + 1) + 135 return (__le64 *)((char *)(node + 1) +
@@ -173,45 +137,40 @@ nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
173 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); 137 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
174} 138}
175 139
176static inline __le64 * 140static __le64 *
177nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, 141nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax)
178 const struct nilfs_btree *btree)
179{ 142{
180 return (__le64 *)(nilfs_btree_node_dkeys(node) + 143 return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax);
181 nilfs_btree_node_nchildren_max(node, btree));
182} 144}
183 145
184static inline __u64 146static __u64
185nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) 147nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
186{ 148{
187 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); 149 return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index));
188} 150}
189 151
190static inline void 152static void
191nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) 153nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
192{ 154{
193 *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); 155 *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key);
194} 156}
195 157
196static inline __u64 158static __u64
197nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, 159nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index,
198 const struct nilfs_btree_node *node, int index) 160 int ncmax)
199{ 161{
200 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + 162 return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index));
201 index));
202} 163}
203 164
204static inline void 165static void
205nilfs_btree_node_set_ptr(struct nilfs_btree *btree, 166nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr,
206 struct nilfs_btree_node *node, int index, __u64 ptr) 167 int ncmax)
207{ 168{
208 *(nilfs_btree_node_dptrs(node, btree) + index) = 169 *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr);
209 nilfs_bmap_ptr_to_dptr(ptr);
210} 170}
211 171
212static void nilfs_btree_node_init(struct nilfs_btree *btree, 172static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags,
213 struct nilfs_btree_node *node, 173 int level, int nchildren, int ncmax,
214 int flags, int level, int nchildren,
215 const __u64 *keys, const __u64 *ptrs) 174 const __u64 *keys, const __u64 *ptrs)
216{ 175{
217 __le64 *dkeys; 176 __le64 *dkeys;
@@ -223,29 +182,28 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
 	nilfs_btree_node_set_nchildren(node, nchildren);
 
 	dkeys = nilfs_btree_node_dkeys(node);
-	dptrs = nilfs_btree_node_dptrs(node, btree);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
 	for (i = 0; i < nchildren; i++) {
-		dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
-		dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
+		dkeys[i] = cpu_to_le64(keys[i]);
+		dptrs[i] = cpu_to_le64(ptrs[i]);
 	}
 }
 
 /* Assume the buffer heads corresponding to left and right are locked. */
-static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
-				       struct nilfs_btree_node *left,
+static void nilfs_btree_node_move_left(struct nilfs_btree_node *left,
 				       struct nilfs_btree_node *right,
-				       int n)
+				       int n, int lncmax, int rncmax)
 {
 	__le64 *ldkeys, *rdkeys;
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
 	ldkeys = nilfs_btree_node_dkeys(left);
-	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
 	lnchildren = nilfs_btree_node_get_nchildren(left);
 
 	rdkeys = nilfs_btree_node_dkeys(right);
-	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
 	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
@@ -260,21 +218,20 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
 }
 
 /* Assume that the buffer heads corresponding to left and right are locked. */
-static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
-					struct nilfs_btree_node *left,
+static void nilfs_btree_node_move_right(struct nilfs_btree_node *left,
 					struct nilfs_btree_node *right,
-					int n)
+					int n, int lncmax, int rncmax)
 {
 	__le64 *ldkeys, *rdkeys;
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
 	ldkeys = nilfs_btree_node_dkeys(left);
-	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
 	lnchildren = nilfs_btree_node_get_nchildren(left);
 
 	rdkeys = nilfs_btree_node_dkeys(right);
-	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
 	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
@@ -289,16 +246,15 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
-static void nilfs_btree_node_insert(struct nilfs_btree *btree,
-				    struct nilfs_btree_node *node,
-				    __u64 key, __u64 ptr, int index)
+static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index,
+				    __u64 key, __u64 ptr, int ncmax)
 {
 	__le64 *dkeys;
 	__le64 *dptrs;
 	int nchildren;
 
 	dkeys = nilfs_btree_node_dkeys(node);
-	dptrs = nilfs_btree_node_dptrs(node, btree);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (index < nchildren) {
 		memmove(dkeys + index + 1, dkeys + index,
@@ -306,16 +262,15 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
 		memmove(dptrs + index + 1, dptrs + index,
 			(nchildren - index) * sizeof(*dptrs));
 	}
-	dkeys[index] = nilfs_bmap_key_to_dkey(key);
-	dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
+	dkeys[index] = cpu_to_le64(key);
+	dptrs[index] = cpu_to_le64(ptr);
 	nchildren++;
 	nilfs_btree_node_set_nchildren(node, nchildren);
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
-static void nilfs_btree_node_delete(struct nilfs_btree *btree,
-				    struct nilfs_btree_node *node,
-				    __u64 *keyp, __u64 *ptrp, int index)
+static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index,
+				    __u64 *keyp, __u64 *ptrp, int ncmax)
 {
 	__u64 key;
 	__u64 ptr;
@@ -324,9 +279,9 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
 	int nchildren;
 
 	dkeys = nilfs_btree_node_dkeys(node);
-	dptrs = nilfs_btree_node_dptrs(node, btree);
-	key = nilfs_bmap_dkey_to_key(dkeys[index]);
-	ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
+	key = le64_to_cpu(dkeys[index]);
+	ptr = le64_to_cpu(dptrs[index]);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (keyp != NULL)
 		*keyp = key;
@@ -382,40 +337,92 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
 		return s == 0;
 }
 
-static inline struct nilfs_btree_node *
-nilfs_btree_get_root(const struct nilfs_btree *btree)
+/**
+ * nilfs_btree_node_broken - verify consistency of btree node
+ * @node: btree node block to be examined
+ * @size: node size (in bytes)
+ * @blocknr: block number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
+				   size_t size, sector_t blocknr)
 {
-	return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data;
+	int level, flags, nchildren;
+	int ret = 0;
+
+	level = nilfs_btree_node_get_level(node);
+	flags = nilfs_btree_node_get_flags(node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+
+	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+		     level >= NILFS_BTREE_LEVEL_MAX ||
+		     (flags & NILFS_BTREE_NODE_ROOT) ||
+		     nchildren < 0 ||
+		     nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
+		printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
+		       "level = %d, flags = 0x%x, nchildren = %d\n",
+		       (unsigned long long)blocknr, level, flags, nchildren);
+		ret = 1;
+	}
+	return ret;
 }
 
-static inline struct nilfs_btree_node *
+int nilfs_btree_broken_node_block(struct buffer_head *bh)
+{
+	int ret;
+
+	if (buffer_nilfs_checked(bh))
+		return 0;
+
+	ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
+				      bh->b_size, bh->b_blocknr);
+	if (likely(!ret))
+		set_buffer_nilfs_checked(bh);
+	return ret;
+}
+
+static struct nilfs_btree_node *
+nilfs_btree_get_root(const struct nilfs_bmap *btree)
+{
+	return (struct nilfs_btree_node *)btree->b_u.u_data;
+}
+
+static struct nilfs_btree_node *
 nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
 }
 
-static inline struct nilfs_btree_node *
+static struct nilfs_btree_node *
 nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
 }
 
-static inline int nilfs_btree_height(const struct nilfs_btree *btree)
+static int nilfs_btree_height(const struct nilfs_bmap *btree)
 {
 	return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
 }
 
-static inline struct nilfs_btree_node *
-nilfs_btree_get_node(const struct nilfs_btree *btree,
+static struct nilfs_btree_node *
+nilfs_btree_get_node(const struct nilfs_bmap *btree,
 		     const struct nilfs_btree_path *path,
-		     int level)
+		     int level, int *ncmaxp)
 {
-	return (level == nilfs_btree_height(btree) - 1) ?
-		nilfs_btree_get_root(btree) :
-		nilfs_btree_get_nonroot_node(path, level);
+	struct nilfs_btree_node *node;
+
+	if (level == nilfs_btree_height(btree) - 1) {
+		node = nilfs_btree_get_root(btree);
+		*ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX;
+	} else {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		*ncmaxp = nilfs_btree_nchildren_per_block(btree);
+	}
+	return node;
 }
 
-static inline int
+static int
 nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
 {
 	if (unlikely(nilfs_btree_node_get_level(node) != level)) {
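The ncmaxp out-parameter is what lets callers stay layout-agnostic: the root node lives inline in the bmap union and holds at most NILFS_BTREE_ROOT_NCHILDREN_MAX children, while every other node fills a whole disk block. A hedged sketch of the resulting calling pattern, using only functions defined in this patch (the walk itself is illustrative; error handling and buffer management are omitted):

/* Illustrative only: descend from the root toward 'minlevel',
 * fetching each level's child pointer with the capacity that
 * nilfs_btree_get_node() reports for that level.
 */
static void sketch_walk(const struct nilfs_bmap *btree,
			const struct nilfs_btree_path *path, int minlevel)
{
	struct nilfs_btree_node *node;
	int level, ncmax;

	for (level = nilfs_btree_height(btree) - 1; level >= minlevel;
	     level--) {
		/* ncmax comes back as NILFS_BTREE_ROOT_NCHILDREN_MAX at
		 * the top level, nchildren_per_block everywhere else */
		node = nilfs_btree_get_node(btree, path, level, &ncmax);
		(void)nilfs_btree_node_get_ptr(node, path[level].bp_index,
					       ncmax);
	}
}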
@@ -427,13 +434,83 @@ nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
 	return 0;
 }
 
-static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
+struct nilfs_btree_readahead_info {
+	struct nilfs_btree_node *node;	/* parent node */
+	int max_ra_blocks;		/* max nof blocks to read ahead */
+	int index;			/* current index on the parent node */
+	int ncmax;			/* nof children in the parent node */
+};
+
+static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
+				   struct buffer_head **bhp,
+				   const struct nilfs_btree_readahead_info *ra)
+{
+	struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+	struct buffer_head *bh, *ra_bh;
+	sector_t submit_ptr = 0;
+	int ret;
+
+	ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr);
+	if (ret) {
+		if (ret != -EEXIST)
+			return ret;
+		goto out_check;
+	}
+
+	if (ra) {
+		int i, n;
+		__u64 ptr2;
+
+		/* read ahead sibling nodes */
+		for (n = ra->max_ra_blocks, i = ra->index + 1;
+		     n > 0 && i < ra->ncmax; n--, i++) {
+			ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
+
+			ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA,
+							&ra_bh, &submit_ptr);
+			if (likely(!ret || ret == -EEXIST))
+				brelse(ra_bh);
+			else if (ret != -EBUSY)
+				break;
+			if (!buffer_locked(bh))
+				goto out_no_wait;
+		}
+	}
+
+	wait_on_buffer(bh);
+
+ out_no_wait:
+	if (!buffer_uptodate(bh)) {
+		brelse(bh);
+		return -EIO;
+	}
+
+ out_check:
+	if (nilfs_btree_broken_node_block(bh)) {
+		clear_buffer_uptodate(bh);
+		brelse(bh);
+		return -EINVAL;
+	}
+
+	*bhp = bh;
+	return 0;
+}
+
+static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
+				 struct buffer_head **bhp)
+{
+	return __nilfs_btree_get_block(btree, ptr, bhp, NULL);
+}
+
+static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
 				 struct nilfs_btree_path *path,
-				 __u64 key, __u64 *ptrp, int minlevel)
+				 __u64 key, __u64 *ptrp, int minlevel,
+				 int readahead)
 {
 	struct nilfs_btree_node *node;
+	struct nilfs_btree_readahead_info p, *ra;
 	__u64 ptr;
-	int level, index, found, ret;
+	int level, index, found, ncmax, ret;
 
 	node = nilfs_btree_get_root(btree);
 	level = nilfs_btree_node_get_level(node);
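The readahead plumbing above means a caller only has to describe the parent node; __nilfs_btree_get_block() then issues up to max_ra_blocks READA requests for the following sibling pointers. A hedged caller-side sketch, mirroring the setup nilfs_btree_do_lookup() performs at the leaf level in this patch (the helper name is invented for illustration):

/* Illustrative only: fetch the child at 'index' of 'parent' and ask
 * for up to seven sibling blocks to be read ahead, as the lookup path
 * does at NILFS_BTREE_LEVEL_NODE_MIN.
 */
static int sketch_get_child_with_ra(const struct nilfs_bmap *btree,
				    struct nilfs_btree_node *parent,
				    int index, int ncmax,
				    struct buffer_head **bhp)
{
	struct nilfs_btree_readahead_info ra = {
		.node = parent,
		.index = index,
		.ncmax = ncmax,
		.max_ra_blocks = 7,
	};
	__u64 ptr = nilfs_btree_node_get_ptr(parent, index, ncmax);

	return __nilfs_btree_get_block(btree, ptr, bhp, &ra);
}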
@@ -441,14 +518,27 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 		return -ENOENT;
 
 	found = nilfs_btree_node_lookup(node, key, &index);
-	ptr = nilfs_btree_node_get_ptr(btree, node, index);
+	ptr = nilfs_btree_node_get_ptr(node, index,
+				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
 
-	for (level--; level >= minlevel; level--) {
-		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
+	ncmax = nilfs_btree_nchildren_per_block(btree);
+
+	while (--level >= minlevel) {
+		ra = NULL;
+		if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) {
+			p.node = nilfs_btree_get_node(btree, path, level + 1,
+						      &p.ncmax);
+			p.index = index;
+			p.max_ra_blocks = 7;
+			ra = &p;
+		}
+		ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh,
+					      ra);
 		if (ret < 0)
 			return ret;
+
 		node = nilfs_btree_get_nonroot_node(path, level);
 		if (nilfs_btree_bad_node(node, level))
 			return -EINVAL;
@@ -456,9 +546,9 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 			found = nilfs_btree_node_lookup(node, key, &index);
 		else
 			index = 0;
-		if (index < nilfs_btree_node_nchildren_max(node, btree))
-			ptr = nilfs_btree_node_get_ptr(btree, node, index);
-		else {
+		if (index < ncmax) {
+			ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
+		} else {
 			WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
 			/* insert */
 			ptr = NILFS_BMAP_INVALID_PTR;
@@ -474,22 +564,24 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 	return 0;
 }
 
-static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
+static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
 				      struct nilfs_btree_path *path,
 				      __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node;
 	__u64 ptr;
-	int index, level, ret;
+	int index, level, ncmax, ret;
 
 	node = nilfs_btree_get_root(btree);
 	index = nilfs_btree_node_get_nchildren(node) - 1;
 	if (index < 0)
 		return -ENOENT;
 	level = nilfs_btree_node_get_level(node);
-	ptr = nilfs_btree_node_get_ptr(btree, node, index);
+	ptr = nilfs_btree_node_get_ptr(node, index,
+				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
+	ncmax = nilfs_btree_nchildren_per_block(btree);
 
 	for (level--; level > 0; level--) {
 		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
@@ -499,7 +591,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 		if (nilfs_btree_bad_node(node, level))
 			return -EINVAL;
 		index = nilfs_btree_node_get_nchildren(node) - 1;
-		ptr = nilfs_btree_node_get_ptr(btree, node, index);
+		ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
 		path[level].bp_index = index;
 	}
 
@@ -511,51 +603,45 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 	return 0;
 }
 
-static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
+static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
 			      __u64 key, int level, __u64 *ptrp)
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
-	__u64 ptr;
 	int ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
 
-	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
-
-	if (ptrp != NULL)
-		*ptrp = ptr;
+	ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0);
 
 	nilfs_btree_free_path(path);
 
 	return ret;
 }
 
-static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
+static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
 				     __u64 key, __u64 *ptrp, unsigned maxblocks)
 {
-	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
 	struct nilfs_btree_path *path;
 	struct nilfs_btree_node *node;
 	struct inode *dat = NULL;
 	__u64 ptr, ptr2;
 	sector_t blocknr;
 	int level = NILFS_BTREE_LEVEL_NODE_MIN;
-	int ret, cnt, index, maxlevel;
+	int ret, cnt, index, maxlevel, ncmax;
+	struct nilfs_btree_readahead_info p;
 
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
 
-	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1);
 	if (ret < 0)
 		goto out;
 
-	if (NILFS_BMAP_USE_VBN(bmap)) {
-		dat = nilfs_bmap_get_dat(bmap);
+	if (NILFS_BMAP_USE_VBN(btree)) {
+		dat = nilfs_bmap_get_dat(btree);
 		ret = nilfs_dat_translate(dat, ptr, &blocknr);
 		if (ret < 0)
 			goto out;
@@ -566,14 +652,14 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		goto end;
 
 	maxlevel = nilfs_btree_height(btree) - 1;
-	node = nilfs_btree_get_node(btree, path, level);
+	node = nilfs_btree_get_node(btree, path, level, &ncmax);
 	index = path[level].bp_index + 1;
 	for (;;) {
 		while (index < nilfs_btree_node_get_nchildren(node)) {
 			if (nilfs_btree_node_get_key(node, index) !=
 			    key + cnt)
 				goto end;
-			ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+			ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax);
 			if (dat) {
 				ret = nilfs_dat_translate(dat, ptr2, &blocknr);
 				if (ret < 0)
@@ -589,20 +675,24 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 			break;
 
 		/* look-up right sibling node */
-		node = nilfs_btree_get_node(btree, path, level + 1);
-		index = path[level + 1].bp_index + 1;
-		if (index >= nilfs_btree_node_get_nchildren(node) ||
-		    nilfs_btree_node_get_key(node, index) != key + cnt)
+		p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax);
+		p.index = path[level + 1].bp_index + 1;
+		p.max_ra_blocks = 7;
+		if (p.index >= nilfs_btree_node_get_nchildren(p.node) ||
+		    nilfs_btree_node_get_key(p.node, p.index) != key + cnt)
 			break;
-		ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
-		path[level + 1].bp_index = index;
+		ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax);
+		path[level + 1].bp_index = p.index;
 
 		brelse(path[level].bp_bh);
 		path[level].bp_bh = NULL;
-		ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
+
+		ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh,
+					      &p);
 		if (ret < 0)
 			goto out;
 		node = nilfs_btree_get_nonroot_node(path, level);
+		ncmax = nilfs_btree_nchildren_per_block(btree);
 		index = 0;
 		path[level].bp_index = index;
 	}
@@ -614,7 +704,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	return ret;
 }
 
-static void nilfs_btree_promote_key(struct nilfs_btree *btree,
+static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 key)
 {
@@ -636,16 +726,18 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 	}
 }
 
-static void nilfs_btree_do_insert(struct nilfs_btree *btree,
+static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
 				  struct nilfs_btree_path *path,
 				  int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node;
+	int ncblk;
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		node = nilfs_btree_get_nonroot_node(path, level);
-		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
-					path[level].bp_index);
+		ncblk = nilfs_btree_nchildren_per_block(btree);
+		nilfs_btree_node_insert(node, path[level].bp_index,
+					*keyp, *ptrp, ncblk);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
 
@@ -655,22 +747,24 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 					      0));
 	} else {
 		node = nilfs_btree_get_root(btree);
-		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
-					path[level].bp_index);
+		nilfs_btree_node_insert(node, path[level].bp_index,
+					*keyp, *ptrp,
+					NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	}
 }
 
-static void nilfs_btree_carry_left(struct nilfs_btree *btree,
+static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
 				   struct nilfs_btree_path *path,
 				   int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *left;
-	int nchildren, lnchildren, n, move;
+	int nchildren, lnchildren, n, move, ncblk;
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	lnchildren = nilfs_btree_node_get_nchildren(left);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 	move = 0;
 
 	n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -680,7 +774,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 		move = 1;
 	}
 
-	nilfs_btree_node_move_left(btree, left, node, n);
+	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -705,17 +799,18 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 }
 
-static void nilfs_btree_carry_right(struct nilfs_btree *btree,
+static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *right;
-	int nchildren, rnchildren, n, move;
+	int nchildren, rnchildren, n, move, ncblk;
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	rnchildren = nilfs_btree_node_get_nchildren(right);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 	move = 0;
 
 	n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -725,7 +820,7 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 		move = 1;
 	}
 
-	nilfs_btree_node_move_right(btree, node, right, n);
+	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -751,18 +846,19 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 }
 
-static void nilfs_btree_split(struct nilfs_btree *btree,
+static void nilfs_btree_split(struct nilfs_bmap *btree,
 			      struct nilfs_btree_path *path,
 			      int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *right;
 	__u64 newkey;
 	__u64 newptr;
-	int nchildren, n, move;
+	int nchildren, n, move, ncblk;
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 	move = 0;
 
 	n = (nchildren + 1) / 2;
@@ -771,7 +867,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 		move = 1;
 	}
 
-	nilfs_btree_node_move_right(btree, node, right, n);
+	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -783,8 +879,8 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 
 	if (move) {
 		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
-		nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
-					path[level].bp_index);
+		nilfs_btree_node_insert(right, path[level].bp_index,
+					*keyp, *ptrp, ncblk);
 
 		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
@@ -805,19 +901,21 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 		path[level + 1].bp_index++;
 }
 
-static void nilfs_btree_grow(struct nilfs_btree *btree,
+static void nilfs_btree_grow(struct nilfs_bmap *btree,
 			     struct nilfs_btree_path *path,
 			     int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *root, *child;
-	int n;
+	int n, ncblk;
 
 	root = nilfs_btree_get_root(btree);
 	child = nilfs_btree_get_sib_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = nilfs_btree_node_get_nchildren(root);
 
-	nilfs_btree_node_move_right(btree, root, child, n);
+	nilfs_btree_node_move_right(root, child, n,
+				    NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
 	nilfs_btree_node_set_level(root, level + 1);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
@@ -832,11 +930,11 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 	*ptrp = path[level].bp_newreq.bpr_ptr;
 }
 
-static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
+static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree,
 				   const struct nilfs_btree_path *path)
 {
 	struct nilfs_btree_node *node;
-	int level;
+	int level, ncmax;
 
 	if (path == NULL)
 		return NILFS_BMAP_INVALID_PTR;
@@ -844,29 +942,30 @@ static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
 	/* left sibling */
 	level = NILFS_BTREE_LEVEL_NODE_MIN;
 	if (path[level].bp_index > 0) {
-		node = nilfs_btree_get_node(btree, path, level);
-		return nilfs_btree_node_get_ptr(btree, node,
-						path[level].bp_index - 1);
+		node = nilfs_btree_get_node(btree, path, level, &ncmax);
+		return nilfs_btree_node_get_ptr(node,
+						path[level].bp_index - 1,
+						ncmax);
 	}
 
 	/* parent */
 	level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
 	if (level <= nilfs_btree_height(btree) - 1) {
-		node = nilfs_btree_get_node(btree, path, level);
-		return nilfs_btree_node_get_ptr(btree, node,
-						path[level].bp_index);
+		node = nilfs_btree_get_node(btree, path, level, &ncmax);
+		return nilfs_btree_node_get_ptr(node, path[level].bp_index,
+						ncmax);
 	}
 
 	return NILFS_BMAP_INVALID_PTR;
 }
 
-static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
+static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
 				       const struct nilfs_btree_path *path,
 				       __u64 key)
 {
 	__u64 ptr;
 
-	ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
+	ptr = nilfs_bmap_find_target_seq(btree, key);
 	if (ptr != NILFS_BMAP_INVALID_PTR)
 		/* sequential access */
 		return ptr;
@@ -877,17 +976,10 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
 		return ptr;
 	}
 	/* block group */
-	return nilfs_bmap_find_target_in_group(&btree->bt_bmap);
-}
-
-static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
-				     __u64 ptr)
-{
-	btree->bt_bmap.b_last_allocated_key = key;
-	btree->bt_bmap.b_last_allocated_ptr = ptr;
+	return nilfs_bmap_find_target_in_group(btree);
 }
 
-static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree,
 				      struct nilfs_btree_path *path,
 				      int *levelp, __u64 key, __u64 ptr,
 				      struct nilfs_bmap_stats *stats)
@@ -895,79 +987,78 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	struct buffer_head *bh;
 	struct nilfs_btree_node *node, *parent, *sib;
 	__u64 sibptr;
-	int pindex, level, ret;
+	int pindex, level, ncmax, ncblk, ret;
 	struct inode *dat = NULL;
 
 	stats->bs_nblocks = 0;
 	level = NILFS_BTREE_LEVEL_DATA;
 
 	/* allocate a new ptr for data block */
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
+	if (NILFS_BMAP_USE_VBN(btree)) {
 		path[level].bp_newreq.bpr_ptr =
 			nilfs_btree_find_target_v(btree, path, key);
-		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+		dat = nilfs_bmap_get_dat(btree);
 	}
 
-	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq, dat);
+	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_data;
 
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
 		node = nilfs_btree_get_nonroot_node(path, level);
-		if (nilfs_btree_node_get_nchildren(node) <
-		    nilfs_btree_node_nchildren_max(node, btree)) {
+		if (nilfs_btree_node_get_nchildren(node) < ncblk) {
 			path[level].bp_op = nilfs_btree_do_insert;
 			stats->bs_nblocks++;
 			goto out;
 		}
 
-		parent = nilfs_btree_get_node(btree, path, level + 1);
+		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
 		pindex = path[level + 1].bp_index;
 
 		/* left sibling */
 		if (pindex > 0) {
-			sibptr = nilfs_btree_node_get_ptr(btree, parent,
-							  pindex - 1);
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
+							  ncmax);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(sib) <
-			    nilfs_btree_node_nchildren_max(sib, btree)) {
+			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_left;
 				stats->bs_nblocks++;
 				goto out;
-			} else
+			} else {
 				brelse(bh);
+			}
 		}
 
 		/* right sibling */
-		if (pindex <
-		    nilfs_btree_node_get_nchildren(parent) - 1) {
-			sibptr = nilfs_btree_node_get_ptr(btree, parent,
-							  pindex + 1);
+		if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) {
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
+							  ncmax);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(sib) <
-			    nilfs_btree_node_nchildren_max(sib, btree)) {
+			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_right;
 				stats->bs_nblocks++;
 				goto out;
-			} else
+			} else {
 				brelse(bh);
+			}
 		}
 
 		/* split */
 		path[level].bp_newreq.bpr_ptr =
 			path[level - 1].bp_newreq.bpr_ptr + 1;
-		ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+		ret = nilfs_bmap_prepare_alloc_ptr(btree,
 						   &path[level].bp_newreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
@@ -979,9 +1070,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 		stats->bs_nblocks++;
 
-		nilfs_btree_node_init(btree,
-				      (struct nilfs_btree_node *)bh->b_data,
-				      0, level, 0, NULL, NULL);
+		sib = (struct nilfs_btree_node *)bh->b_data;
+		nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL);
 		path[level].bp_sib_bh = bh;
 		path[level].bp_op = nilfs_btree_split;
 	}
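Worth noting for readers: prepare_insert only records a strategy per level in path[level].bp_op (do_insert, carry_left, carry_right, split, or grow) and counts blocks; nothing is modified until the commit phase replays those callbacks bottom-up, as nilfs_btree_commit_insert() does further down in this diff. A condensed, hedged sketch of that two-phase shape (the wrapper name is invented; it mirrors nilfs_btree_insert() rather than quoting it):

/* Illustrative two-phase shape: prepare picks an op per level and can
 * roll itself back on failure; commit replays the chosen ops from the
 * leaf level upward.  Not code from the patch.
 */
static int sketch_insert(struct nilfs_bmap *btree,
			 struct nilfs_btree_path *path,
			 __u64 key, __u64 ptr)
{
	struct nilfs_bmap_stats stats;
	int level, ret;

	ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr,
					 &stats);
	if (ret < 0)
		return ret;	/* prepare already released resources */

	/* commit phase: each level runs the op chosen for it */
	nilfs_btree_commit_insert(btree, path, level, key, ptr);
	return 0;
}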
@@ -989,7 +1079,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	/* root */
 	node = nilfs_btree_get_root(btree);
 	if (nilfs_btree_node_get_nchildren(node) <
-	    nilfs_btree_node_nchildren_max(node, btree)) {
+	    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
 		path[level].bp_op = nilfs_btree_do_insert;
 		stats->bs_nblocks++;
 		goto out;
@@ -997,8 +1087,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* grow */
 	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
-	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq, dat);
+	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1006,8 +1095,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	if (ret < 0)
 		goto err_out_curr_node;
 
-	nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
-			      0, level, 0, NULL, NULL);
+	nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data,
+			      0, level, 0, ncblk, NULL, NULL);
 	path[level].bp_sib_bh = bh;
 	path[level].bp_op = nilfs_btree_grow;
 
@@ -1024,25 +1113,22 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
-				   dat);
+	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
  err_out_child_node:
 	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
 		nilfs_btnode_delete(path[level].bp_sib_bh);
-		nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq, dat);
+		nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
 
 	}
 
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
-				   dat);
+	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
  err_out_data:
 	*levelp = level;
 	stats->bs_nblocks = 0;
 	return ret;
 }
 
-static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
+static void nilfs_btree_commit_insert(struct nilfs_bmap *btree,
 				      struct nilfs_btree_path *path,
 				      int maxlevel, __u64 key, __u64 ptr)
 {
@@ -1051,35 +1137,33 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
 
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
-		nilfs_btree_set_target_v(btree, key, ptr);
-		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	if (NILFS_BMAP_USE_VBN(btree)) {
+		nilfs_bmap_set_target_v(btree, key, ptr);
+		dat = nilfs_bmap_get_dat(btree);
 	}
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-		nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
+		nilfs_bmap_commit_alloc_ptr(btree,
 					    &path[level - 1].bp_newreq, dat);
 		path[level].bp_op(btree, path, level, &key, &ptr);
 	}
 
-	if (!nilfs_bmap_dirty(&btree->bt_bmap))
-		nilfs_bmap_set_dirty(&btree->bt_bmap);
+	if (!nilfs_bmap_dirty(btree))
+		nilfs_bmap_set_dirty(btree);
 }
 
-static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
+static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	struct nilfs_bmap_stats stats;
 	int level, ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
-				    NILFS_BTREE_LEVEL_NODE_MIN);
+				    NILFS_BTREE_LEVEL_NODE_MIN, 0);
 	if (ret != -ENOENT) {
 		if (ret == 0)
 			ret = -EEXIST;
@@ -1090,23 +1174,25 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	if (ret < 0)
 		goto out;
 	nilfs_btree_commit_insert(btree, path, level, key, ptr);
-	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+	nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
 
  out:
 	nilfs_btree_free_path(path);
 	return ret;
 }
 
-static void nilfs_btree_do_delete(struct nilfs_btree *btree,
+static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
 				  struct nilfs_btree_path *path,
 				  int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node;
+	int ncblk;
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		node = nilfs_btree_get_nonroot_node(path, level);
-		nilfs_btree_node_delete(btree, node, keyp, ptrp,
-					path[level].bp_index);
+		ncblk = nilfs_btree_nchildren_per_block(btree);
+		nilfs_btree_node_delete(node, path[level].bp_index,
+					keyp, ptrp, ncblk);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
 		if (path[level].bp_index == 0)
@@ -1114,17 +1200,18 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
 					 nilfs_btree_node_get_key(node, 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
-		nilfs_btree_node_delete(btree, node, keyp, ptrp,
-					path[level].bp_index);
+		nilfs_btree_node_delete(node, path[level].bp_index,
+					keyp, ptrp,
+					NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	}
 }
 
-static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
+static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *left;
-	int nchildren, lnchildren, n;
+	int nchildren, lnchildren, n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
@@ -1132,10 +1219,11 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	lnchildren = nilfs_btree_node_get_nchildren(left);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = (nchildren + lnchildren) / 2 - nchildren;
 
-	nilfs_btree_node_move_right(btree, left, node, n);
+	nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1150,12 +1238,12 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	path[level].bp_index += n;
 }
 
-static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
+static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
 				     struct nilfs_btree_path *path,
 				     int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *right;
-	int nchildren, rnchildren, n;
+	int nchildren, rnchildren, n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
@@ -1163,10 +1251,11 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	rnchildren = nilfs_btree_node_get_nchildren(right);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = (nchildren + rnchildren) / 2 - nchildren;
 
-	nilfs_btree_node_move_left(btree, node, right, n);
+	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1182,21 +1271,22 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 	path[level].bp_sib_bh = NULL;
 }
 
-static void nilfs_btree_concat_left(struct nilfs_btree *btree,
+static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *left;
-	int n;
+	int n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = nilfs_btree_node_get_nchildren(node);
 
-	nilfs_btree_node_move_left(btree, left, node, n);
+	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -1207,21 +1297,22 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	path[level].bp_index += nilfs_btree_node_get_nchildren(left);
 }
 
-static void nilfs_btree_concat_right(struct nilfs_btree *btree,
+static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
 				     struct nilfs_btree_path *path,
 				     int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *right;
-	int n;
+	int n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = nilfs_btree_node_get_nchildren(right);
 
-	nilfs_btree_node_move_left(btree, node, right, n);
+	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1231,29 +1322,32 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 	path[level + 1].bp_index++;
 }
 
-static void nilfs_btree_shrink(struct nilfs_btree *btree,
+static void nilfs_btree_shrink(struct nilfs_bmap *btree,
 			       struct nilfs_btree_path *path,
 			       int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *root, *child;
-	int n;
+	int n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
 	root = nilfs_btree_get_root(btree);
 	child = nilfs_btree_get_nonroot_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
-	nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
+	nilfs_btree_node_delete(root, 0, NULL, NULL,
+				NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	nilfs_btree_node_set_level(root, level);
 	n = nilfs_btree_node_get_nchildren(child);
-	nilfs_btree_node_move_left(btree, root, child, n);
+	nilfs_btree_node_move_left(root, child, n,
+				   NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
 
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = NULL;
 }
 
 
-static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
 				      struct nilfs_btree_path *path,
 				      int *levelp,
 				      struct nilfs_bmap_stats *stats,
@@ -1262,42 +1356,43 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 	struct buffer_head *bh;
 	struct nilfs_btree_node *node, *parent, *sib;
 	__u64 sibptr;
-	int pindex, level, ret;
+	int pindex, level, ncmin, ncmax, ncblk, ret;
 
 	ret = 0;
 	stats->bs_nblocks = 0;
+	ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
 		node = nilfs_btree_get_nonroot_node(path, level);
 		path[level].bp_oldreq.bpr_ptr =
-			nilfs_btree_node_get_ptr(btree, node,
-						 path[level].bp_index);
-		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+			nilfs_btree_node_get_ptr(node, path[level].bp_index,
+						 ncblk);
+		ret = nilfs_bmap_prepare_end_ptr(btree,
 						 &path[level].bp_oldreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
 
-		if (nilfs_btree_node_get_nchildren(node) >
-		    nilfs_btree_node_nchildren_min(node, btree)) {
+		if (nilfs_btree_node_get_nchildren(node) > ncmin) {
 			path[level].bp_op = nilfs_btree_do_delete;
 			stats->bs_nblocks++;
 			goto out;
 		}
 
-		parent = nilfs_btree_get_node(btree, path, level + 1);
+		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
 		pindex = path[level + 1].bp_index;
 
 		if (pindex > 0) {
 			/* left sibling */
-			sibptr = nilfs_btree_node_get_ptr(btree, parent,
-							  pindex - 1);
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
+							  ncmax);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(sib) >
-			    nilfs_btree_node_nchildren_min(sib, btree)) {
+			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_left;
 				stats->bs_nblocks++;
@@ -1311,14 +1406,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		} else if (pindex <
 			   nilfs_btree_node_get_nchildren(parent) - 1) {
 			/* right sibling */
-			sibptr = nilfs_btree_node_get_ptr(btree, parent,
-							  pindex + 1);
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
+							  ncmax);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(sib) >
-			    nilfs_btree_node_nchildren_min(sib, btree)) {
+			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_right;
 				stats->bs_nblocks++;
@@ -1349,10 +1443,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1349 1443
1350 node = nilfs_btree_get_root(btree); 1444 node = nilfs_btree_get_root(btree);
1351 path[level].bp_oldreq.bpr_ptr = 1445 path[level].bp_oldreq.bpr_ptr =
1352 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); 1446 nilfs_btree_node_get_ptr(node, path[level].bp_index,
1447 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1353 1448
1354 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1449 ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
1355 &path[level].bp_oldreq, dat);
1356 if (ret < 0) 1450 if (ret < 0)
1357 goto err_out_child_node; 1451 goto err_out_child_node;
1358 1452
@@ -1367,75 +1461,68 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1367 1461
1368 /* error */ 1462 /* error */
1369 err_out_curr_node: 1463 err_out_curr_node:
1370 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); 1464 nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
1371 err_out_child_node: 1465 err_out_child_node:
1372 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { 1466 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1373 brelse(path[level].bp_sib_bh); 1467 brelse(path[level].bp_sib_bh);
1374 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, 1468 nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
1375 &path[level].bp_oldreq, dat);
1376 } 1469 }
1377 *levelp = level; 1470 *levelp = level;
1378 stats->bs_nblocks = 0; 1471 stats->bs_nblocks = 0;
1379 return ret; 1472 return ret;
1380} 1473}
1381 1474
1382static void nilfs_btree_commit_delete(struct nilfs_btree *btree, 1475static void nilfs_btree_commit_delete(struct nilfs_bmap *btree,
1383 struct nilfs_btree_path *path, 1476 struct nilfs_btree_path *path,
1384 int maxlevel, struct inode *dat) 1477 int maxlevel, struct inode *dat)
1385{ 1478{
1386 int level; 1479 int level;
1387 1480
1388 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1481 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1389 nilfs_bmap_commit_end_ptr(&btree->bt_bmap, 1482 nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat);
1390 &path[level].bp_oldreq, dat);
1391 path[level].bp_op(btree, path, level, NULL, NULL); 1483 path[level].bp_op(btree, path, level, NULL, NULL);
1392 } 1484 }
1393 1485
1394 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 1486 if (!nilfs_bmap_dirty(btree))
1395 nilfs_bmap_set_dirty(&btree->bt_bmap); 1487 nilfs_bmap_set_dirty(btree);
1396} 1488}
1397 1489
1398static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) 1490static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
1399 1491
1400{ 1492{
1401 struct nilfs_btree *btree;
1402 struct nilfs_btree_path *path; 1493 struct nilfs_btree_path *path;
1403 struct nilfs_bmap_stats stats; 1494 struct nilfs_bmap_stats stats;
1404 struct inode *dat; 1495 struct inode *dat;
1405 int level, ret; 1496 int level, ret;
1406 1497
1407 btree = (struct nilfs_btree *)bmap;
1408 path = nilfs_btree_alloc_path(); 1498 path = nilfs_btree_alloc_path();
1409 if (path == NULL) 1499 if (path == NULL)
1410 return -ENOMEM; 1500 return -ENOMEM;
1411 1501
1412 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1502 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1413 NILFS_BTREE_LEVEL_NODE_MIN); 1503 NILFS_BTREE_LEVEL_NODE_MIN, 0);
1414 if (ret < 0) 1504 if (ret < 0)
1415 goto out; 1505 goto out;
1416 1506
1417 1507
1418 dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? 1508 dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
1419 nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
1420 1509
1421 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); 1510 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
1422 if (ret < 0) 1511 if (ret < 0)
1423 goto out; 1512 goto out;
1424 nilfs_btree_commit_delete(btree, path, level, dat); 1513 nilfs_btree_commit_delete(btree, path, level, dat);
1425 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1514 nilfs_bmap_sub_blocks(btree, stats.bs_nblocks);
1426 1515
1427out: 1516out:
1428 nilfs_btree_free_path(path); 1517 nilfs_btree_free_path(path);
1429 return ret; 1518 return ret;
1430} 1519}
1431 1520
1432static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) 1521static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
1433{ 1522{
1434 struct nilfs_btree *btree;
1435 struct nilfs_btree_path *path; 1523 struct nilfs_btree_path *path;
1436 int ret; 1524 int ret;
1437 1525
1438 btree = (struct nilfs_btree *)bmap;
1439 path = nilfs_btree_alloc_path(); 1526 path = nilfs_btree_alloc_path();
1440 if (path == NULL) 1527 if (path == NULL)
1441 return -ENOMEM; 1528 return -ENOMEM;
@@ -1447,16 +1534,14 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1447 return ret; 1534 return ret;
1448} 1535}
1449 1536
1450static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) 1537static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
1451{ 1538{
1452 struct buffer_head *bh; 1539 struct buffer_head *bh;
1453 struct nilfs_btree *btree;
1454 struct nilfs_btree_node *root, *node; 1540 struct nilfs_btree_node *root, *node;
1455 __u64 maxkey, nextmaxkey; 1541 __u64 maxkey, nextmaxkey;
1456 __u64 ptr; 1542 __u64 ptr;
1457 int nchildren, ret; 1543 int nchildren, ret;
1458 1544
1459 btree = (struct nilfs_btree *)bmap;
1460 root = nilfs_btree_get_root(btree); 1545 root = nilfs_btree_get_root(btree);
1461 switch (nilfs_btree_height(btree)) { 1546 switch (nilfs_btree_height(btree)) {
1462 case 2: 1547 case 2:
@@ -1467,7 +1552,8 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1467 nchildren = nilfs_btree_node_get_nchildren(root); 1552 nchildren = nilfs_btree_node_get_nchildren(root);
1468 if (nchildren > 1) 1553 if (nchildren > 1)
1469 return 0; 1554 return 0;
1470 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1555 ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
1556 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1471 ret = nilfs_btree_get_block(btree, ptr, &bh); 1557 ret = nilfs_btree_get_block(btree, ptr, &bh);
1472 if (ret < 0) 1558 if (ret < 0)
1473 return ret; 1559 return ret;
@@ -1487,32 +1573,33 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1487 return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); 1573 return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
1488} 1574}
1489 1575
1490static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, 1576static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
1491 __u64 *keys, __u64 *ptrs, int nitems) 1577 __u64 *keys, __u64 *ptrs, int nitems)
1492{ 1578{
1493 struct buffer_head *bh; 1579 struct buffer_head *bh;
1494 struct nilfs_btree *btree;
1495 struct nilfs_btree_node *node, *root; 1580 struct nilfs_btree_node *node, *root;
1496 __le64 *dkeys; 1581 __le64 *dkeys;
1497 __le64 *dptrs; 1582 __le64 *dptrs;
1498 __u64 ptr; 1583 __u64 ptr;
1499 int nchildren, i, ret; 1584 int nchildren, ncmax, i, ret;
1500 1585
1501 btree = (struct nilfs_btree *)bmap;
1502 root = nilfs_btree_get_root(btree); 1586 root = nilfs_btree_get_root(btree);
1503 switch (nilfs_btree_height(btree)) { 1587 switch (nilfs_btree_height(btree)) {
1504 case 2: 1588 case 2:
1505 bh = NULL; 1589 bh = NULL;
1506 node = root; 1590 node = root;
1591 ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX;
1507 break; 1592 break;
1508 case 3: 1593 case 3:
1509 nchildren = nilfs_btree_node_get_nchildren(root); 1594 nchildren = nilfs_btree_node_get_nchildren(root);
1510 WARN_ON(nchildren > 1); 1595 WARN_ON(nchildren > 1);
1511 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1596 ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
1597 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1512 ret = nilfs_btree_get_block(btree, ptr, &bh); 1598 ret = nilfs_btree_get_block(btree, ptr, &bh);
1513 if (ret < 0) 1599 if (ret < 0)
1514 return ret; 1600 return ret;
1515 node = (struct nilfs_btree_node *)bh->b_data; 1601 node = (struct nilfs_btree_node *)bh->b_data;
1602 ncmax = nilfs_btree_nchildren_per_block(btree);
1516 break; 1603 break;
1517 default: 1604 default:
1518 node = NULL; 1605 node = NULL;
@@ -1523,10 +1610,10 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1523 if (nchildren < nitems) 1610 if (nchildren < nitems)
1524 nitems = nchildren; 1611 nitems = nchildren;
1525 dkeys = nilfs_btree_node_dkeys(node); 1612 dkeys = nilfs_btree_node_dkeys(node);
1526 dptrs = nilfs_btree_node_dptrs(node, btree); 1613 dptrs = nilfs_btree_node_dptrs(node, ncmax);
1527 for (i = 0; i < nitems; i++) { 1614 for (i = 0; i < nitems; i++) {
1528 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); 1615 keys[i] = le64_to_cpu(dkeys[i]);
1529 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); 1616 ptrs[i] = le64_to_cpu(dptrs[i]);
1530 } 1617 }
1531 1618
1532 if (bh != NULL) 1619 if (bh != NULL)
@@ -1536,14 +1623,13 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1536} 1623}
1537 1624
1538static int 1625static int
1539nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, 1626nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
1540 union nilfs_bmap_ptr_req *dreq, 1627 union nilfs_bmap_ptr_req *dreq,
1541 union nilfs_bmap_ptr_req *nreq, 1628 union nilfs_bmap_ptr_req *nreq,
1542 struct buffer_head **bhp, 1629 struct buffer_head **bhp,
1543 struct nilfs_bmap_stats *stats) 1630 struct nilfs_bmap_stats *stats)
1544{ 1631{
1545 struct buffer_head *bh; 1632 struct buffer_head *bh;
1546 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1547 struct inode *dat = NULL; 1633 struct inode *dat = NULL;
1548 int ret; 1634 int ret;
1549 1635
@@ -1551,12 +1637,12 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1551 1637
1552 /* for data */ 1638 /* for data */
1553 /* cannot find near ptr */ 1639 /* cannot find near ptr */
1554 if (NILFS_BMAP_USE_VBN(bmap)) { 1640 if (NILFS_BMAP_USE_VBN(btree)) {
1555 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); 1641 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
1556 dat = nilfs_bmap_get_dat(bmap); 1642 dat = nilfs_bmap_get_dat(btree);
1557 } 1643 }
1558 1644
1559 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); 1645 ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
1560 if (ret < 0) 1646 if (ret < 0)
1561 return ret; 1647 return ret;
1562 1648
@@ -1564,7 +1650,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1564 stats->bs_nblocks++; 1650 stats->bs_nblocks++;
1565 if (nreq != NULL) { 1651 if (nreq != NULL) {
1566 nreq->bpr_ptr = dreq->bpr_ptr + 1; 1652 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1567 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); 1653 ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat);
1568 if (ret < 0) 1654 if (ret < 0)
1569 goto err_out_dreq; 1655 goto err_out_dreq;
1570 1656
@@ -1581,16 +1667,16 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1581 1667
1582 /* error */ 1668 /* error */
1583 err_out_nreq: 1669 err_out_nreq:
1584 nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); 1670 nilfs_bmap_abort_alloc_ptr(btree, nreq, dat);
1585 err_out_dreq: 1671 err_out_dreq:
1586 nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); 1672 nilfs_bmap_abort_alloc_ptr(btree, dreq, dat);
1587 stats->bs_nblocks = 0; 1673 stats->bs_nblocks = 0;
1588 return ret; 1674 return ret;
1589 1675
1590} 1676}
1591 1677
1592static void 1678static void
1593nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, 1679nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
1594 __u64 key, __u64 ptr, 1680 __u64 key, __u64 ptr,
1595 const __u64 *keys, const __u64 *ptrs, 1681 const __u64 *keys, const __u64 *ptrs,
1596 int n, 1682 int n,
@@ -1598,57 +1684,59 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1598 union nilfs_bmap_ptr_req *nreq, 1684 union nilfs_bmap_ptr_req *nreq,
1599 struct buffer_head *bh) 1685 struct buffer_head *bh)
1600{ 1686{
1601 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1602 struct nilfs_btree_node *node; 1687 struct nilfs_btree_node *node;
1603 struct inode *dat; 1688 struct inode *dat;
1604 __u64 tmpptr; 1689 __u64 tmpptr;
1690 int ncblk;
1605 1691
1606 /* free resources */ 1692 /* free resources */
1607 if (bmap->b_ops->bop_clear != NULL) 1693 if (btree->b_ops->bop_clear != NULL)
1608 bmap->b_ops->bop_clear(bmap); 1694 btree->b_ops->bop_clear(btree);
1609 1695
1610 /* ptr must be a pointer to a buffer head. */ 1696 /* ptr must be a pointer to a buffer head. */
1611 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1697 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1612 1698
1613 /* convert and insert */ 1699 /* convert and insert */
1614 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; 1700 dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
1615 nilfs_btree_init(bmap); 1701 nilfs_btree_init(btree);
1616 if (nreq != NULL) { 1702 if (nreq != NULL) {
1617 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); 1703 nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
1618 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); 1704 nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
1619 1705
1620 /* create child node at level 1 */ 1706 /* create child node at level 1 */
1621 node = (struct nilfs_btree_node *)bh->b_data; 1707 node = (struct nilfs_btree_node *)bh->b_data;
1622 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); 1708 ncblk = nilfs_btree_nchildren_per_block(btree);
1623 nilfs_btree_node_insert(btree, node, 1709 nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
1624 key, dreq->bpr_ptr, n); 1710 nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
1625 if (!buffer_dirty(bh)) 1711 if (!buffer_dirty(bh))
1626 nilfs_btnode_mark_dirty(bh); 1712 nilfs_btnode_mark_dirty(bh);
1627 if (!nilfs_bmap_dirty(bmap)) 1713 if (!nilfs_bmap_dirty(btree))
1628 nilfs_bmap_set_dirty(bmap); 1714 nilfs_bmap_set_dirty(btree);
1629 1715
1630 brelse(bh); 1716 brelse(bh);
1631 1717
1632 /* create root node at level 2 */ 1718 /* create root node at level 2 */
1633 node = nilfs_btree_get_root(btree); 1719 node = nilfs_btree_get_root(btree);
1634 tmpptr = nreq->bpr_ptr; 1720 tmpptr = nreq->bpr_ptr;
1635 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1721 nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1,
1636 2, 1, &keys[0], &tmpptr); 1722 NILFS_BTREE_ROOT_NCHILDREN_MAX,
1723 &keys[0], &tmpptr);
1637 } else { 1724 } else {
1638 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); 1725 nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
1639 1726
1640 /* create root node at level 1 */ 1727 /* create root node at level 1 */
1641 node = nilfs_btree_get_root(btree); 1728 node = nilfs_btree_get_root(btree);
1642 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1729 nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n,
1643 1, n, keys, ptrs); 1730 NILFS_BTREE_ROOT_NCHILDREN_MAX,
1644 nilfs_btree_node_insert(btree, node, 1731 keys, ptrs);
1645 key, dreq->bpr_ptr, n); 1732 nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr,
1646 if (!nilfs_bmap_dirty(bmap)) 1733 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1647 nilfs_bmap_set_dirty(bmap); 1734 if (!nilfs_bmap_dirty(btree))
1735 nilfs_bmap_set_dirty(btree);
1648 } 1736 }
1649 1737
1650 if (NILFS_BMAP_USE_VBN(bmap)) 1738 if (NILFS_BMAP_USE_VBN(btree))
1651 nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr); 1739 nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr);
1652} 1740}
1653 1741
1654/** 1742/**
@@ -1660,7 +1748,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1660 * @ptrs: 1748 * @ptrs:
1661 * @n: 1749 * @n:
1662 */ 1750 */
1663int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, 1751int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
1664 __u64 key, __u64 ptr, 1752 __u64 key, __u64 ptr,
1665 const __u64 *keys, const __u64 *ptrs, int n) 1753 const __u64 *keys, const __u64 *ptrs, int n)
1666{ 1754{
@@ -1673,7 +1761,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1673 di = &dreq; 1761 di = &dreq;
1674 ni = NULL; 1762 ni = NULL;
1675 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( 1763 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1676 1 << bmap->b_inode->i_blkbits)) { 1764 1 << btree->b_inode->i_blkbits)) {
1677 di = &dreq; 1765 di = &dreq;
1678 ni = &nreq; 1766 ni = &nreq;
1679 } else { 1767 } else {
@@ -1682,17 +1770,17 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1682 BUG(); 1770 BUG();
1683 } 1771 }
1684 1772
1685 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, 1773 ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh,
1686 &stats); 1774 &stats);
1687 if (ret < 0) 1775 if (ret < 0)
1688 return ret; 1776 return ret;
1689 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, 1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
1690 di, ni, bh); 1778 di, ni, bh);
1691 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1779 nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
1692 return 0; 1780 return 0;
1693} 1781}
1694 1782
1695static int nilfs_btree_propagate_p(struct nilfs_btree *btree, 1783static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
1696 struct nilfs_btree_path *path, 1784 struct nilfs_btree_path *path,
1697 int level, 1785 int level,
1698 struct buffer_head *bh) 1786 struct buffer_head *bh)
@@ -1704,17 +1792,17 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1704 return 0; 1792 return 0;
1705} 1793}
1706 1794
1707static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, 1795static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
1708 struct nilfs_btree_path *path, 1796 struct nilfs_btree_path *path,
1709 int level, struct inode *dat) 1797 int level, struct inode *dat)
1710{ 1798{
1711 struct nilfs_btree_node *parent; 1799 struct nilfs_btree_node *parent;
1712 int ret; 1800 int ncmax, ret;
1713 1801
1714 parent = nilfs_btree_get_node(btree, path, level + 1); 1802 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1715 path[level].bp_oldreq.bpr_ptr = 1803 path[level].bp_oldreq.bpr_ptr =
1716 nilfs_btree_node_get_ptr(btree, parent, 1804 nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
1717 path[level + 1].bp_index); 1805 ncmax);
1718 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; 1806 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1719 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, 1807 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
1720 &path[level].bp_newreq.bpr_req); 1808 &path[level].bp_newreq.bpr_req);
@@ -1726,7 +1814,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1726 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; 1814 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1727 path[level].bp_ctxt.bh = path[level].bp_bh; 1815 path[level].bp_ctxt.bh = path[level].bp_bh;
1728 ret = nilfs_btnode_prepare_change_key( 1816 ret = nilfs_btnode_prepare_change_key(
1729 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1817 &NILFS_BMAP_I(btree)->i_btnode_cache,
1730 &path[level].bp_ctxt); 1818 &path[level].bp_ctxt);
1731 if (ret < 0) { 1819 if (ret < 0) {
1732 nilfs_dat_abort_update(dat, 1820 nilfs_dat_abort_update(dat,
@@ -1739,30 +1827,31 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1739 return 0; 1827 return 0;
1740} 1828}
1741 1829
1742static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, 1830static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
1743 struct nilfs_btree_path *path, 1831 struct nilfs_btree_path *path,
1744 int level, struct inode *dat) 1832 int level, struct inode *dat)
1745{ 1833{
1746 struct nilfs_btree_node *parent; 1834 struct nilfs_btree_node *parent;
1835 int ncmax;
1747 1836
1748 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, 1837 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
1749 &path[level].bp_newreq.bpr_req, 1838 &path[level].bp_newreq.bpr_req,
1750 btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); 1839 btree->b_ptr_type == NILFS_BMAP_PTR_VS);
1751 1840
1752 if (buffer_nilfs_node(path[level].bp_bh)) { 1841 if (buffer_nilfs_node(path[level].bp_bh)) {
1753 nilfs_btnode_commit_change_key( 1842 nilfs_btnode_commit_change_key(
1754 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1843 &NILFS_BMAP_I(btree)->i_btnode_cache,
1755 &path[level].bp_ctxt); 1844 &path[level].bp_ctxt);
1756 path[level].bp_bh = path[level].bp_ctxt.bh; 1845 path[level].bp_bh = path[level].bp_ctxt.bh;
1757 } 1846 }
1758 set_buffer_nilfs_volatile(path[level].bp_bh); 1847 set_buffer_nilfs_volatile(path[level].bp_bh);
1759 1848
1760 parent = nilfs_btree_get_node(btree, path, level + 1); 1849 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1761 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, 1850 nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index,
1762 path[level].bp_newreq.bpr_ptr); 1851 path[level].bp_newreq.bpr_ptr, ncmax);
1763} 1852}
1764 1853
1765static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, 1854static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
1766 struct nilfs_btree_path *path, 1855 struct nilfs_btree_path *path,
1767 int level, struct inode *dat) 1856 int level, struct inode *dat)
1768{ 1857{
@@ -1770,11 +1859,11 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1770 &path[level].bp_newreq.bpr_req); 1859 &path[level].bp_newreq.bpr_req);
1771 if (buffer_nilfs_node(path[level].bp_bh)) 1860 if (buffer_nilfs_node(path[level].bp_bh))
1772 nilfs_btnode_abort_change_key( 1861 nilfs_btnode_abort_change_key(
1773 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1862 &NILFS_BMAP_I(btree)->i_btnode_cache,
1774 &path[level].bp_ctxt); 1863 &path[level].bp_ctxt);
1775} 1864}
1776 1865
1777static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, 1866static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree,
1778 struct nilfs_btree_path *path, 1867 struct nilfs_btree_path *path,
1779 int minlevel, int *maxlevelp, 1868 int minlevel, int *maxlevelp,
1780 struct inode *dat) 1869 struct inode *dat)
@@ -1809,7 +1898,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1809 return ret; 1898 return ret;
1810} 1899}
1811 1900
1812static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, 1901static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree,
1813 struct nilfs_btree_path *path, 1902 struct nilfs_btree_path *path,
1814 int minlevel, int maxlevel, 1903 int minlevel, int maxlevel,
1815 struct buffer_head *bh, 1904 struct buffer_head *bh,
@@ -1824,14 +1913,15 @@ static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1824 nilfs_btree_commit_update_v(btree, path, level, dat); 1913 nilfs_btree_commit_update_v(btree, path, level, dat);
1825} 1914}
1826 1915
1827static int nilfs_btree_propagate_v(struct nilfs_btree *btree, 1916static int nilfs_btree_propagate_v(struct nilfs_bmap *btree,
1828 struct nilfs_btree_path *path, 1917 struct nilfs_btree_path *path,
1829 int level, struct buffer_head *bh) 1918 int level, struct buffer_head *bh)
1830{ 1919{
1831 int maxlevel = 0, ret; 1920 int maxlevel = 0, ret;
1832 struct nilfs_btree_node *parent; 1921 struct nilfs_btree_node *parent;
1833 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1922 struct inode *dat = nilfs_bmap_get_dat(btree);
1834 __u64 ptr; 1923 __u64 ptr;
1924 int ncmax;
1835 1925
1836 get_bh(bh); 1926 get_bh(bh);
1837 path[level].bp_bh = bh; 1927 path[level].bp_bh = bh;
@@ -1841,9 +1931,10 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1841 goto out; 1931 goto out;
1842 1932
1843 if (buffer_nilfs_volatile(path[level].bp_bh)) { 1933 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1844 parent = nilfs_btree_get_node(btree, path, level + 1); 1934 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1845 ptr = nilfs_btree_node_get_ptr(btree, parent, 1935 ptr = nilfs_btree_node_get_ptr(parent,
1846 path[level + 1].bp_index); 1936 path[level + 1].bp_index,
1937 ncmax);
1847 ret = nilfs_dat_mark_dirty(dat, ptr); 1938 ret = nilfs_dat_mark_dirty(dat, ptr);
1848 if (ret < 0) 1939 if (ret < 0)
1849 goto out; 1940 goto out;
@@ -1857,10 +1948,9 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1857 return ret; 1948 return ret;
1858} 1949}
1859 1950
1860static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, 1951static int nilfs_btree_propagate(struct nilfs_bmap *btree,
1861 struct buffer_head *bh) 1952 struct buffer_head *bh)
1862{ 1953{
1863 struct nilfs_btree *btree;
1864 struct nilfs_btree_path *path; 1954 struct nilfs_btree_path *path;
1865 struct nilfs_btree_node *node; 1955 struct nilfs_btree_node *node;
1866 __u64 key; 1956 __u64 key;
@@ -1868,7 +1958,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1868 1958
1869 WARN_ON(!buffer_dirty(bh)); 1959 WARN_ON(!buffer_dirty(bh));
1870 1960
1871 btree = (struct nilfs_btree *)bmap;
1872 path = nilfs_btree_alloc_path(); 1961 path = nilfs_btree_alloc_path();
1873 if (path == NULL) 1962 if (path == NULL)
1874 return -ENOMEM; 1963 return -ENOMEM;
@@ -1878,11 +1967,11 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1878 key = nilfs_btree_node_get_key(node, 0); 1967 key = nilfs_btree_node_get_key(node, 0);
1879 level = nilfs_btree_node_get_level(node); 1968 level = nilfs_btree_node_get_level(node);
1880 } else { 1969 } else {
1881 key = nilfs_bmap_data_get_key(bmap, bh); 1970 key = nilfs_bmap_data_get_key(btree, bh);
1882 level = NILFS_BTREE_LEVEL_DATA; 1971 level = NILFS_BTREE_LEVEL_DATA;
1883 } 1972 }
1884 1973
1885 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); 1974 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
1886 if (ret < 0) { 1975 if (ret < 0) {
1887 if (unlikely(ret == -ENOENT)) 1976 if (unlikely(ret == -ENOENT))
1888 printk(KERN_CRIT "%s: key = %llu, level == %d\n", 1977 printk(KERN_CRIT "%s: key = %llu, level == %d\n",
@@ -1890,7 +1979,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1890 goto out; 1979 goto out;
1891 } 1980 }
1892 1981
1893 ret = NILFS_BMAP_USE_VBN(bmap) ? 1982 ret = NILFS_BMAP_USE_VBN(btree) ?
1894 nilfs_btree_propagate_v(btree, path, level, bh) : 1983 nilfs_btree_propagate_v(btree, path, level, bh) :
1895 nilfs_btree_propagate_p(btree, path, level, bh); 1984 nilfs_btree_propagate_p(btree, path, level, bh);
1896 1985
@@ -1900,13 +1989,13 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1900 return ret; 1989 return ret;
1901} 1990}
1902 1991
1903static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, 1992static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree,
1904 struct buffer_head *bh) 1993 struct buffer_head *bh)
1905{ 1994{
1906 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); 1995 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr);
1907} 1996}
1908 1997
1909static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, 1998static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
1910 struct list_head *lists, 1999 struct list_head *lists,
1911 struct buffer_head *bh) 2000 struct buffer_head *bh)
1912{ 2001{
@@ -1920,6 +2009,18 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1920 node = (struct nilfs_btree_node *)bh->b_data; 2009 node = (struct nilfs_btree_node *)bh->b_data;
1921 key = nilfs_btree_node_get_key(node, 0); 2010 key = nilfs_btree_node_get_key(node, 0);
1922 level = nilfs_btree_node_get_level(node); 2011 level = nilfs_btree_node_get_level(node);
2012 if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
2013 level >= NILFS_BTREE_LEVEL_MAX) {
2014 dump_stack();
2015 printk(KERN_WARNING
2016 "%s: invalid btree level: %d (key=%llu, ino=%lu, "
2017 "blocknr=%llu)\n",
2018 __func__, level, (unsigned long long)key,
2019 NILFS_BMAP_I(btree)->vfs_inode.i_ino,
2020 (unsigned long long)bh->b_blocknr);
2021 return;
2022 }
2023
1923 list_for_each(head, &lists[level]) { 2024 list_for_each(head, &lists[level]) {
1924 cbh = list_entry(head, struct buffer_head, b_assoc_buffers); 2025 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1925 cnode = (struct nilfs_btree_node *)cbh->b_data; 2026 cnode = (struct nilfs_btree_node *)cbh->b_data;
@@ -1930,11 +2031,10 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1930 list_add_tail(&bh->b_assoc_buffers, head); 2031 list_add_tail(&bh->b_assoc_buffers, head);
1931} 2032}
1932 2033
1933static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, 2034static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
1934 struct list_head *listp) 2035 struct list_head *listp)
1935{ 2036{
1936 struct nilfs_btree *btree = (struct nilfs_btree *)bmap; 2037 struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
1937 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1938 struct list_head lists[NILFS_BTREE_LEVEL_MAX]; 2038 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1939 struct pagevec pvec; 2039 struct pagevec pvec;
1940 struct buffer_head *bh, *head; 2040 struct buffer_head *bh, *head;
@@ -1968,7 +2068,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
1968 list_splice_tail(&lists[level], listp); 2068 list_splice_tail(&lists[level], listp);
1969} 2069}
1970 2070
1971static int nilfs_btree_assign_p(struct nilfs_btree *btree, 2071static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
1972 struct nilfs_btree_path *path, 2072 struct nilfs_btree_path *path,
1973 int level, 2073 int level,
1974 struct buffer_head **bh, 2074 struct buffer_head **bh,
@@ -1978,38 +2078,38 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
1978 struct nilfs_btree_node *parent; 2078 struct nilfs_btree_node *parent;
1979 __u64 key; 2079 __u64 key;
1980 __u64 ptr; 2080 __u64 ptr;
1981 int ret; 2081 int ncmax, ret;
1982 2082
1983 parent = nilfs_btree_get_node(btree, path, level + 1); 2083 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1984 ptr = nilfs_btree_node_get_ptr(btree, parent, 2084 ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
1985 path[level + 1].bp_index); 2085 ncmax);
1986 if (buffer_nilfs_node(*bh)) { 2086 if (buffer_nilfs_node(*bh)) {
1987 path[level].bp_ctxt.oldkey = ptr; 2087 path[level].bp_ctxt.oldkey = ptr;
1988 path[level].bp_ctxt.newkey = blocknr; 2088 path[level].bp_ctxt.newkey = blocknr;
1989 path[level].bp_ctxt.bh = *bh; 2089 path[level].bp_ctxt.bh = *bh;
1990 ret = nilfs_btnode_prepare_change_key( 2090 ret = nilfs_btnode_prepare_change_key(
1991 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 2091 &NILFS_BMAP_I(btree)->i_btnode_cache,
1992 &path[level].bp_ctxt); 2092 &path[level].bp_ctxt);
1993 if (ret < 0) 2093 if (ret < 0)
1994 return ret; 2094 return ret;
1995 nilfs_btnode_commit_change_key( 2095 nilfs_btnode_commit_change_key(
1996 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 2096 &NILFS_BMAP_I(btree)->i_btnode_cache,
1997 &path[level].bp_ctxt); 2097 &path[level].bp_ctxt);
1998 *bh = path[level].bp_ctxt.bh; 2098 *bh = path[level].bp_ctxt.bh;
1999 } 2099 }
2000 2100
2001 nilfs_btree_node_set_ptr(btree, parent, 2101 nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr,
2002 path[level + 1].bp_index, blocknr); 2102 ncmax);
2003 2103
2004 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); 2104 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2005 /* on-disk format */ 2105 /* on-disk format */
2006 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2106 binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
2007 binfo->bi_dat.bi_level = level; 2107 binfo->bi_dat.bi_level = level;
2008 2108
2009 return 0; 2109 return 0;
2010} 2110}
2011 2111
2012static int nilfs_btree_assign_v(struct nilfs_btree *btree, 2112static int nilfs_btree_assign_v(struct nilfs_bmap *btree,
2013 struct nilfs_btree_path *path, 2113 struct nilfs_btree_path *path,
2014 int level, 2114 int level,
2015 struct buffer_head **bh, 2115 struct buffer_head **bh,
@@ -2017,15 +2117,15 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2017 union nilfs_binfo *binfo) 2117 union nilfs_binfo *binfo)
2018{ 2118{
2019 struct nilfs_btree_node *parent; 2119 struct nilfs_btree_node *parent;
2020 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 2120 struct inode *dat = nilfs_bmap_get_dat(btree);
2021 __u64 key; 2121 __u64 key;
2022 __u64 ptr; 2122 __u64 ptr;
2023 union nilfs_bmap_ptr_req req; 2123 union nilfs_bmap_ptr_req req;
2024 int ret; 2124 int ncmax, ret;
2025 2125
2026 parent = nilfs_btree_get_node(btree, path, level + 1); 2126 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
2027 ptr = nilfs_btree_node_get_ptr(btree, parent, 2127 ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
2028 path[level + 1].bp_index); 2128 ncmax);
2029 req.bpr_ptr = ptr; 2129 req.bpr_ptr = ptr;
2030 ret = nilfs_dat_prepare_start(dat, &req.bpr_req); 2130 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
2031 if (ret < 0) 2131 if (ret < 0)
@@ -2034,24 +2134,22 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2034 2134
2035 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); 2135 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2036 /* on-disk format */ 2136 /* on-disk format */
2037 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 2137 binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
2038 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2138 binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2039 2139
2040 return 0; 2140 return 0;
2041} 2141}
2042 2142
2043static int nilfs_btree_assign(struct nilfs_bmap *bmap, 2143static int nilfs_btree_assign(struct nilfs_bmap *btree,
2044 struct buffer_head **bh, 2144 struct buffer_head **bh,
2045 sector_t blocknr, 2145 sector_t blocknr,
2046 union nilfs_binfo *binfo) 2146 union nilfs_binfo *binfo)
2047{ 2147{
2048 struct nilfs_btree *btree;
2049 struct nilfs_btree_path *path; 2148 struct nilfs_btree_path *path;
2050 struct nilfs_btree_node *node; 2149 struct nilfs_btree_node *node;
2051 __u64 key; 2150 __u64 key;
2052 int level, ret; 2151 int level, ret;
2053 2152
2054 btree = (struct nilfs_btree *)bmap;
2055 path = nilfs_btree_alloc_path(); 2153 path = nilfs_btree_alloc_path();
2056 if (path == NULL) 2154 if (path == NULL)
2057 return -ENOMEM; 2155 return -ENOMEM;
@@ -2061,17 +2159,17 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2061 key = nilfs_btree_node_get_key(node, 0); 2159 key = nilfs_btree_node_get_key(node, 0);
2062 level = nilfs_btree_node_get_level(node); 2160 level = nilfs_btree_node_get_level(node);
2063 } else { 2161 } else {
2064 key = nilfs_bmap_data_get_key(bmap, *bh); 2162 key = nilfs_bmap_data_get_key(btree, *bh);
2065 level = NILFS_BTREE_LEVEL_DATA; 2163 level = NILFS_BTREE_LEVEL_DATA;
2066 } 2164 }
2067 2165
2068 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); 2166 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
2069 if (ret < 0) { 2167 if (ret < 0) {
2070 WARN_ON(ret == -ENOENT); 2168 WARN_ON(ret == -ENOENT);
2071 goto out; 2169 goto out;
2072 } 2170 }
2073 2171
2074 ret = NILFS_BMAP_USE_VBN(bmap) ? 2172 ret = NILFS_BMAP_USE_VBN(btree) ?
2075 nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : 2173 nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
2076 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2174 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2077 2175
@@ -2081,7 +2179,7 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2081 return ret; 2179 return ret;
2082} 2180}
2083 2181
2084static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, 2182static int nilfs_btree_assign_gc(struct nilfs_bmap *btree,
2085 struct buffer_head **bh, 2183 struct buffer_head **bh,
2086 sector_t blocknr, 2184 sector_t blocknr,
2087 union nilfs_binfo *binfo) 2185 union nilfs_binfo *binfo)
@@ -2090,7 +2188,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2090 __u64 key; 2188 __u64 key;
2091 int ret; 2189 int ret;
2092 2190
2093 ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, 2191 ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr,
2094 blocknr); 2192 blocknr);
2095 if (ret < 0) 2193 if (ret < 0)
2096 return ret; 2194 return ret;
@@ -2099,29 +2197,27 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2099 node = (struct nilfs_btree_node *)(*bh)->b_data; 2197 node = (struct nilfs_btree_node *)(*bh)->b_data;
2100 key = nilfs_btree_node_get_key(node, 0); 2198 key = nilfs_btree_node_get_key(node, 0);
2101 } else 2199 } else
2102 key = nilfs_bmap_data_get_key(bmap, *bh); 2200 key = nilfs_bmap_data_get_key(btree, *bh);
2103 2201
2104 /* on-disk format */ 2202 /* on-disk format */
2105 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); 2203 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2106 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2204 binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2107 2205
2108 return 0; 2206 return 0;
2109} 2207}
2110 2208
2111static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) 2209static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
2112{ 2210{
2113 struct buffer_head *bh; 2211 struct buffer_head *bh;
2114 struct nilfs_btree *btree;
2115 struct nilfs_btree_path *path; 2212 struct nilfs_btree_path *path;
2116 __u64 ptr; 2213 __u64 ptr;
2117 int ret; 2214 int ret;
2118 2215
2119 btree = (struct nilfs_btree *)bmap;
2120 path = nilfs_btree_alloc_path(); 2216 path = nilfs_btree_alloc_path();
2121 if (path == NULL) 2217 if (path == NULL)
2122 return -ENOMEM; 2218 return -ENOMEM;
2123 2219
2124 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2220 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0);
2125 if (ret < 0) { 2221 if (ret < 0) {
2126 WARN_ON(ret == -ENOENT); 2222 WARN_ON(ret == -ENOENT);
2127 goto out; 2223 goto out;
@@ -2135,8 +2231,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2135 if (!buffer_dirty(bh)) 2231 if (!buffer_dirty(bh))
2136 nilfs_btnode_mark_dirty(bh); 2232 nilfs_btnode_mark_dirty(bh);
2137 brelse(bh); 2233 brelse(bh);
2138 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 2234 if (!nilfs_bmap_dirty(btree))
2139 nilfs_bmap_set_dirty(&btree->bt_bmap); 2235 nilfs_bmap_set_dirty(btree);
2140 2236
2141 out: 2237 out:
2142 nilfs_btree_free_path(path); 2238 nilfs_btree_free_path(path);
@@ -2186,10 +2282,14 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2186int nilfs_btree_init(struct nilfs_bmap *bmap) 2282int nilfs_btree_init(struct nilfs_bmap *bmap)
2187{ 2283{
2188 bmap->b_ops = &nilfs_btree_ops; 2284 bmap->b_ops = &nilfs_btree_ops;
2285 bmap->b_nchildren_per_block =
2286 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2189 return 0; 2287 return 0;
2190} 2288}
2191 2289
2192void nilfs_btree_init_gc(struct nilfs_bmap *bmap) 2290void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2193{ 2291{
2194 bmap->b_ops = &nilfs_btree_ops_gc; 2292 bmap->b_ops = &nilfs_btree_ops_gc;
2293 bmap->b_nchildren_per_block =
2294 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2195} 2295}
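
The btree.c changes above stop inferring a node's child capacity from the container type and instead pass it explicitly: ncmax for whatever parent nilfs_btree_get_node() returns, ncblk for on-disk node blocks, and NILFS_BTREE_ROOT_NCHILDREN_MAX for the root embedded in the inode. The capacity is not cosmetic: inside a node the pointer array is laid out after the fixed-capacity key array, so an accessor cannot even locate a pointer without knowing how many children the node can hold. A minimal userspace sketch of that layout, with a made-up header and simplified names rather than the kernel structures:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for struct nilfs_btree_node (8 bytes, like the real one). */
struct node_hdr {
        uint8_t  flags;
        uint8_t  level;
        uint16_t nchildren;
        uint32_t pad;
};

/* Key array starts right after the header. */
static uint64_t *node_dkeys(struct node_hdr *node)
{
        return (uint64_t *)(node + 1);
}

/* Pointer array starts after ncmax keys; its offset depends on capacity. */
static uint64_t *node_dptrs(struct node_hdr *node, int ncmax)
{
        return node_dkeys(node) + ncmax;
}

static uint64_t node_get_ptr(struct node_hdr *node, int index, int ncmax)
{
        return node_dptrs(node, ncmax)[index];
}

int main(void)
{
        enum { ROOT_NCMAX = 3 };        /* root embedded in the inode is tiny */
        struct node_hdr *root =
                calloc(1, sizeof(*root) + 2 * ROOT_NCMAX * sizeof(uint64_t));

        node_dkeys(root)[0] = 10;
        node_dptrs(root, ROOT_NCMAX)[0] = 0xdead;

        /* Passing the wrong capacity here would land in the key array
           or past the end of the node. */
        printf("ptr[0] = %#llx\n",
               (unsigned long long)node_get_ptr(root, 0, ROOT_NCMAX));
        free(root);
        return 0;
}

The same reasoning explains the new b_nchildren_per_block field initialised in nilfs_btree_init() and nilfs_btree_init_gc() above: block-node capacity is a property of the bmap, computed once from the node size.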
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 43c8c5b541f..22c02e35b6e 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -31,14 +31,6 @@
31#include "bmap.h" 31#include "bmap.h"
32 32
33/** 33/**
34 * struct nilfs_btree - B-tree structure
35 * @bt_bmap: bmap base structure
36 */
37struct nilfs_btree {
38 struct nilfs_bmap bt_bmap;
39};
40
41/**
42 * struct nilfs_btree_path - A path on which B-tree operations are executed 34 * struct nilfs_btree_path - A path on which B-tree operations are executed
43 * @bp_bh: buffer head of node block 35 * @bp_bh: buffer head of node block
44 * @bp_sib_bh: buffer head of sibling node block 36 * @bp_sib_bh: buffer head of sibling node block
@@ -54,7 +46,7 @@ struct nilfs_btree_path {
54 union nilfs_bmap_ptr_req bp_oldreq; 46 union nilfs_bmap_ptr_req bp_oldreq;
55 union nilfs_bmap_ptr_req bp_newreq; 47 union nilfs_bmap_ptr_req bp_newreq;
56 struct nilfs_btnode_chkey_ctxt bp_ctxt; 48 struct nilfs_btnode_chkey_ctxt bp_ctxt;
57 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, 49 void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *,
58 int, __u64 *, __u64 *); 50 int, __u64 *, __u64 *);
59}; 51};
60 52
@@ -80,4 +72,6 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
80 const __u64 *, const __u64 *, int); 72 const __u64 *, const __u64 *, int);
81void nilfs_btree_init_gc(struct nilfs_bmap *); 73void nilfs_btree_init_gc(struct nilfs_bmap *);
82 74
75int nilfs_btree_broken_node_block(struct buffer_head *bh);
76
83#endif /* _NILFS_BTREE_H */ 77#endif /* _NILFS_BTREE_H */
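
With the wrapper gone from btree.h, the (struct nilfs_btree *)bmap casts scattered through the old btree.c lose their reason to exist. They were only ever legal because bt_bmap was the first (and sole) member, so base and wrapper shared an address; once the wrapper carries no state of its own, passing struct nilfs_bmap * directly is the simpler contract. A sketch of the retired pattern, with illustrative types:

#include <assert.h>
#include <stddef.h>

struct bmap { int b_state; };

struct btree { struct bmap bt_bmap; };  /* the removed stateless wrapper */

int main(void)
{
        struct btree t;
        struct bmap *base = &t.bt_bmap;

        /* The old downcast worked only because bt_bmap sat at offset 0. */
        assert(offsetof(struct btree, bt_bmap) == 0);
        assert((void *)base == (void *)&t);
        return 0;
}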
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 18737818db6..5ff15a8a102 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -863,26 +863,19 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
863 */ 863 */
864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) 864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
865{ 865{
866 struct the_nilfs *nilfs;
867 int ret; 866 int ret;
868 867
869 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
870
871 switch (mode) { 868 switch (mode) {
872 case NILFS_CHECKPOINT: 869 case NILFS_CHECKPOINT:
873 /* 870 if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno))
874 * Check for protecting existing snapshot mounts: 871 /*
875 * ns_mount_mutex is used to make this operation atomic and 872 * Current implementation does not have to protect
876 * exclusive with a new mount job. Though it doesn't cover 873 * plain read-only mounts since they are exclusive
877 * umount, it's enough for the purpose. 874 * with a read/write mount and are protected from the
878 */ 875 * cleaner.
879 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { 876 */
880 /* Current implementation does not have to protect
881 plain read-only mounts since they are exclusive
882 with a read/write mount and are protected from the
883 cleaner. */
884 ret = -EBUSY; 877 ret = -EBUSY;
885 } else 878 else
886 ret = nilfs_cpfile_clear_snapshot(cpfile, cno); 879 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
887 return ret; 880 return ret;
888 case NILFS_SNAPSHOT: 881 case NILFS_SNAPSHOT:
@@ -933,27 +926,40 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
933} 926}
934 927
935/** 928/**
936 * nilfs_cpfile_read - read cpfile inode 929 * nilfs_cpfile_read - read or get cpfile inode
937 * @cpfile: cpfile inode 930 * @sb: super block instance
938 * @raw_inode: on-disk cpfile inode
939 */
940int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
941{
942 return nilfs_read_inode_common(cpfile, raw_inode);
943}
944
945/**
946 * nilfs_cpfile_new - create cpfile
947 * @nilfs: nilfs object
948 * @cpsize: size of a checkpoint entry 931 * @cpsize: size of a checkpoint entry
932 * @raw_inode: on-disk cpfile inode
933 * @inodep: buffer to store the inode
949 */ 934 */
950struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize) 935int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
936 struct nilfs_inode *raw_inode, struct inode **inodep)
951{ 937{
952 struct inode *cpfile; 938 struct inode *cpfile;
939 int err;
940
941 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
942 if (unlikely(!cpfile))
943 return -ENOMEM;
944 if (!(cpfile->i_state & I_NEW))
945 goto out;
946
947 err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
948 if (err)
949 goto failed;
953 950
954 cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0); 951 nilfs_mdt_set_entry_size(cpfile, cpsize,
955 if (cpfile) 952 sizeof(struct nilfs_cpfile_header));
956 nilfs_mdt_set_entry_size(cpfile, cpsize, 953
957 sizeof(struct nilfs_cpfile_header)); 954 err = nilfs_read_inode_common(cpfile, raw_inode);
958 return cpfile; 955 if (err)
956 goto failed;
957
958 unlock_new_inode(cpfile);
959 out:
960 *inodep = cpfile;
961 return 0;
962 failed:
963 iget_failed(cpfile);
964 return err;
959} 965}
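
The cpfile.c rewrite folds the old new/read pair into one read-or-get constructor built on the iget_locked idiom: look the inode up in the inode cache, perform the one-time metadata-file setup only when I_NEW marks it freshly allocated, then publish it with unlock_new_inode() or back out with iget_failed(). A userspace model of that idiom follows; the cache and function names are invented for illustration:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum { CACHE_SIZE = 8 };

struct fake_inode {
        unsigned long ino;
        bool cached;            /* slot holds a live inode */
        bool initialised;       /* one-time setup done (i.e. not I_NEW) */
};

static struct fake_inode cache[CACHE_SIZE];     /* toy inode cache */

/* Return a cached inode for ino, or claim a fresh slot (*is_new = true). */
static struct fake_inode *fake_iget_locked(unsigned long ino, bool *is_new)
{
        for (int i = 0; i < CACHE_SIZE; i++)
                if (cache[i].cached && cache[i].ino == ino) {
                        *is_new = false;
                        return &cache[i];
                }
        for (int i = 0; i < CACHE_SIZE; i++)
                if (!cache[i].cached) {
                        cache[i].ino = ino;
                        cache[i].cached = true;
                        *is_new = true;
                        return &cache[i];
                }
        return NULL;
}

static int metadata_file_read(unsigned long ino, struct fake_inode **inodep)
{
        bool is_new;
        struct fake_inode *inode = fake_iget_locked(ino, &is_new);

        if (!inode)
                return -ENOMEM;
        if (!is_new)
                goto out;               /* initialised by an earlier read */

        inode->initialised = true;      /* entry sizes, caches, shadow map... */
out:
        *inodep = inode;
        return 0;
}

int main(void)
{
        struct fake_inode *a, *b;

        metadata_file_read(3, &a);
        metadata_file_read(3, &b);      /* second call hits the cache */
        printf("same object: %s\n", a == b ? "yes" : "no");
        return 0;
}

The dat.c diff below applies the same shape, with the extra blockgroup, lockdep, and shadow-map setup confined to the I_NEW branch.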
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index bc0809e0ab4..a242b9a314f 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,7 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, 40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
41 size_t); 41 size_t);
42 42
43int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode); 43int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
44struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize); 44 struct nilfs_inode *raw_inode, struct inode **inodep);
45 45
46#endif /* _NILFS_CPFILE_H */ 46#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 01314675568..49c844dab33 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -36,6 +36,7 @@
36struct nilfs_dat_info { 36struct nilfs_dat_info {
37 struct nilfs_mdt_info mi; 37 struct nilfs_mdt_info mi;
38 struct nilfs_palloc_cache palloc_cache; 38 struct nilfs_palloc_cache palloc_cache;
39 struct nilfs_shadow_map shadow;
39}; 40};
40 41
41static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) 42static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
@@ -102,7 +103,8 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
102 nilfs_palloc_abort_alloc_entry(dat, req); 103 nilfs_palloc_abort_alloc_entry(dat, req);
103} 104}
104 105
105void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) 106static void nilfs_dat_commit_free(struct inode *dat,
107 struct nilfs_palloc_req *req)
106{ 108{
107 struct nilfs_dat_entry *entry; 109 struct nilfs_dat_entry *entry;
108 void *kaddr; 110 void *kaddr;
@@ -327,6 +329,23 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
327 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); 329 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
328 if (ret < 0) 330 if (ret < 0)
329 return ret; 331 return ret;
332
333 /*
334 * The given disk block number (blocknr) is not yet written to
335 * the device at this point.
336 *
337 * To prevent nilfs_dat_translate() from returning the
 338 * uncommitted block number, this makes a copy of the entry
339 * buffer and redirects nilfs_dat_translate() to the copy.
340 */
341 if (!buffer_nilfs_redirected(entry_bh)) {
342 ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
343 if (ret) {
344 brelse(entry_bh);
345 return ret;
346 }
347 }
348
330 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 349 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
331 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 350 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
332 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { 351 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
@@ -371,7 +390,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
371 */ 390 */
372int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) 391int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
373{ 392{
374 struct buffer_head *entry_bh; 393 struct buffer_head *entry_bh, *bh;
375 struct nilfs_dat_entry *entry; 394 struct nilfs_dat_entry *entry;
376 sector_t blocknr; 395 sector_t blocknr;
377 void *kaddr; 396 void *kaddr;
@@ -381,6 +400,15 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
381 if (ret < 0) 400 if (ret < 0)
382 return ret; 401 return ret;
383 402
403 if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
404 bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
405 if (bh) {
406 WARN_ON(!buffer_uptodate(bh));
407 brelse(entry_bh);
408 entry_bh = bh;
409 }
410 }
411
384 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 412 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
385 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 413 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
386 blocknr = le64_to_cpu(entry->de_blocknr); 414 blocknr = le64_to_cpu(entry->de_blocknr);
@@ -436,38 +464,48 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
436} 464}
437 465
438/** 466/**
439 * nilfs_dat_read - read dat inode 467 * nilfs_dat_read - read or get dat inode
440 * @dat: dat inode 468 * @sb: super block instance
441 * @raw_inode: on-disk dat inode
442 */
443int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
444{
445 return nilfs_read_inode_common(dat, raw_inode);
446}
447
448/**
449 * nilfs_dat_new - create dat file
450 * @nilfs: nilfs object
451 * @entry_size: size of a dat entry 469 * @entry_size: size of a dat entry
470 * @raw_inode: on-disk dat inode
471 * @inodep: buffer to store the inode
452 */ 472 */
453struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size) 473int nilfs_dat_read(struct super_block *sb, size_t entry_size,
474 struct nilfs_inode *raw_inode, struct inode **inodep)
454{ 475{
455 static struct lock_class_key dat_lock_key; 476 static struct lock_class_key dat_lock_key;
456 struct inode *dat; 477 struct inode *dat;
457 struct nilfs_dat_info *di; 478 struct nilfs_dat_info *di;
458 int err; 479 int err;
459 480
460 dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di)); 481 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
461 if (dat) { 482 if (unlikely(!dat))
462 err = nilfs_palloc_init_blockgroup(dat, entry_size); 483 return -ENOMEM;
463 if (unlikely(err)) { 484 if (!(dat->i_state & I_NEW))
464 nilfs_mdt_destroy(dat); 485 goto out;
465 return NULL;
466 }
467 486
468 di = NILFS_DAT_I(dat); 487 err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
469 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); 488 if (err)
470 nilfs_palloc_setup_cache(dat, &di->palloc_cache); 489 goto failed;
471 } 490
472 return dat; 491 err = nilfs_palloc_init_blockgroup(dat, entry_size);
492 if (err)
493 goto failed;
494
495 di = NILFS_DAT_I(dat);
496 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
497 nilfs_palloc_setup_cache(dat, &di->palloc_cache);
498 nilfs_mdt_setup_shadow_map(dat, &di->shadow);
499
500 err = nilfs_read_inode_common(dat, raw_inode);
501 if (err)
502 goto failed;
503
504 unlock_new_inode(dat);
505 out:
506 *inodep = dat;
507 return 0;
508 failed:
509 iget_failed(dat);
510 return err;
473} 511}
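
The nilfs_dat_move()/nilfs_dat_translate() hunks above close a window where a mover rewrites a DAT entry with a block number that has not yet been written to the device: the entry buffer is frozen first, and non-GC readers are redirected to the frozen copy until the move commits. A minimal model of that redirect, with invented names standing in for the mdt freeze helpers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dat_entry {
        uint64_t blocknr;       /* committed, on-disk mapping */
};

struct dat_slot {
        struct dat_entry live;          /* being rewritten by the mover */
        struct dat_entry frozen;        /* snapshot readers fall back to */
        bool redirected;
};

static void dat_move(struct dat_slot *s, uint64_t new_blocknr)
{
        if (!s->redirected) {
                s->frozen = s->live;    /* freeze the committed entry once */
                s->redirected = true;
        }
        s->live.blocknr = new_blocknr;  /* not yet written to the device */
}

static uint64_t dat_translate(const struct dat_slot *s)
{
        /* readers must not observe the uncommitted block number */
        const struct dat_entry *e = s->redirected ? &s->frozen : &s->live;

        return e->blocknr;
}

static void dat_commit(struct dat_slot *s)
{
        s->redirected = false;          /* new mapping is safely on disk */
}

int main(void)
{
        struct dat_slot s = { .live = { .blocknr = 100 } };

        dat_move(&s, 200);
        printf("during move:  %llu\n", (unsigned long long)dat_translate(&s));
        dat_commit(&s);
        printf("after commit: %llu\n", (unsigned long long)dat_translate(&s));
        return 0;
}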
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d31c3aab0ef..cbd8e973250 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,7 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
53int nilfs_dat_move(struct inode *, __u64, sector_t); 53int nilfs_dat_move(struct inode *, __u64, sector_t);
54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); 54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
55 55
56int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode); 56int nilfs_dat_read(struct super_block *sb, size_t entry_size,
57struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size); 57 struct nilfs_inode *raw_inode, struct inode **inodep);
58 58
59#endif /* _NILFS_DAT_H */ 59#endif /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 85c89dfc71f..cb003c8ee1f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -80,23 +80,10 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
80 return last_byte; 80 return last_byte;
81} 81}
82 82
83static int nilfs_prepare_chunk_uninterruptible(struct page *page, 83static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
84 struct address_space *mapping,
85 unsigned from, unsigned to)
86{ 84{
87 loff_t pos = page_offset(page) + from; 85 loff_t pos = page_offset(page) + from;
88 return block_write_begin(NULL, mapping, pos, to - from, 86 return __block_write_begin(page, pos, to - from, nilfs_get_block);
89 AOP_FLAG_UNINTERRUPTIBLE, &page,
90 NULL, nilfs_get_block);
91}
92
93static int nilfs_prepare_chunk(struct page *page,
94 struct address_space *mapping,
95 unsigned from, unsigned to)
96{
97 loff_t pos = page_offset(page) + from;
98 return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
99 NULL, nilfs_get_block);
100} 87}
101 88
102static void nilfs_commit_chunk(struct page *page, 89static void nilfs_commit_chunk(struct page *page,
@@ -141,7 +128,7 @@ static void nilfs_check_page(struct page *page)
141 } 128 }
142 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { 129 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
143 p = (struct nilfs_dir_entry *)(kaddr + offs); 130 p = (struct nilfs_dir_entry *)(kaddr + offs);
144 rec_len = le16_to_cpu(p->rec_len); 131 rec_len = nilfs_rec_len_from_disk(p->rec_len);
145 132
146 if (rec_len < NILFS_DIR_REC_LEN(1)) 133 if (rec_len < NILFS_DIR_REC_LEN(1))
147 goto Eshort; 134 goto Eshort;
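
The dir.c hunk above (and those that follow) swap raw le16_to_cpu()/cpu_to_le16() on directory record lengths for nilfs_rec_len_from_disk()/nilfs_rec_len_to_disk(). Assuming these follow the ext2 convention, the point is that rec_len is 16 bits on disk, so a record spanning a full 64 KiB directory chunk cannot be stored literally and a reserved marker encodes 65536. A sketch of such a pair, with the endianness conversion omitted:

#include <assert.h>
#include <stdint.h>

#define MAX_REC_LEN ((1 << 16) - 1)     /* reserved on-disk marker, 0xffff */

static unsigned int rec_len_from_disk(uint16_t dlen)
{
        unsigned int len = dlen;        /* le16_to_cpu() elided in this model */

        if (len == MAX_REC_LEN)
                return 1 << 16;         /* record spans the whole chunk */
        return len;
}

static uint16_t rec_len_to_disk(unsigned int len)
{
        assert(len <= (1 << 16));
        if (len == (1 << 16))
                return MAX_REC_LEN;
        return (uint16_t)len;
}

int main(void)
{
        assert(rec_len_from_disk(rec_len_to_disk(1 << 16)) == (1 << 16));
        assert(rec_len_from_disk(rec_len_to_disk(4096)) == 4096);
        return 0;
}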
@@ -199,13 +186,10 @@ fail:
199static struct page *nilfs_get_page(struct inode *dir, unsigned long n) 186static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
200{ 187{
201 struct address_space *mapping = dir->i_mapping; 188 struct address_space *mapping = dir->i_mapping;
202 struct page *page = read_cache_page(mapping, n, 189 struct page *page = read_mapping_page(mapping, n, NULL);
203 (filler_t *)mapping->a_ops->readpage, NULL); 190
204 if (!IS_ERR(page)) { 191 if (!IS_ERR(page)) {
205 wait_on_page_locked(page);
206 kmap(page); 192 kmap(page);
207 if (!PageUptodate(page))
208 goto fail;
209 if (!PageChecked(page)) 193 if (!PageChecked(page))
210 nilfs_check_page(page); 194 nilfs_check_page(page);
211 if (PageError(page)) 195 if (PageError(page))
@@ -238,7 +222,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
238 */ 222 */
239static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) 223static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
240{ 224{
241 return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); 225 return (struct nilfs_dir_entry *)((char *)p +
226 nilfs_rec_len_from_disk(p->rec_len));
242} 227}
243 228
244static unsigned char 229static unsigned char
@@ -329,7 +314,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
329 goto success; 314 goto success;
330 } 315 }
331 } 316 }
332 filp->f_pos += le16_to_cpu(de->rec_len); 317 filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
333 } 318 }
334 nilfs_put_page(page); 319 nilfs_put_page(page);
335 } 320 }
@@ -444,12 +429,12 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
444 struct page *page, struct inode *inode) 429 struct page *page, struct inode *inode)
445{ 430{
446 unsigned from = (char *) de - (char *) page_address(page); 431 unsigned from = (char *) de - (char *) page_address(page);
447 unsigned to = from + le16_to_cpu(de->rec_len); 432 unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
448 struct address_space *mapping = page->mapping; 433 struct address_space *mapping = page->mapping;
449 int err; 434 int err;
450 435
451 lock_page(page); 436 lock_page(page);
452 err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to); 437 err = nilfs_prepare_chunk(page, from, to);
453 BUG_ON(err); 438 BUG_ON(err);
454 de->inode = cpu_to_le64(inode->i_ino); 439 de->inode = cpu_to_le64(inode->i_ino);
455 nilfs_set_de_type(de, inode); 440 nilfs_set_de_type(de, inode);
@@ -500,7 +485,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
500 /* We hit i_size */ 485 /* We hit i_size */
501 name_len = 0; 486 name_len = 0;
502 rec_len = chunk_size; 487 rec_len = chunk_size;
503 de->rec_len = cpu_to_le16(chunk_size); 488 de->rec_len = nilfs_rec_len_to_disk(chunk_size);
504 de->inode = 0; 489 de->inode = 0;
505 goto got_it; 490 goto got_it;
506 } 491 }
@@ -514,7 +499,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
514 if (nilfs_match(namelen, name, de)) 499 if (nilfs_match(namelen, name, de))
515 goto out_unlock; 500 goto out_unlock;
516 name_len = NILFS_DIR_REC_LEN(de->name_len); 501 name_len = NILFS_DIR_REC_LEN(de->name_len);
517 rec_len = le16_to_cpu(de->rec_len); 502 rec_len = nilfs_rec_len_from_disk(de->rec_len);
518 if (!de->inode && rec_len >= reclen) 503 if (!de->inode && rec_len >= reclen)
519 goto got_it; 504 goto got_it;
520 if (rec_len >= name_len + reclen) 505 if (rec_len >= name_len + reclen)
@@ -530,15 +515,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
530got_it: 515got_it:
531 from = (char *)de - (char *)page_address(page); 516 from = (char *)de - (char *)page_address(page);
532 to = from + rec_len; 517 to = from + rec_len;
533 err = nilfs_prepare_chunk(page, page->mapping, from, to); 518 err = nilfs_prepare_chunk(page, from, to);
534 if (err) 519 if (err)
535 goto out_unlock; 520 goto out_unlock;
536 if (de->inode) { 521 if (de->inode) {
537 struct nilfs_dir_entry *de1; 522 struct nilfs_dir_entry *de1;
538 523
539 de1 = (struct nilfs_dir_entry *)((char *)de + name_len); 524 de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
540 de1->rec_len = cpu_to_le16(rec_len - name_len); 525 de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len);
541 de->rec_len = cpu_to_le16(name_len); 526 de->rec_len = nilfs_rec_len_to_disk(name_len);
542 de = de1; 527 de = de1;
543 } 528 }
544 de->name_len = namelen; 529 de->name_len = namelen;
@@ -569,7 +554,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
569 struct inode *inode = mapping->host; 554 struct inode *inode = mapping->host;
570 char *kaddr = page_address(page); 555 char *kaddr = page_address(page);
571 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); 556 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
572 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); 557 unsigned to = ((char *)dir - kaddr) +
558 nilfs_rec_len_from_disk(dir->rec_len);
573 struct nilfs_dir_entry *pde = NULL; 559 struct nilfs_dir_entry *pde = NULL;
574 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); 560 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
575 int err; 561 int err;
@@ -587,10 +573,10 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
587 if (pde) 573 if (pde)
588 from = (char *)pde - (char *)page_address(page); 574 from = (char *)pde - (char *)page_address(page);
589 lock_page(page); 575 lock_page(page);
590 err = nilfs_prepare_chunk(page, mapping, from, to); 576 err = nilfs_prepare_chunk(page, from, to);
591 BUG_ON(err); 577 BUG_ON(err);
592 if (pde) 578 if (pde)
593 pde->rec_len = cpu_to_le16(to - from); 579 pde->rec_len = nilfs_rec_len_to_disk(to - from);
594 dir->inode = 0; 580 dir->inode = 0;
595 nilfs_commit_chunk(page, mapping, from, to); 581 nilfs_commit_chunk(page, mapping, from, to);
596 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 582 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -615,7 +601,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
615 if (!page) 601 if (!page)
616 return -ENOMEM; 602 return -ENOMEM;
617 603
618 err = nilfs_prepare_chunk(page, mapping, 0, chunk_size); 604 err = nilfs_prepare_chunk(page, 0, chunk_size);
619 if (unlikely(err)) { 605 if (unlikely(err)) {
620 unlock_page(page); 606 unlock_page(page);
621 goto fail; 607 goto fail;
@@ -624,14 +610,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
624 memset(kaddr, 0, chunk_size); 610 memset(kaddr, 0, chunk_size);
625 de = (struct nilfs_dir_entry *)kaddr; 611 de = (struct nilfs_dir_entry *)kaddr;
626 de->name_len = 1; 612 de->name_len = 1;
627 de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1)); 613 de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1));
628 memcpy(de->name, ".\0\0", 4); 614 memcpy(de->name, ".\0\0", 4);
629 de->inode = cpu_to_le64(inode->i_ino); 615 de->inode = cpu_to_le64(inode->i_ino);
630 nilfs_set_de_type(de, inode); 616 nilfs_set_de_type(de, inode);
631 617
632 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); 618 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
633 de->name_len = 2; 619 de->name_len = 2;
634 de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1)); 620 de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1));
635 de->inode = cpu_to_le64(parent->i_ino); 621 de->inode = cpu_to_le64(parent->i_ino);
636 memcpy(de->name, "..\0", 4); 622 memcpy(de->name, "..\0", 4);
637 nilfs_set_de_type(de, inode); 623 nilfs_set_de_type(de, inode);
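Editor's note: the dir.c hunks above route every rec_len access through nilfs_rec_len_from_disk()/nilfs_rec_len_to_disk() instead of bare le16_to_cpu()/cpu_to_le16(). A sketch of what such helpers look like, modeled on ext2's equivalents of the same era (the 64KB-page special case and the NILFS_MAX_REC_LEN sentinel are the point of the indirection):

	static inline unsigned nilfs_rec_len_from_disk(__le16 dlen)
	{
		unsigned len = le16_to_cpu(dlen);

	#if !defined(__KERNEL__) || (PAGE_CACHE_SIZE >= 65536)
		/* A 65536-byte record cannot be stored in 16 bits, so the
		 * on-disk value NILFS_MAX_REC_LEN (65535) stands for 1 << 16. */
		if (len == NILFS_MAX_REC_LEN)
			return 1 << 16;
	#endif
		return len;
	}

	static inline __le16 nilfs_rec_len_to_disk(unsigned len)
	{
	#if !defined(__KERNEL__) || (PAGE_CACHE_SIZE >= 65536)
		if (len == (1 << 16))
			return cpu_to_le16(NILFS_MAX_REC_LEN);
		else if (len > (1 << 16))
			BUG();
	#endif
		return cpu_to_le16(len);
	}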
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 236753df5cd..324d80c5751 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -27,47 +27,43 @@
27#include "alloc.h" 27#include "alloc.h"
28#include "dat.h" 28#include "dat.h"
29 29
30static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) 30static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct)
31{ 31{
32 return (__le64 *) 32 return (__le64 *)
33 ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1); 33 ((struct nilfs_direct_node *)direct->b_u.u_data + 1);
34} 34}
35 35
36static inline __u64 36static inline __u64
37nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key) 37nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key)
38{ 38{
39 return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key)); 39 return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key));
40} 40}
41 41
42static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct, 42static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct,
43 __u64 key, __u64 ptr) 43 __u64 key, __u64 ptr)
44{ 44{
45 *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr); 45 *(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr);
46} 46}
47 47
48static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, 48static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
49 __u64 key, int level, __u64 *ptrp) 49 __u64 key, int level, __u64 *ptrp)
50{ 50{
51 struct nilfs_direct *direct;
52 __u64 ptr; 51 __u64 ptr;
53 52
54 direct = (struct nilfs_direct *)bmap; /* XXX: use macro for level 1 */
55 if (key > NILFS_DIRECT_KEY_MAX || level != 1) 53 if (key > NILFS_DIRECT_KEY_MAX || level != 1)
56 return -ENOENT; 54 return -ENOENT;
57 ptr = nilfs_direct_get_ptr(direct, key); 55 ptr = nilfs_direct_get_ptr(direct, key);
58 if (ptr == NILFS_BMAP_INVALID_PTR) 56 if (ptr == NILFS_BMAP_INVALID_PTR)
59 return -ENOENT; 57 return -ENOENT;
60 58
61 if (ptrp != NULL) 59 *ptrp = ptr;
62 *ptrp = ptr;
63 return 0; 60 return 0;
64} 61}
65 62
66static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, 63static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
67 __u64 key, __u64 *ptrp, 64 __u64 key, __u64 *ptrp,
68 unsigned maxblocks) 65 unsigned maxblocks)
69{ 66{
70 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
71 struct inode *dat = NULL; 67 struct inode *dat = NULL;
72 __u64 ptr, ptr2; 68 __u64 ptr, ptr2;
73 sector_t blocknr; 69 sector_t blocknr;
@@ -79,8 +75,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
79 if (ptr == NILFS_BMAP_INVALID_PTR) 75 if (ptr == NILFS_BMAP_INVALID_PTR)
80 return -ENOENT; 76 return -ENOENT;
81 77
82 if (NILFS_BMAP_USE_VBN(bmap)) { 78 if (NILFS_BMAP_USE_VBN(direct)) {
83 dat = nilfs_bmap_get_dat(bmap); 79 dat = nilfs_bmap_get_dat(direct);
84 ret = nilfs_dat_translate(dat, ptr, &blocknr); 80 ret = nilfs_dat_translate(dat, ptr, &blocknr);
85 if (ret < 0) 81 if (ret < 0)
86 return ret; 82 return ret;
@@ -106,29 +102,21 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
106} 102}
107 103
108static __u64 104static __u64
109nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key) 105nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
110{ 106{
111 __u64 ptr; 107 __u64 ptr;
112 108
113 ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key); 109 ptr = nilfs_bmap_find_target_seq(direct, key);
114 if (ptr != NILFS_BMAP_INVALID_PTR) 110 if (ptr != NILFS_BMAP_INVALID_PTR)
115 /* sequential access */ 111 /* sequential access */
116 return ptr; 112 return ptr;
117 else 113 else
118 /* block group */ 114 /* block group */
119 return nilfs_bmap_find_target_in_group(&direct->d_bmap); 115 return nilfs_bmap_find_target_in_group(direct);
120}
121
122static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
123 __u64 key, __u64 ptr)
124{
125 direct->d_bmap.b_last_allocated_key = key;
126 direct->d_bmap.b_last_allocated_ptr = ptr;
127} 116}
128 117
129static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 118static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
130{ 119{
131 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
132 union nilfs_bmap_ptr_req req; 120 union nilfs_bmap_ptr_req req;
133 struct inode *dat = NULL; 121 struct inode *dat = NULL;
134 struct buffer_head *bh; 122 struct buffer_head *bh;
@@ -136,11 +124,11 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
136 124
137 if (key > NILFS_DIRECT_KEY_MAX) 125 if (key > NILFS_DIRECT_KEY_MAX)
138 return -ENOENT; 126 return -ENOENT;
139 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) 127 if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR)
140 return -EEXIST; 128 return -EEXIST;
141 129
142 if (NILFS_BMAP_USE_VBN(bmap)) { 130 if (NILFS_BMAP_USE_VBN(bmap)) {
143 req.bpr_ptr = nilfs_direct_find_target_v(direct, key); 131 req.bpr_ptr = nilfs_direct_find_target_v(bmap, key);
144 dat = nilfs_bmap_get_dat(bmap); 132 dat = nilfs_bmap_get_dat(bmap);
145 } 133 }
146 ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); 134 ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
@@ -150,13 +138,13 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
150 set_buffer_nilfs_volatile(bh); 138 set_buffer_nilfs_volatile(bh);
151 139
152 nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); 140 nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
153 nilfs_direct_set_ptr(direct, key, req.bpr_ptr); 141 nilfs_direct_set_ptr(bmap, key, req.bpr_ptr);
154 142
155 if (!nilfs_bmap_dirty(bmap)) 143 if (!nilfs_bmap_dirty(bmap))
156 nilfs_bmap_set_dirty(bmap); 144 nilfs_bmap_set_dirty(bmap);
157 145
158 if (NILFS_BMAP_USE_VBN(bmap)) 146 if (NILFS_BMAP_USE_VBN(bmap))
159 nilfs_direct_set_target_v(direct, key, req.bpr_ptr); 147 nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
160 148
161 nilfs_bmap_add_blocks(bmap, 1); 149 nilfs_bmap_add_blocks(bmap, 1);
162 } 150 }
@@ -165,33 +153,30 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
165 153
166static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) 154static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
167{ 155{
168 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
169 union nilfs_bmap_ptr_req req; 156 union nilfs_bmap_ptr_req req;
170 struct inode *dat; 157 struct inode *dat;
171 int ret; 158 int ret;
172 159
173 if (key > NILFS_DIRECT_KEY_MAX || 160 if (key > NILFS_DIRECT_KEY_MAX ||
174 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) 161 nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR)
175 return -ENOENT; 162 return -ENOENT;
176 163
177 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; 164 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
178 req.bpr_ptr = nilfs_direct_get_ptr(direct, key); 165 req.bpr_ptr = nilfs_direct_get_ptr(bmap, key);
179 166
180 ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); 167 ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
181 if (!ret) { 168 if (!ret) {
182 nilfs_bmap_commit_end_ptr(bmap, &req, dat); 169 nilfs_bmap_commit_end_ptr(bmap, &req, dat);
183 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); 170 nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
184 nilfs_bmap_sub_blocks(bmap, 1); 171 nilfs_bmap_sub_blocks(bmap, 1);
185 } 172 }
186 return ret; 173 return ret;
187} 174}
188 175
189static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) 176static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp)
190{ 177{
191 struct nilfs_direct *direct;
192 __u64 key, lastkey; 178 __u64 key, lastkey;
193 179
194 direct = (struct nilfs_direct *)bmap;
195 lastkey = NILFS_DIRECT_KEY_MAX + 1; 180 lastkey = NILFS_DIRECT_KEY_MAX + 1;
196 for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) 181 for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
197 if (nilfs_direct_get_ptr(direct, key) != 182 if (nilfs_direct_get_ptr(direct, key) !=
@@ -211,15 +196,13 @@ static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
211 return key > NILFS_DIRECT_KEY_MAX; 196 return key > NILFS_DIRECT_KEY_MAX;
212} 197}
213 198
214static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, 199static int nilfs_direct_gather_data(struct nilfs_bmap *direct,
215 __u64 *keys, __u64 *ptrs, int nitems) 200 __u64 *keys, __u64 *ptrs, int nitems)
216{ 201{
217 struct nilfs_direct *direct;
218 __u64 key; 202 __u64 key;
219 __u64 ptr; 203 __u64 ptr;
220 int n; 204 int n;
221 205
222 direct = (struct nilfs_direct *)bmap;
223 if (nitems > NILFS_DIRECT_NBLOCKS) 206 if (nitems > NILFS_DIRECT_NBLOCKS)
224 nitems = NILFS_DIRECT_NBLOCKS; 207 nitems = NILFS_DIRECT_NBLOCKS;
225 n = 0; 208 n = 0;
@@ -237,7 +220,6 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
237int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, 220int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
238 __u64 key, __u64 *keys, __u64 *ptrs, int n) 221 __u64 key, __u64 *keys, __u64 *ptrs, int n)
239{ 222{
240 struct nilfs_direct *direct;
241 __le64 *dptrs; 223 __le64 *dptrs;
242 int ret, i, j; 224 int ret, i, j;
243 225
@@ -253,12 +235,11 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
253 bmap->b_ops->bop_clear(bmap); 235 bmap->b_ops->bop_clear(bmap);
254 236
255 /* convert */ 237 /* convert */
256 direct = (struct nilfs_direct *)bmap; 238 dptrs = nilfs_direct_dptrs(bmap);
257 dptrs = nilfs_direct_dptrs(direct);
258 for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { 239 for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
259 if ((j < n) && (i == keys[j])) { 240 if ((j < n) && (i == keys[j])) {
260 dptrs[i] = (i != key) ? 241 dptrs[i] = (i != key) ?
261 nilfs_bmap_ptr_to_dptr(ptrs[j]) : 242 cpu_to_le64(ptrs[j]) :
262 NILFS_BMAP_INVALID_PTR; 243 NILFS_BMAP_INVALID_PTR;
263 j++; 244 j++;
264 } else 245 } else
@@ -269,10 +250,9 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
269 return 0; 250 return 0;
270} 251}
271 252
272static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, 253static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
273 struct buffer_head *bh) 254 struct buffer_head *bh)
274{ 255{
275 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
276 struct nilfs_palloc_req oldreq, newreq; 256 struct nilfs_palloc_req oldreq, newreq;
277 struct inode *dat; 257 struct inode *dat;
278 __u64 key; 258 __u64 key;
@@ -284,7 +264,7 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
284 264
285 dat = nilfs_bmap_get_dat(bmap); 265 dat = nilfs_bmap_get_dat(bmap);
286 key = nilfs_bmap_data_get_key(bmap, bh); 266 key = nilfs_bmap_data_get_key(bmap, bh);
287 ptr = nilfs_direct_get_ptr(direct, key); 267 ptr = nilfs_direct_get_ptr(bmap, key);
288 if (!buffer_nilfs_volatile(bh)) { 268 if (!buffer_nilfs_volatile(bh)) {
289 oldreq.pr_entry_nr = ptr; 269 oldreq.pr_entry_nr = ptr;
290 newreq.pr_entry_nr = ptr; 270 newreq.pr_entry_nr = ptr;
@@ -294,20 +274,20 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
294 nilfs_dat_commit_update(dat, &oldreq, &newreq, 274 nilfs_dat_commit_update(dat, &oldreq, &newreq,
295 bmap->b_ptr_type == NILFS_BMAP_PTR_VS); 275 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
296 set_buffer_nilfs_volatile(bh); 276 set_buffer_nilfs_volatile(bh);
297 nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr); 277 nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr);
298 } else 278 } else
299 ret = nilfs_dat_mark_dirty(dat, ptr); 279 ret = nilfs_dat_mark_dirty(dat, ptr);
300 280
301 return ret; 281 return ret;
302} 282}
303 283
304static int nilfs_direct_assign_v(struct nilfs_direct *direct, 284static int nilfs_direct_assign_v(struct nilfs_bmap *direct,
305 __u64 key, __u64 ptr, 285 __u64 key, __u64 ptr,
306 struct buffer_head **bh, 286 struct buffer_head **bh,
307 sector_t blocknr, 287 sector_t blocknr,
308 union nilfs_binfo *binfo) 288 union nilfs_binfo *binfo)
309{ 289{
310 struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap); 290 struct inode *dat = nilfs_bmap_get_dat(direct);
311 union nilfs_bmap_ptr_req req; 291 union nilfs_bmap_ptr_req req;
312 int ret; 292 int ret;
313 293
@@ -315,13 +295,13 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
315 ret = nilfs_dat_prepare_start(dat, &req.bpr_req); 295 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
316 if (!ret) { 296 if (!ret) {
317 nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); 297 nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
318 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 298 binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
319 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 299 binfo->bi_v.bi_blkoff = cpu_to_le64(key);
320 } 300 }
321 return ret; 301 return ret;
322} 302}
323 303
324static int nilfs_direct_assign_p(struct nilfs_direct *direct, 304static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
325 __u64 key, __u64 ptr, 305 __u64 key, __u64 ptr,
326 struct buffer_head **bh, 306 struct buffer_head **bh,
327 sector_t blocknr, 307 sector_t blocknr,
@@ -329,7 +309,7 @@ static int nilfs_direct_assign_p(struct nilfs_direct *direct,
329{ 309{
330 nilfs_direct_set_ptr(direct, key, blocknr); 310 nilfs_direct_set_ptr(direct, key, blocknr);
331 311
332 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); 312 binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
333 binfo->bi_dat.bi_level = 0; 313 binfo->bi_dat.bi_level = 0;
334 314
335 return 0; 315 return 0;
@@ -340,18 +320,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
340 sector_t blocknr, 320 sector_t blocknr,
341 union nilfs_binfo *binfo) 321 union nilfs_binfo *binfo)
342{ 322{
343 struct nilfs_direct *direct;
344 __u64 key; 323 __u64 key;
345 __u64 ptr; 324 __u64 ptr;
346 325
347 direct = (struct nilfs_direct *)bmap;
348 key = nilfs_bmap_data_get_key(bmap, *bh); 326 key = nilfs_bmap_data_get_key(bmap, *bh);
349 if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { 327 if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
350 printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, 328 printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
351 (unsigned long long)key); 329 (unsigned long long)key);
352 return -EINVAL; 330 return -EINVAL;
353 } 331 }
354 ptr = nilfs_direct_get_ptr(direct, key); 332 ptr = nilfs_direct_get_ptr(bmap, key);
355 if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { 333 if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
356 printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, 334 printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
357 (unsigned long long)ptr); 335 (unsigned long long)ptr);
@@ -359,8 +337,8 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
359 } 337 }
360 338
361 return NILFS_BMAP_USE_VBN(bmap) ? 339 return NILFS_BMAP_USE_VBN(bmap) ?
362 nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) : 340 nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) :
363 nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo); 341 nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo);
364} 342}
365 343
366static const struct nilfs_bmap_operations nilfs_direct_ops = { 344static const struct nilfs_bmap_operations nilfs_direct_ops = {
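Editor's note: the direct.c rewrite above works because the old struct nilfs_direct wrapped a struct nilfs_bmap as its first and only member, so (struct nilfs_direct *)bmap and &direct->d_bmap named the same address. With the wrapper gone, the casts disappear. A standalone before/after sketch (probe() is a hypothetical helper taking the bmap):

	struct nilfs_direct { struct nilfs_bmap d_bmap; };

	/* Before: cast in, then reach back out through d_bmap. */
	static int old_lookup(const struct nilfs_bmap *bmap, __u64 key)
	{
		struct nilfs_direct *direct = (struct nilfs_direct *)bmap;

		return probe(&direct->d_bmap, key);  /* == probe(bmap, key) */
	}

	/* After: operate on the bmap directly, no cast needed. */
	static int new_lookup(const struct nilfs_bmap *direct, __u64 key)
	{
		return probe(direct, key);
	}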
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index a5ffd66e25d..dc643de20a2 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -28,8 +28,6 @@
28#include "bmap.h" 28#include "bmap.h"
29 29
30 30
31struct nilfs_direct;
32
33/** 31/**
34 * struct nilfs_direct_node - direct node 32 * struct nilfs_direct_node - direct node
35 * @dn_flags: flags 33 * @dn_flags: flags
@@ -40,15 +38,6 @@ struct nilfs_direct_node {
40 __u8 pad[7]; 38 __u8 pad[7];
41}; 39};
42 40
43/**
44 * struct nilfs_direct - direct mapping
45 * @d_bmap: bmap structure
46 */
47struct nilfs_direct {
48 struct nilfs_bmap d_bmap;
49};
50
51
52#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) 41#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
53#define NILFS_DIRECT_KEY_MIN 0 42#define NILFS_DIRECT_KEY_MIN 0
54#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) 43#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
new file mode 100644
index 00000000000..a71cc412b65
--- /dev/null
+++ b/fs/nilfs2/export.h
@@ -0,0 +1,17 @@
1#ifndef NILFS_EXPORT_H
2#define NILFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations nilfs_export_ops;
7
8struct nilfs_fid {
9 u64 cno;
10 u64 ino;
11 u32 gen;
12
13 u32 parent_gen;
14 u64 parent_ino;
15} __attribute__ ((packed));
16
17#endif
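Editor's note: the new export.h packs struct nilfs_fid so it can be copied verbatim into the u32 array exportfs passes around, with the parent fields last so a non-connectable handle can simply omit them. A hedged sketch of how the two handle lengths (counted in 32-bit words, as exportfs expects) would fall out; the constant names here are hypothetical:

	/* Packing makes offsetof()/sizeof() exact byte counts:
	 * cno + ino + gen = 20 bytes -> 5 words; full fid = 32 -> 8. */
	enum {
		NILFS_FID_SIZE_NON_CONNECTABLE =
			(offsetof(struct nilfs_fid, parent_gen) + 3) / 4,
		NILFS_FID_SIZE_CONNECTABLE =
			(sizeof(struct nilfs_fid) + 3) / 4,
	};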
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
deleted file mode 100644
index dd5f7e0a95f..00000000000
--- a/fs/nilfs2/gcdat.c
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_palloc_clear_cache(dat);
65 nilfs_palloc_clear_cache(gcdat);
66 nilfs_clear_dirty_pages(mapping);
67 nilfs_copy_back_pages(mapping, gmapping);
68 /* note: mdt dirty flags should be cleared by segctor. */
69
70 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
71 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
72
73 up_write(&NILFS_MDT(dat)->mi_sem);
74}
75
76void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
77{
78 struct inode *gcdat = nilfs->ns_gc_dat;
79 struct nilfs_inode_info *gii = NILFS_I(gcdat);
80
81 gcdat->i_state = I_CLEAR;
82 gii->i_flags = 0;
83
84 nilfs_palloc_clear_cache(gcdat);
85 truncate_inode_pages(gcdat->i_mapping, 0);
86 truncate_inode_pages(&gii->i_btnode_cache, 0);
87}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 145f03cd7d3..33ad25ddd5c 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,13 +28,6 @@
 28 * gcinodes), and this file provides the lookup function for the dummy 28 * gcinodes), and this file provides the lookup function for the dummy
 29 * inodes and their buffer read function. 29 * inodes and their buffer read function.
30 * 30 *
31 * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
32 * has to treat blocks that belong to a same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to be specified with a
36 * checkpoint number argument as well as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes will be released each 31 * Buffers and pages held by the dummy inodes will be released each
39 * time after they are copied to a new log. Dirty blocks made on the 32 * time after they are copied to a new log. Dirty blocks made on the
40 * current generation and the blocks to be moved by GC never overlap 33 * current generation and the blocks to be moved by GC never overlap
@@ -48,6 +41,8 @@
48#include <linux/slab.h> 41#include <linux/slab.h>
49#include <linux/swap.h> 42#include <linux/swap.h>
50#include "nilfs.h" 43#include "nilfs.h"
44#include "btree.h"
45#include "btnode.h"
51#include "page.h" 46#include "page.h"
52#include "mdt.h" 47#include "mdt.h"
53#include "dat.h" 48#include "dat.h"
@@ -149,8 +144,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
149int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, 144int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
150 __u64 vbn, struct buffer_head **out_bh) 145 __u64 vbn, struct buffer_head **out_bh)
151{ 146{
152 int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, 147 int ret;
153 vbn ? : pbn, pbn, out_bh); 148
149 ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
150 vbn ? : pbn, pbn, READ, out_bh, &pbn);
154 if (ret == -EEXIST) /* internal code (cache hit) */ 151 if (ret == -EEXIST) /* internal code (cache hit) */
155 ret = 0; 152 ret = 0;
156 return ret; 153 return ret;
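Editor's note: the "vbn ? : pbn" in the hunk above is GNU C's conditional with an omitted middle operand: it yields vbn when vbn is nonzero and pbn otherwise, evaluating vbn only once. Spelled out without the extension:

	/* Cache key for the btnode lookup: prefer the virtual block
	 * number, fall back to the physical one when vbn is zero. */
	sector_t key = vbn ? vbn : pbn;		/* same as 'vbn ? : pbn' */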
@@ -164,127 +161,53 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
164 if (buffer_dirty(bh)) 161 if (buffer_dirty(bh))
165 return -EEXIST; 162 return -EEXIST;
166 163
167 if (buffer_nilfs_node(bh)) 164 if (buffer_nilfs_node(bh)) {
165 if (nilfs_btree_broken_node_block(bh)) {
166 clear_buffer_uptodate(bh);
167 return -EIO;
168 }
168 nilfs_btnode_mark_dirty(bh); 169 nilfs_btnode_mark_dirty(bh);
169 else 170 } else {
170 nilfs_mdt_mark_buffer_dirty(bh); 171 nilfs_mark_buffer_dirty(bh);
171 return 0;
172}
173
174/*
175 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
176 * @nilfs - the_nilfs
177 *
178 * Return Value: On success, 0.
179 * On error, a negative error code is returned.
180 */
181int nilfs_init_gccache(struct the_nilfs *nilfs)
182{
183 int loop;
184
185 BUG_ON(nilfs->ns_gc_inodes_h);
186
187 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
188
189 nilfs->ns_gc_inodes_h =
190 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
191 GFP_NOFS);
192 if (nilfs->ns_gc_inodes_h == NULL)
193 return -ENOMEM;
194
195 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
196 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
197 return 0;
198}
199
200/*
201 * nilfs_destroy_gccache() - free gc_inode hash table
202 * @nilfs - the nilfs
203 */
204void nilfs_destroy_gccache(struct the_nilfs *nilfs)
205{
206 if (nilfs->ns_gc_inodes_h) {
207 nilfs_remove_all_gcinode(nilfs);
208 kfree(nilfs->ns_gc_inodes_h);
209 nilfs->ns_gc_inodes_h = NULL;
210 } 172 }
173 return 0;
211} 174}
212 175
213static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino, 176int nilfs_init_gcinode(struct inode *inode)
214 __u64 cno)
215{ 177{
216 struct inode *inode; 178 struct nilfs_inode_info *ii = NILFS_I(inode);
217 struct nilfs_inode_info *ii; 179 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
218
219 inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
220 if (!inode)
221 return NULL;
222 180
223 inode->i_op = NULL; 181 inode->i_mode = S_IFREG;
224 inode->i_fop = NULL; 182 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
225 inode->i_mapping->a_ops = &def_gcinode_aops; 183 inode->i_mapping->a_ops = &def_gcinode_aops;
184 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
226 185
227 ii = NILFS_I(inode);
228 ii->i_cno = cno;
229 ii->i_flags = 0; 186 ii->i_flags = 0;
230 ii->i_state = 1 << NILFS_I_GCINODE;
231 ii->i_bh = NULL;
232 nilfs_bmap_init_gc(ii->i_bmap); 187 nilfs_bmap_init_gc(ii->i_bmap);
233 188
234 return inode; 189 /*
 235} 190 * Add the inode to the GC inode list. Garbage Collection
236 191 * is serialized and no two processes manipulate the
237static unsigned long ihash(ino_t ino, __u64 cno) 192 * list simultaneously.
238{ 193 */
239 return hash_long((unsigned long)((ino << 2) + cno), 194 igrab(inode);
240 NILFS_GCINODE_HASH_BITS); 195 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
241}
242
243/*
244 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
245 */
246struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
247{
248 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
249 struct hlist_node *node;
250 struct inode *inode;
251
252 hlist_for_each_entry(inode, node, head, i_hash) {
253 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
254 return inode;
255 }
256
257 inode = alloc_gcinode(nilfs, ino, cno);
258 if (likely(inode)) {
259 hlist_add_head(&inode->i_hash, head);
260 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
261 }
262 return inode;
263}
264 196
265/* 197 return 0;
266 * nilfs_clear_gcinode() - clear and free a gc inode
267 */
268void nilfs_clear_gcinode(struct inode *inode)
269{
270 nilfs_mdt_destroy(inode);
271} 198}
272 199
273/* 200/**
274 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs 201 * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
275 */ 202 */
276void nilfs_remove_all_gcinode(struct the_nilfs *nilfs) 203void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
277{ 204{
278 struct hlist_head *head = nilfs->ns_gc_inodes_h; 205 struct list_head *head = &nilfs->ns_gc_inodes;
279 struct hlist_node *node, *n; 206 struct nilfs_inode_info *ii;
280 struct inode *inode;
281 int loop;
282 207
283 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) { 208 while (!list_empty(head)) {
284 hlist_for_each_entry_safe(inode, node, n, head, i_hash) { 209 ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
285 hlist_del_init(&inode->i_hash); 210 list_del_init(&ii->i_dirty);
286 list_del_init(&NILFS_I(inode)->i_dirty); 211 iput(&ii->vfs_inode);
287 nilfs_clear_gcinode(inode); /* might sleep */
288 }
289 } 212 }
290} 213}
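Editor's note: the new nilfs_remove_all_gcinodes() drains the list with a while/list_first_entry() loop rather than list_for_each_entry_safe(), because iput() may sleep and ultimately frees the entry, so no iterator state can be carried across the call. The idiom in isolation (struct item and release() are hypothetical):

	/* Pop one entry at a time; safe even though release() destroys
	 * the entry, since nothing is held across iterations. */
	while (!list_empty(head)) {
		struct item *it = list_first_entry(head, struct item, link);

		list_del_init(&it->link);
		release(it);			/* may sleep */
	}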
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 922d9dd42c8..9f8a2da67f9 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -161,25 +161,46 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
161} 161}
162 162
163/** 163/**
164 * nilfs_ifile_new - create inode file 164 * nilfs_ifile_read - read or get ifile inode
165 * @sbi: nilfs_sb_info struct 165 * @sb: super block instance
166 * @root: root object
166 * @inode_size: size of an inode 167 * @inode_size: size of an inode
168 * @raw_inode: on-disk ifile inode
169 * @inodep: buffer to store the inode
167 */ 170 */
168struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size) 171int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
172 size_t inode_size, struct nilfs_inode *raw_inode,
173 struct inode **inodep)
169{ 174{
170 struct inode *ifile; 175 struct inode *ifile;
171 int err; 176 int err;
172 177
173 ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO, 178 ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
174 sizeof(struct nilfs_ifile_info)); 179 if (unlikely(!ifile))
175 if (ifile) { 180 return -ENOMEM;
176 err = nilfs_palloc_init_blockgroup(ifile, inode_size); 181 if (!(ifile->i_state & I_NEW))
177 if (unlikely(err)) { 182 goto out;
178 nilfs_mdt_destroy(ifile); 183
179 return NULL; 184 err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
180 } 185 sizeof(struct nilfs_ifile_info));
181 nilfs_palloc_setup_cache(ifile, 186 if (err)
182 &NILFS_IFILE_I(ifile)->palloc_cache); 187 goto failed;
183 } 188
184 return ifile; 189 err = nilfs_palloc_init_blockgroup(ifile, inode_size);
190 if (err)
191 goto failed;
192
193 nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
194
195 err = nilfs_read_inode_common(ifile, raw_inode);
196 if (err)
197 goto failed;
198
199 unlock_new_inode(ifile);
200 out:
201 *inodep = ifile;
202 return 0;
203 failed:
204 iget_failed(ifile);
205 return err;
185} 206}
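Editor's note: nilfs_ifile_read() above follows the stock read-or-get shape for inodes kept in the VFS inode cache: look up or allocate with the inode locked, return early on a cache hit, otherwise initialize and publish. Distilled, with example_fill() as a hypothetical initialization helper:

	struct inode *example_get(struct super_block *sb, unsigned long ino)
	{
		struct inode *inode = iget_locked(sb, ino);

		if (!inode)
			return ERR_PTR(-ENOMEM);
		if (!(inode->i_state & I_NEW))
			return inode;		/* cache hit, already set up */

		if (example_fill(inode)) {	/* hypothetical init step */
			iget_failed(inode);	/* mark bad, unhash, unlock */
			return ERR_PTR(-EIO);
		}
		unlock_new_inode(inode);	/* clears I_NEW, wakes waiters */
		return inode;
	}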
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index cbca32e498f..59b6f2b51df 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
49int nilfs_ifile_delete_inode(struct inode *, ino_t); 49int nilfs_ifile_delete_inode(struct inode *, ino_t);
50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); 50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
51 51
52struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size); 52int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
53 size_t inode_size, struct nilfs_inode *raw_inode,
54 struct inode **inodep);
53 55
54#endif /* _NILFS_IFILE_H */ 56#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 39e038ac8fc..71d4bc8464e 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -27,12 +27,19 @@
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/uio.h> 28#include <linux/uio.h>
29#include "nilfs.h" 29#include "nilfs.h"
30#include "btnode.h"
30#include "segment.h" 31#include "segment.h"
31#include "page.h" 32#include "page.h"
32#include "mdt.h" 33#include "mdt.h"
33#include "cpfile.h" 34#include "cpfile.h"
34#include "ifile.h" 35#include "ifile.h"
35 36
37struct nilfs_iget_args {
38 u64 ino;
39 __u64 cno;
40 struct nilfs_root *root;
41 int for_gc;
42};
36 43
37/** 44/**
38 * nilfs_get_block() - get a file block on the filesystem (callback function) 45 * nilfs_get_block() - get a file block on the filesystem (callback function)
@@ -197,11 +204,15 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
197 if (unlikely(err)) 204 if (unlikely(err))
198 return err; 205 return err;
199 206
200 *pagep = NULL; 207 err = block_write_begin(mapping, pos, len, flags, pagep,
201 err = block_write_begin(file, mapping, pos, len, flags, pagep, 208 nilfs_get_block);
202 fsdata, nilfs_get_block); 209 if (unlikely(err)) {
203 if (unlikely(err)) 210 loff_t isize = mapping->host->i_size;
211 if (pos + len > isize)
212 vmtruncate(mapping->host, isize);
213
204 nilfs_transaction_abort(inode->i_sb); 214 nilfs_transaction_abort(inode->i_sb);
215 }
205 return err; 216 return err;
206} 217}
207 218
@@ -237,6 +248,19 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
237 /* Needs synchronization with the cleaner */ 248 /* Needs synchronization with the cleaner */
238 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 249 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
239 offset, nr_segs, nilfs_get_block, NULL); 250 offset, nr_segs, nilfs_get_block, NULL);
251
252 /*
253 * In case of error extending write may have instantiated a few
254 * blocks outside i_size. Trim these off again.
255 */
256 if (unlikely((rw & WRITE) && size < 0)) {
257 loff_t isize = i_size_read(inode);
258 loff_t end = offset + iov_length(iov, nr_segs);
259
260 if (end > isize)
261 vmtruncate(inode, isize);
262 }
263
240 return size; 264 return size;
241} 265}
242 266
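Editor's note: both the write_begin and direct_IO hunks above adopt the same clean-up rule: if an extending write fails after nilfs_get_block instantiated blocks past the old end of file, truncate back to i_size so allocated-but-unwritten blocks are not left behind. Reduced to its core (a sketch; is_write and end stand in for the surrounding context):

	/* On failure of an extending write, trim blocks instantiated
	 * beyond the current on-disk size. */
	if (is_write && ret < 0) {
		loff_t isize = i_size_read(inode);

		if (end > isize)	/* end = intended write endpoint */
			vmtruncate(inode, isize);
	}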
@@ -261,6 +285,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
261 struct nilfs_sb_info *sbi = NILFS_SB(sb); 285 struct nilfs_sb_info *sbi = NILFS_SB(sb);
262 struct inode *inode; 286 struct inode *inode;
263 struct nilfs_inode_info *ii; 287 struct nilfs_inode_info *ii;
288 struct nilfs_root *root;
264 int err = -ENOMEM; 289 int err = -ENOMEM;
265 ino_t ino; 290 ino_t ino;
266 291
@@ -271,15 +296,17 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
271 mapping_set_gfp_mask(inode->i_mapping, 296 mapping_set_gfp_mask(inode->i_mapping,
272 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 297 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
273 298
299 root = NILFS_I(dir)->i_root;
274 ii = NILFS_I(inode); 300 ii = NILFS_I(inode);
275 ii->i_state = 1 << NILFS_I_NEW; 301 ii->i_state = 1 << NILFS_I_NEW;
302 ii->i_root = root;
276 303
277 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); 304 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
278 if (unlikely(err)) 305 if (unlikely(err))
279 goto failed_ifile_create_inode; 306 goto failed_ifile_create_inode;
280 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 307 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
281 308
282 atomic_inc(&sbi->s_inodes_count); 309 atomic_inc(&root->inodes_count);
283 inode_init_owner(inode, dir, mode); 310 inode_init_owner(inode, dir, mode);
284 inode->i_ino = ino; 311 inode->i_ino = ino;
285 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 312 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -302,7 +329,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
302 /* ii->i_file_acl = 0; */ 329 /* ii->i_file_acl = 0; */
303 /* ii->i_dir_acl = 0; */ 330 /* ii->i_dir_acl = 0; */
304 ii->i_dir_start_lookup = 0; 331 ii->i_dir_start_lookup = 0;
305 ii->i_cno = 0;
306 nilfs_set_inode_flags(inode); 332 nilfs_set_inode_flags(inode);
307 spin_lock(&sbi->s_next_gen_lock); 333 spin_lock(&sbi->s_next_gen_lock);
308 inode->i_generation = sbi->s_next_generation++; 334 inode->i_generation = sbi->s_next_generation++;
@@ -332,17 +358,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
332 return ERR_PTR(err); 358 return ERR_PTR(err);
333} 359}
334 360
335void nilfs_free_inode(struct inode *inode)
336{
337 struct super_block *sb = inode->i_sb;
338 struct nilfs_sb_info *sbi = NILFS_SB(sb);
339
340 clear_inode(inode);
341 /* XXX: check error code? Is there any thing I can do? */
342 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
343 atomic_dec(&sbi->s_inodes_count);
344}
345
346void nilfs_set_inode_flags(struct inode *inode) 361void nilfs_set_inode_flags(struct inode *inode)
347{ 362{
348 unsigned int flags = NILFS_I(inode)->i_flags; 363 unsigned int flags = NILFS_I(inode)->i_flags;
@@ -393,7 +408,6 @@ int nilfs_read_inode_common(struct inode *inode,
393 0 : le32_to_cpu(raw_inode->i_dir_acl); 408 0 : le32_to_cpu(raw_inode->i_dir_acl);
394#endif 409#endif
395 ii->i_dir_start_lookup = 0; 410 ii->i_dir_start_lookup = 0;
396 ii->i_cno = 0;
397 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 411 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
398 412
399 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 413 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -407,7 +421,8 @@ int nilfs_read_inode_common(struct inode *inode,
407 return 0; 421 return 0;
408} 422}
409 423
410static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, 424static int __nilfs_read_inode(struct super_block *sb,
425 struct nilfs_root *root, unsigned long ino,
411 struct inode *inode) 426 struct inode *inode)
412{ 427{
413 struct nilfs_sb_info *sbi = NILFS_SB(sb); 428 struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -417,11 +432,11 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
417 int err; 432 int err;
418 433
419 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 434 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
420 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh); 435 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
421 if (unlikely(err)) 436 if (unlikely(err))
422 goto bad_inode; 437 goto bad_inode;
423 438
424 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 439 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
425 440
426 err = nilfs_read_inode_common(inode, raw_inode); 441 err = nilfs_read_inode_common(inode, raw_inode);
427 if (err) 442 if (err)
@@ -444,14 +459,14 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
444 inode, inode->i_mode, 459 inode, inode->i_mode,
445 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 460 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
446 } 461 }
447 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 462 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
448 brelse(bh); 463 brelse(bh);
449 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 464 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
450 nilfs_set_inode_flags(inode); 465 nilfs_set_inode_flags(inode);
451 return 0; 466 return 0;
452 467
453 failed_unmap: 468 failed_unmap:
454 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 469 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
455 brelse(bh); 470 brelse(bh);
456 471
457 bad_inode: 472 bad_inode:
@@ -459,18 +474,95 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
459 return err; 474 return err;
460} 475}
461 476
462struct inode *nilfs_iget(struct super_block *sb, unsigned long ino) 477static int nilfs_iget_test(struct inode *inode, void *opaque)
478{
479 struct nilfs_iget_args *args = opaque;
480 struct nilfs_inode_info *ii;
481
482 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
483 return 0;
484
485 ii = NILFS_I(inode);
486 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
487 return !args->for_gc;
488
489 return args->for_gc && args->cno == ii->i_cno;
490}
491
492static int nilfs_iget_set(struct inode *inode, void *opaque)
493{
494 struct nilfs_iget_args *args = opaque;
495
496 inode->i_ino = args->ino;
497 if (args->for_gc) {
498 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
499 NILFS_I(inode)->i_cno = args->cno;
500 NILFS_I(inode)->i_root = NULL;
501 } else {
502 if (args->root && args->ino == NILFS_ROOT_INO)
503 nilfs_get_root(args->root);
504 NILFS_I(inode)->i_root = args->root;
505 }
506 return 0;
507}
508
509struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
510 unsigned long ino)
511{
512 struct nilfs_iget_args args = {
513 .ino = ino, .root = root, .cno = 0, .for_gc = 0
514 };
515
516 return ilookup5(sb, ino, nilfs_iget_test, &args);
517}
518
519struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
520 unsigned long ino)
521{
522 struct nilfs_iget_args args = {
523 .ino = ino, .root = root, .cno = 0, .for_gc = 0
524 };
525
526 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
527}
528
529struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
530 unsigned long ino)
531{
532 struct inode *inode;
533 int err;
534
535 inode = nilfs_iget_locked(sb, root, ino);
536 if (unlikely(!inode))
537 return ERR_PTR(-ENOMEM);
538 if (!(inode->i_state & I_NEW))
539 return inode;
540
541 err = __nilfs_read_inode(sb, root, ino, inode);
542 if (unlikely(err)) {
543 iget_failed(inode);
544 return ERR_PTR(err);
545 }
546 unlock_new_inode(inode);
547 return inode;
548}
549
550struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
551 __u64 cno)
463{ 552{
553 struct nilfs_iget_args args = {
554 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
555 };
464 struct inode *inode; 556 struct inode *inode;
465 int err; 557 int err;
466 558
467 inode = iget_locked(sb, ino); 559 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
468 if (unlikely(!inode)) 560 if (unlikely(!inode))
469 return ERR_PTR(-ENOMEM); 561 return ERR_PTR(-ENOMEM);
470 if (!(inode->i_state & I_NEW)) 562 if (!(inode->i_state & I_NEW))
471 return inode; 563 return inode;
472 564
473 err = __nilfs_read_inode(sb, ino, inode); 565 err = nilfs_init_gcinode(inode);
474 if (unlikely(err)) { 566 if (unlikely(err)) {
475 iget_failed(inode); 567 iget_failed(inode);
476 return ERR_PTR(err); 568 return ERR_PTR(err);
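Editor's note: iget5_locked() hashes only on the ino it is given; the test callback is what disambiguates colliding entries. That is how a GC inode for a given (ino, cno) pair can coexist in the inode cache with the regular inode of the same number but a different root or checkpoint. The usage shape from the hunk above, isolated:

	struct nilfs_iget_args args = {
		.ino = ino, .root = NULL, .cno = cno, .for_gc = 1,
	};
	struct inode *inode;

	/* nilfs_iget_test() rejects cache entries whose root/cno do not
	 * match args; nilfs_iget_set() stamps a newly allocated inode. */
	inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);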
@@ -511,21 +603,20 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
511{ 603{
512 ino_t ino = inode->i_ino; 604 ino_t ino = inode->i_ino;
513 struct nilfs_inode_info *ii = NILFS_I(inode); 605 struct nilfs_inode_info *ii = NILFS_I(inode);
514 struct super_block *sb = inode->i_sb; 606 struct inode *ifile = ii->i_root->ifile;
515 struct nilfs_sb_info *sbi = NILFS_SB(sb);
516 struct nilfs_inode *raw_inode; 607 struct nilfs_inode *raw_inode;
517 608
518 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); 609 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
519 610
520 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 611 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
521 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); 612 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
522 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 613 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
523 614
524 nilfs_write_inode_common(inode, raw_inode, 0); 615 nilfs_write_inode_common(inode, raw_inode, 0);
525 /* XXX: call with has_bmap = 0 is a workaround to avoid 616 /* XXX: call with has_bmap = 0 is a workaround to avoid
526 deadlock of bmap. This delays update of i_bmap to just 617 deadlock of bmap. This delays update of i_bmap to just
527 before writing */ 618 before writing */
528 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); 619 nilfs_ifile_unmap_inode(ifile, ino, ibh);
529} 620}
530 621
531#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 622#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
@@ -597,16 +688,41 @@ void nilfs_truncate(struct inode *inode)
597 But truncate has no return value. */ 688 But truncate has no return value. */
598} 689}
599 690
600void nilfs_delete_inode(struct inode *inode) 691static void nilfs_clear_inode(struct inode *inode)
692{
693 struct nilfs_inode_info *ii = NILFS_I(inode);
694 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
695
696 /*
697 * Free resources allocated in nilfs_read_inode(), here.
698 */
699 BUG_ON(!list_empty(&ii->i_dirty));
700 brelse(ii->i_bh);
701 ii->i_bh = NULL;
702
703 if (mdi && mdi->mi_palloc_cache)
704 nilfs_palloc_destroy_cache(inode);
705
706 if (test_bit(NILFS_I_BMAP, &ii->i_state))
707 nilfs_bmap_clear(ii->i_bmap);
708
709 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
710
711 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
712 nilfs_put_root(ii->i_root);
713}
714
715void nilfs_evict_inode(struct inode *inode)
601{ 716{
602 struct nilfs_transaction_info ti; 717 struct nilfs_transaction_info ti;
603 struct super_block *sb = inode->i_sb; 718 struct super_block *sb = inode->i_sb;
604 struct nilfs_inode_info *ii = NILFS_I(inode); 719 struct nilfs_inode_info *ii = NILFS_I(inode);
605 720
606 if (unlikely(is_bad_inode(inode))) { 721 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
607 if (inode->i_data.nrpages) 722 if (inode->i_data.nrpages)
608 truncate_inode_pages(&inode->i_data, 0); 723 truncate_inode_pages(&inode->i_data, 0);
609 clear_inode(inode); 724 end_writeback(inode);
725 nilfs_clear_inode(inode);
610 return; 726 return;
611 } 727 }
612 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 728 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
@@ -614,10 +730,16 @@ void nilfs_delete_inode(struct inode *inode)
614 if (inode->i_data.nrpages) 730 if (inode->i_data.nrpages)
615 truncate_inode_pages(&inode->i_data, 0); 731 truncate_inode_pages(&inode->i_data, 0);
616 732
733 /* TODO: some of the following operations may fail. */
617 nilfs_truncate_bmap(ii, 0); 734 nilfs_truncate_bmap(ii, 0);
618 nilfs_mark_inode_dirty(inode); 735 nilfs_mark_inode_dirty(inode);
619 nilfs_free_inode(inode); 736 end_writeback(inode);
620 /* nilfs_free_inode() marks inode buffer dirty */ 737
738 nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
739 atomic_dec(&ii->i_root->inodes_count);
740
741 nilfs_clear_inode(inode);
742
621 if (IS_SYNC(inode)) 743 if (IS_SYNC(inode))
622 nilfs_set_transaction_flag(NILFS_TI_SYNC); 744 nilfs_set_transaction_flag(NILFS_TI_SYNC);
623 nilfs_transaction_commit(sb); 745 nilfs_transaction_commit(sb);
@@ -639,17 +761,41 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
639 err = nilfs_transaction_begin(sb, &ti, 0); 761 err = nilfs_transaction_begin(sb, &ti, 0);
640 if (unlikely(err)) 762 if (unlikely(err))
641 return err; 763 return err;
642 err = inode_setattr(inode, iattr); 764
643 if (!err && (iattr->ia_valid & ATTR_MODE)) 765 if ((iattr->ia_valid & ATTR_SIZE) &&
766 iattr->ia_size != i_size_read(inode)) {
767 err = vmtruncate(inode, iattr->ia_size);
768 if (unlikely(err))
769 goto out_err;
770 }
771
772 setattr_copy(inode, iattr);
773 mark_inode_dirty(inode);
774
775 if (iattr->ia_valid & ATTR_MODE) {
644 err = nilfs_acl_chmod(inode); 776 err = nilfs_acl_chmod(inode);
645 if (likely(!err)) 777 if (unlikely(err))
646 err = nilfs_transaction_commit(sb); 778 goto out_err;
647 else 779 }
648 nilfs_transaction_abort(sb); 780
781 return nilfs_transaction_commit(sb);
649 782
783out_err:
784 nilfs_transaction_abort(sb);
650 return err; 785 return err;
651} 786}
652 787
788int nilfs_permission(struct inode *inode, int mask)
789{
790 struct nilfs_root *root = NILFS_I(inode)->i_root;
791
792 if ((mask & MAY_WRITE) && root &&
793 root->cno != NILFS_CPTREE_CURRENT_CNO)
794 return -EROFS; /* snapshot is not writable */
795
796 return generic_permission(inode, mask, NULL);
797}
798
653int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 799int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
654 struct buffer_head **pbh) 800 struct buffer_head **pbh)
655{ 801{
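Editor's note: the setattr conversion above replaces the removed inode_setattr() with the open-coded sequence the VFS switched to in this period: handle ATTR_SIZE separately through vmtruncate(), then copy the remaining attributes and dirty the inode. The generic shape (sketch; the transaction handling around it is elided):

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}
	setattr_copy(inode, attr);	/* uid/gid/mode/timestamps */
	mark_inode_dirty(inode);
	return 0;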
@@ -659,8 +805,8 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
659 spin_lock(&sbi->s_inode_lock); 805 spin_lock(&sbi->s_inode_lock);
660 if (ii->i_bh == NULL) { 806 if (ii->i_bh == NULL) {
661 spin_unlock(&sbi->s_inode_lock); 807 spin_unlock(&sbi->s_inode_lock);
662 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, 808 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
663 pbh); 809 inode->i_ino, pbh);
664 if (unlikely(err)) 810 if (unlikely(err))
665 return err; 811 return err;
666 spin_lock(&sbi->s_inode_lock); 812 spin_lock(&sbi->s_inode_lock);
@@ -740,7 +886,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
740 } 886 }
741 nilfs_update_inode(inode, ibh); 887 nilfs_update_inode(inode, ibh);
742 nilfs_mdt_mark_buffer_dirty(ibh); 888 nilfs_mdt_mark_buffer_dirty(ibh);
743 nilfs_mdt_mark_dirty(sbi->s_ifile); 889 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
744 brelse(ibh); 890 brelse(ibh);
745 return 0; 891 return 0;
746} 892}
@@ -758,6 +904,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
758void nilfs_dirty_inode(struct inode *inode) 904void nilfs_dirty_inode(struct inode *inode)
759{ 905{
760 struct nilfs_transaction_info ti; 906 struct nilfs_transaction_info ti;
907 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
761 908
762 if (is_bad_inode(inode)) { 909 if (is_bad_inode(inode)) {
763 nilfs_warning(inode->i_sb, __func__, 910 nilfs_warning(inode->i_sb, __func__,
@@ -765,6 +912,10 @@ void nilfs_dirty_inode(struct inode *inode)
765 dump_stack(); 912 dump_stack();
766 return; 913 return;
767 } 914 }
915 if (mdi) {
916 nilfs_mdt_mark_dirty(inode);
917 return;
918 }
768 nilfs_transaction_begin(inode->i_sb, &ti, 0); 919 nilfs_transaction_begin(inode->i_sb, &ti, 0);
769 nilfs_mark_inode_dirty(inode); 920 nilfs_mark_inode_dirty(inode);
770 nilfs_transaction_commit(inode->i_sb); /* never fails */ 921 nilfs_transaction_commit(inode->i_sb); /* never fails */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f90a33d9a5b..3e90f86d5bf 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -22,7 +22,6 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/slab.h> 25#include <linux/slab.h>
27#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
28#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
@@ -118,7 +117,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
118 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 117 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
119 goto out; 118 goto out;
120 119
121 mutex_lock(&nilfs->ns_mount_mutex); 120 down_read(&inode->i_sb->s_umount);
122 121
123 nilfs_transaction_begin(inode->i_sb, &ti, 0); 122 nilfs_transaction_begin(inode->i_sb, &ti, 0);
124 ret = nilfs_cpfile_change_cpmode( 123 ret = nilfs_cpfile_change_cpmode(
@@ -128,7 +127,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
128 else 127 else
129 nilfs_transaction_commit(inode->i_sb); /* never fails */ 128 nilfs_transaction_commit(inode->i_sb); /* never fails */
130 129
131 mutex_unlock(&nilfs->ns_mount_mutex); 130 up_read(&inode->i_sb->s_umount);
132out: 131out:
133 mnt_drop_write(filp->f_path.mnt); 132 mnt_drop_write(filp->f_path.mnt);
134 return ret; 133 return ret;
@@ -334,7 +333,7 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
334 return 0; 333 return 0;
335} 334}
336 335
337static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, 336static int nilfs_ioctl_move_blocks(struct super_block *sb,
338 struct nilfs_argv *argv, void *buf) 337 struct nilfs_argv *argv, void *buf)
339{ 338{
340 size_t nmembs = argv->v_nmembs; 339 size_t nmembs = argv->v_nmembs;
@@ -349,7 +348,7 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
349 for (i = 0, vdesc = buf; i < nmembs; ) { 348 for (i = 0, vdesc = buf; i < nmembs; ) {
350 ino = vdesc->vd_ino; 349 ino = vdesc->vd_ino;
351 cno = vdesc->vd_cno; 350 cno = vdesc->vd_cno;
352 inode = nilfs_gc_iget(nilfs, ino, cno); 351 inode = nilfs_iget_for_gc(sb, ino, cno);
353 if (unlikely(inode == NULL)) { 352 if (unlikely(inode == NULL)) {
354 ret = -ENOMEM; 353 ret = -ENOMEM;
355 goto failed; 354 goto failed;
@@ -357,11 +356,15 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
357 do { 356 do {
358 ret = nilfs_ioctl_move_inode_block(inode, vdesc, 357 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
359 &buffers); 358 &buffers);
360 if (unlikely(ret < 0)) 359 if (unlikely(ret < 0)) {
360 iput(inode);
361 goto failed; 361 goto failed;
362 }
362 vdesc++; 363 vdesc++;
363 } while (++i < nmembs && 364 } while (++i < nmembs &&
364 vdesc->vd_ino == ino && vdesc->vd_cno == cno); 365 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
366
367 iput(inode); /* The inode still remains in GC inode list */
365 } 368 }
366 369
367 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { 370 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
@@ -567,7 +570,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
567 } 570 }
568 571
569 /* 572 /*
570 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), 573 * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(),
 571 * which will operate on an inode list without blocking. 574 * which will operate on an inode list without blocking.
572 * To protect the list from concurrent operations, 575 * To protect the list from concurrent operations,
 573 * nilfs_ioctl_move_blocks should be an atomic operation. 576 * nilfs_ioctl_move_blocks should be an atomic operation.
@@ -577,15 +580,16 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
577 goto out_free; 580 goto out_free;
578 } 581 }
579 582
580 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); 583 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
584
585 ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
581 if (ret < 0) 586 if (ret < 0)
582 printk(KERN_ERR "NILFS: GC failed during preparation: " 587 printk(KERN_ERR "NILFS: GC failed during preparation: "
583 "cannot read source blocks: err=%d\n", ret); 588 "cannot read source blocks: err=%d\n", ret);
584 else 589 else
585 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 590 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
586 591
587 if (ret < 0) 592 nilfs_remove_all_gcinodes(nilfs);
588 nilfs_remove_all_gcinode(nilfs);
589 clear_nilfs_gc_running(nilfs); 593 clear_nilfs_gc_running(nilfs);
590 594
591out_free: 595out_free:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 024be8c35bb..39a5b84e2c9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -28,6 +28,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include "nilfs.h"
+#include "btnode.h"
 #include "segment.h"
 #include "page.h"
 #include "mdt.h"
@@ -35,7 +36,6 @@
 
 #define NILFS_MDT_MAX_RA_BLOCKS		(16 - 1)
 
-#define INIT_UNUSED_INODE_FIELDS
 
 static int
 nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
@@ -77,25 +77,11 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 			   struct buffer_head *,
 			   void *))
 {
-	struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_transaction_info ti;
 	struct buffer_head *bh;
 	int err;
 
-	if (!sb) {
-		/*
-		 * Make sure this function is not called from any
-		 * read-only context.
-		 */
-		if (!nilfs->ns_writer) {
-			WARN_ON(1);
-			err = -EROFS;
-			goto out;
-		}
-		sb = nilfs->ns_writer->s_super;
-	}
-
 	nilfs_transaction_begin(sb, &ti, 0);
 
 	err = -ENOMEM;
@@ -111,7 +97,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 	if (buffer_uptodate(bh))
 		goto failed_bh;
 
-	bh->b_bdev = nilfs->ns_bdev;
+	bh->b_bdev = sb->s_bdev;
 	err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
 	if (likely(!err)) {
 		get_bh(bh);
@@ -128,7 +114,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 		err = nilfs_transaction_commit(sb);
 	else
 		nilfs_transaction_abort(sb);
- out:
+
 	return err;
 }
 
@@ -166,9 +152,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 		unlock_buffer(bh);
 		goto failed_bh;
 	}
-	bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
-	bh->b_blocknr = (sector_t)blknum;
-	set_buffer_mapped(bh);
+	map_bh(bh, inode->i_sb, (sector_t)blknum);
 
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
@@ -397,35 +381,24 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
 static int
 nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 {
-	struct inode *inode = container_of(page->mapping,
-					   struct inode, i_data);
-	struct super_block *sb = inode->i_sb;
-	struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
-	struct nilfs_sb_info *writer = NULL;
+	struct inode *inode;
+	struct super_block *sb;
 	int err = 0;
 
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 
-	if (page->mapping->assoc_mapping)
-		return 0; /* Do not request flush for shadow page cache */
-	if (!sb) {
-		down_read(&nilfs->ns_writer_sem);
-		writer = nilfs->ns_writer;
-		if (!writer) {
-			up_read(&nilfs->ns_writer_sem);
-			return -EROFS;
-		}
-		sb = writer->s_super;
-	}
+	inode = page->mapping->host;
+	if (!inode)
+		return 0;
+
+	sb = inode->i_sb;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		err = nilfs_construct_segment(sb);
 	else if (wbc->for_reclaim)
 		nilfs_flush_segment(sb, inode->i_ino);
 
-	if (writer)
-		up_read(&nilfs->ns_writer_sem);
 	return err;
 }
 
@@ -438,105 +411,27 @@ static const struct address_space_operations def_mdt_aops = {
 static const struct inode_operations def_mdt_iops;
 static const struct file_operations def_mdt_fops;
 
-/*
- * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
- * ifile, or gcinodes. This allows the B-tree code and segment constructor
- * to treat them like regular files, and this helps to simplify the
- * implementation.
- * On the other hand, some of the pseudo inodes have an irregular point:
- * They don't have valid inode->i_sb pointer because their lifetimes are
- * longer than those of the super block structs; they may continue for
- * several consecutive mounts/umounts. This would need discussions.
- */
-/**
- * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
- * @nilfs: nilfs object
- * @sb: super block instance the metadata file belongs to
- * @ino: inode number
- * @gfp_mask: gfp mask for data pages
- * @objsz: size of the private object attached to inode->i_private
- */
-struct inode *
-nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-		     ino_t ino, gfp_t gfp_mask, size_t objsz)
+
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
 {
-	struct inode *inode = nilfs_alloc_inode_common(nilfs);
+	struct nilfs_mdt_info *mi;
 
-	if (!inode)
-		return NULL;
-	else {
-		struct address_space * const mapping = &inode->i_data;
-		struct nilfs_mdt_info *mi;
-
-		mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
-		if (!mi) {
-			nilfs_destroy_inode(inode);
-			return NULL;
-		}
-		mi->mi_nilfs = nilfs;
-		init_rwsem(&mi->mi_sem);
-
-		inode->i_sb = sb; /* sb may be NULL for some meta data files */
-		inode->i_blkbits = nilfs->ns_blocksize_bits;
-		inode->i_flags = 0;
-		atomic_set(&inode->i_count, 1);
-		inode->i_nlink = 1;
-		inode->i_ino = ino;
-		inode->i_mode = S_IFREG;
-		inode->i_private = mi;
-
-#ifdef INIT_UNUSED_INODE_FIELDS
-		atomic_set(&inode->i_writecount, 0);
-		inode->i_size = 0;
-		inode->i_blocks = 0;
-		inode->i_bytes = 0;
-		inode->i_generation = 0;
-#ifdef CONFIG_QUOTA
-		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
-#endif
-		inode->i_pipe = NULL;
-		inode->i_bdev = NULL;
-		inode->i_cdev = NULL;
-		inode->i_rdev = 0;
-#ifdef CONFIG_SECURITY
-		inode->i_security = NULL;
-#endif
-		inode->dirtied_when = 0;
-
-		INIT_LIST_HEAD(&inode->i_list);
-		INIT_LIST_HEAD(&inode->i_sb_list);
-		inode->i_state = 0;
-#endif
-
-		spin_lock_init(&inode->i_lock);
-		mutex_init(&inode->i_mutex);
-		init_rwsem(&inode->i_alloc_sem);
-
-		mapping->host = NULL; /* instead of inode */
-		mapping->flags = 0;
-		mapping_set_gfp_mask(mapping, gfp_mask);
-		mapping->assoc_mapping = NULL;
-		mapping->backing_dev_info = nilfs->ns_bdi;
-
-		inode->i_mapping = mapping;
-	}
+	mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
+	if (!mi)
+		return -ENOMEM;
 
-	return inode;
-}
+	init_rwsem(&mi->mi_sem);
+	inode->i_private = mi;
 
-struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-			    ino_t ino, size_t objsz)
-{
-	struct inode *inode;
-
-	inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
-	if (!inode)
-		return NULL;
+	inode->i_mode = S_IFREG;
+	mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
+	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
 	inode->i_op = &def_mdt_iops;
 	inode->i_fop = &def_mdt_fops;
 	inode->i_mapping->a_ops = &def_mdt_aops;
-	return inode;
+
+	return 0;
 }
 
 void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
@@ -549,34 +444,159 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
 	mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
 }
 
-void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
+static const struct address_space_operations shadow_map_aops = {
+	.sync_page	= block_sync_page,
+};
+
+/**
+ * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
+ * @inode: inode of the metadata file
+ * @shadow: shadow mapping
+ */
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+			       struct nilfs_shadow_map *shadow)
 {
-	shadow->i_mapping->assoc_mapping = orig->i_mapping;
-	NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
-		&NILFS_I(orig)->i_btnode_cache;
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
+
+	INIT_LIST_HEAD(&shadow->frozen_buffers);
+	nilfs_mapping_init_once(&shadow->frozen_data);
+	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
+	nilfs_mapping_init_once(&shadow->frozen_btnodes);
+	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
+	mi->mi_shadow = shadow;
+	return 0;
 }
 
-static void nilfs_mdt_clear(struct inode *inode)
+/**
+ * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
+ * @inode: inode of the metadata file
+ */
+int nilfs_mdt_save_to_shadow_map(struct inode *inode)
 {
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
 	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+	int ret;
 
-	invalidate_mapping_pages(inode->i_mapping, 0, -1);
-	truncate_inode_pages(inode->i_mapping, 0);
+	ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
+	if (ret)
+		goto out;
+
+	ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
+				     &ii->i_btnode_cache);
+	if (ret)
+		goto out;
 
-	if (test_bit(NILFS_I_BMAP, &ii->i_state))
-		nilfs_bmap_clear(ii->i_bmap);
-	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+	nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
+ out:
+	return ret;
 }
 
-void nilfs_mdt_destroy(struct inode *inode)
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
 {
-	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+	struct buffer_head *bh_frozen;
+	struct page *page;
+	int blkbits = inode->i_blkbits;
+	int ret = -ENOMEM;
+
+	page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
+	if (!page)
+		return ret;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << blkbits, 0);
+
+	bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
+	if (bh_frozen) {
+		if (!buffer_uptodate(bh_frozen))
+			nilfs_copy_buffer(bh_frozen, bh);
+		if (list_empty(&bh_frozen->b_assoc_buffers)) {
+			list_add_tail(&bh_frozen->b_assoc_buffers,
+				      &shadow->frozen_buffers);
+			set_buffer_nilfs_redirected(bh);
+		} else {
+			brelse(bh_frozen); /* already frozen */
+		}
+		ret = 0;
+	}
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+struct buffer_head *
+nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
+{
+	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+	struct buffer_head *bh_frozen = NULL;
+	struct page *page;
+	int n;
+
+	page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
+	if (page) {
+		if (page_has_buffers(page)) {
+			n = bh_offset(bh) >> inode->i_blkbits;
+			bh_frozen = nilfs_page_get_nth_block(page, n);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	return bh_frozen;
+}
+
+static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
+{
+	struct list_head *head = &shadow->frozen_buffers;
+	struct buffer_head *bh;
+
+	while (!list_empty(head)) {
+		bh = list_first_entry(head, struct buffer_head,
+				      b_assoc_buffers);
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh); /* drop ref-count to make it releasable */
+	}
+}
+
+/**
+ * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+
+	down_write(&mi->mi_sem);
 
-	if (mdi->mi_palloc_cache)
-		nilfs_palloc_destroy_cache(inode);
-	nilfs_mdt_clear(inode);
+	if (mi->mi_palloc_cache)
+		nilfs_palloc_clear_cache(inode);
+
+	nilfs_clear_dirty_pages(inode->i_mapping);
+	nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
+
+	nilfs_clear_dirty_pages(&ii->i_btnode_cache);
+	nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
+
+	nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
+
+	up_write(&mi->mi_sem);
+}
+
+/**
+ * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_clear_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
 
-	kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
-	kfree(mdi);
-	nilfs_destroy_inode(inode);
+	down_write(&mi->mi_sem);
+	nilfs_release_frozen_buffers(shadow);
+	truncate_inode_pages(&shadow->frozen_data, 0);
+	truncate_inode_pages(&shadow->frozen_btnodes, 0);
+	up_write(&mi->mi_sem);
 }
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 6c4bbb0470f..b13734bf352 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -28,26 +28,33 @@
 #include "nilfs.h"
 #include "page.h"
 
+struct nilfs_shadow_map {
+	struct nilfs_bmap_store bmap_store;
+	struct address_space frozen_data;
+	struct address_space frozen_btnodes;
+	struct list_head frozen_buffers;
+};
+
 /**
  * struct nilfs_mdt_info - on-memory private data of meta data files
- * @mi_nilfs: back pointer to the_nilfs struct
  * @mi_sem: reader/writer semaphore for meta data operations
  * @mi_bgl: per-blockgroup locking
  * @mi_entry_size: size of an entry
  * @mi_first_entry_offset: offset to the first entry
 * @mi_entries_per_block: number of entries in a block
 * @mi_palloc_cache: persistent object allocator cache
+ * @mi_shadow: shadow of bmap and page caches
 * @mi_blocks_per_group: number of blocks in a group
 * @mi_blocks_per_desc_block: number of blocks per descriptor block
 */
 struct nilfs_mdt_info {
-	struct the_nilfs *mi_nilfs;
 	struct rw_semaphore mi_sem;
 	struct blockgroup_lock *mi_bgl;
 	unsigned mi_entry_size;
 	unsigned mi_first_entry_offset;
 	unsigned long mi_entries_per_block;
 	struct nilfs_palloc_cache *mi_palloc_cache;
+	struct nilfs_shadow_map *mi_shadow;
 	unsigned long mi_blocks_per_group;
 	unsigned long mi_blocks_per_desc_block;
 };
@@ -59,9 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
 
 static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
-
-	return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
+	return NILFS_SB(inode->i_sb)->s_nilfs;
 }
 
 /* Default GFP flags using highmem */
@@ -76,14 +81,17 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
 
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
-			    size_t);
-struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-				   ino_t, gfp_t, size_t);
-void nilfs_mdt_destroy(struct inode *);
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
-void nilfs_mdt_set_shadow(struct inode *, struct inode *);
 
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+			       struct nilfs_shadow_map *shadow);
+int nilfs_mdt_save_to_shadow_map(struct inode *inode);
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
+void nilfs_mdt_clear_shadow_map(struct inode *inode);
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
+struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
+						struct buffer_head *bh);
 
 #define nilfs_mdt_mark_buffer_dirty(bh)	nilfs_mark_buffer_dirty(bh)
 
@@ -100,7 +108,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
 
 static inline __u64 nilfs_mdt_cno(struct inode *inode)
 {
-	return NILFS_MDT(inode)->mi_nilfs->ns_cno;
+	return NILFS_I_NILFS(inode)->ns_cno;
 }
 
 #define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ad6ed2cf19b..6e9557ecf16 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -40,7 +40,11 @@
 
 #include <linux/pagemap.h>
 #include "nilfs.h"
+#include "export.h"
 
+#define NILFS_FID_SIZE_NON_CONNECTABLE \
+	(offsetof(struct nilfs_fid, parent_gen) / 4)
+#define NILFS_FID_SIZE_CONNECTABLE	(sizeof(struct nilfs_fid) / 4)
 
 static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
@@ -70,29 +74,13 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	ino = nilfs_inode_by_name(dir, &dentry->d_name);
 	inode = NULL;
 	if (ino) {
-		inode = nilfs_iget(dir->i_sb, ino);
+		inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
 	}
 	return d_splice_alias(inode, dentry);
 }
 
-struct dentry *nilfs_get_parent(struct dentry *child)
-{
-	unsigned long ino;
-	struct inode *inode;
-	struct qstr dotdot = {.name = "..", .len = 2};
-
-	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
-	if (!ino)
-		return ERR_PTR(-ENOENT);
-
-	inode = nilfs_iget(child->d_inode->i_sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	return d_obtain_alias(inode);
-}
-
 /*
  * By the time this is called, we already have created
  * the directory cache entry for the new file, but it
@@ -219,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
 
 	inode->i_ctime = CURRENT_TIME;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = nilfs_add_nondir(dentry, inode);
 	if (!err)
@@ -468,6 +456,115 @@ out:
 	return err;
 }
 
+/*
+ * Export operations
+ */
+static struct dentry *nilfs_get_parent(struct dentry *child)
+{
+	unsigned long ino;
+	struct inode *inode;
+	struct qstr dotdot = {.name = "..", .len = 2};
+	struct nilfs_root *root;
+
+	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+
+	root = NILFS_I(child->d_inode)->i_root;
+
+	inode = nilfs_iget(child->d_inode->i_sb, root, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
+				       u64 ino, u32 gen)
+{
+	struct nilfs_root *root;
+	struct inode *inode;
+
+	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+
+	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+	if (!root)
+		return ERR_PTR(-ESTALE);
+
+	inode = nilfs_iget(sb, root, ino);
+	nilfs_put_root(root);
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (gen && inode->i_generation != gen) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
+	     fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+	    (fh_type != FILEID_NILFS_WITH_PARENT &&
+	     fh_type != FILEID_NILFS_WITHOUT_PARENT))
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen);
+}
+
+static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+	    fh_type != FILEID_NILFS_WITH_PARENT)
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
+}
+
+static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
+			   int connectable)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+	struct inode *inode = dentry->d_inode;
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+	int type;
+
+	if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
+	    (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
+		return 255;
+
+	fid->cno = root->cno;
+	fid->ino = inode->i_ino;
+	fid->gen = inode->i_generation;
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+		parent = dentry->d_parent->d_inode;
+		fid->parent_ino = parent->i_ino;
+		fid->parent_gen = parent->i_generation;
+		spin_unlock(&dentry->d_lock);
+
+		type = FILEID_NILFS_WITH_PARENT;
+		*lenp = NILFS_FID_SIZE_CONNECTABLE;
+	} else {
+		type = FILEID_NILFS_WITHOUT_PARENT;
+		*lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
+	}
+
+	return type;
+}
+
 const struct inode_operations nilfs_dir_inode_operations = {
 	.create		= nilfs_create,
 	.lookup		= nilfs_lookup,
@@ -491,4 +588,12 @@ const struct inode_operations nilfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.permission	= nilfs_permission,
+};
+
+const struct export_operations nilfs_export_ops = {
+	.encode_fh = nilfs_encode_fh,
+	.fh_to_dentry = nilfs_fh_to_dentry,
+	.fh_to_parent = nilfs_fh_to_parent,
+	.get_parent = nilfs_get_parent,
 };
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 47d6d792812..f7560da5a56 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -32,7 +32,6 @@
 #include "the_nilfs.h"
 #include "sb.h"
 #include "bmap.h"
-#include "bmap_union.h"
 
 /*
  * nilfs inode data in memory
@@ -41,7 +40,7 @@ struct nilfs_inode_info {
 	__u32 i_flags;
 	unsigned long  i_state;		/* Dynamic state flags */
 	struct nilfs_bmap *i_bmap;
-	union nilfs_bmap_union i_bmap_union;
+	struct nilfs_bmap i_bmap_data;
 	__u64 i_xattr;	/* sector_t ??? */
 	__u32 i_dir_start_lookup;
 	__u64 i_cno;		/* check point number for GC inode */
@@ -60,6 +59,7 @@ struct nilfs_inode_info {
 #endif
 	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
 					   disk inode */
+	struct nilfs_root *i_root;
 	struct inode vfs_inode;
 };
 
@@ -71,9 +71,7 @@ static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
 static inline struct nilfs_inode_info *
 NILFS_BMAP_I(const struct nilfs_bmap *bmap)
 {
-	return container_of((union nilfs_bmap_union *)bmap,
-			    struct nilfs_inode_info,
-			    i_bmap_union);
+	return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
 }
 
 static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
@@ -103,7 +101,14 @@ enum {
 	NILFS_I_INODE_DIRTY,		/* write_inode is requested */
 	NILFS_I_BMAP,			/* has bmap and btnode_cache */
 	NILFS_I_GCINODE,		/* inode for GC, on memory only */
-	NILFS_I_GCDAT,			/* shadow DAT, on memory only */
+};
+
+/*
+ * commit flags for nilfs_commit_super and nilfs_sync_super
+ */
+enum {
+	NILFS_SB_COMMIT = 0,	/* Commit a super block alternately */
+	NILFS_SB_COMMIT_ALL	/* Commit both super blocks */
 };
 
 /*
@@ -187,7 +192,7 @@ static inline int nilfs_doing_construction(void)
 
 static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
 {
-	return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
+	return nilfs->ns_dat;
 }
 
 /*
@@ -195,12 +200,9 @@ static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
 */
 #ifdef CONFIG_NILFS_POSIX_ACL
 #error "NILFS: not yet supported POSIX ACL"
-extern int nilfs_permission(struct inode *, int, struct nameidata *);
 extern int nilfs_acl_chmod(struct inode *);
 extern int nilfs_init_acl(struct inode *, struct inode *);
 #else
-#define nilfs_permission	NULL
-
 static inline int nilfs_acl_chmod(struct inode *inode)
 {
 	return 0;
@@ -242,11 +244,19 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
 extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
-extern struct inode *nilfs_iget(struct super_block *, unsigned long);
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+			    unsigned long ino);
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+				unsigned long ino);
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+			 unsigned long ino);
+extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
+				       unsigned long ino, __u64 cno);
 extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
-extern void nilfs_delete_inode(struct inode *);
+extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
+int nilfs_permission(struct inode *inode, int mask);
 extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
 				  struct buffer_head **);
 extern int nilfs_inode_dirty(struct inode *);
@@ -255,11 +265,7 @@ extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
 extern int nilfs_mark_inode_dirty(struct inode *);
 extern void nilfs_dirty_inode(struct inode *);
 
-/* namei.c */
-extern struct dentry *nilfs_get_parent(struct dentry *);
-
 /* super.c */
-extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
@@ -270,9 +276,17 @@ extern struct nilfs_super_block *
 nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
 extern int nilfs_store_magic_and_option(struct super_block *,
 					struct nilfs_super_block *, char *);
+extern int nilfs_check_feature_compatibility(struct super_block *,
+					     struct nilfs_super_block *);
+extern void nilfs_set_log_cursor(struct nilfs_super_block *,
+				 struct the_nilfs *);
+extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
+						      int flip);
 extern int nilfs_commit_super(struct nilfs_sb_info *, int);
-extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
-extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
+extern int nilfs_cleanup_super(struct nilfs_sb_info *);
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+			    struct nilfs_root **root);
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
 
 /* gcinode.c */
 int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
@@ -280,16 +294,8 @@ int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
 int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
 				   struct buffer_head **);
 int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
-int nilfs_init_gccache(struct the_nilfs *);
-void nilfs_destroy_gccache(struct the_nilfs *);
-void nilfs_clear_gcinode(struct inode *);
-struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
-void nilfs_remove_all_gcinode(struct the_nilfs *);
-
-/* gcdat.c */
-int nilfs_init_gcdat_inode(struct the_nilfs *);
-void nilfs_commit_gcdat_inode(struct the_nilfs *);
-void nilfs_clear_gcdat_inode(struct the_nilfs *);
+int nilfs_init_gcinode(struct inode *inode);
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
 
 /*
  * Inodes and files operations
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 8de3e1e4813..a6c3c2e817f 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -37,7 +37,8 @@
 
 #define NILFS_BUFFER_INHERENT_BITS  \
 	((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
-	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
+	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \
+	 (1UL << BH_NILFS_Checked))
 
 static struct buffer_head *
 __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -78,8 +79,8 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 {
 	int blkbits = inode->i_blkbits;
 	pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
-	struct page *page, *opage;
-	struct buffer_head *bh, *obh;
+	struct page *page;
+	struct buffer_head *bh;
 
 	page = grab_cache_page(mapping, index);
 	if (unlikely(!page))
@@ -91,30 +92,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 		page_cache_release(page);
 		return NULL;
 	}
-	if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
-		/*
-		 * Shadow page cache uses assoc_mapping to point its original
-		 * page cache. The following code tries the original cache
-		 * if the given cache is a shadow and it didn't hit.
-		 */
-		opage = find_lock_page(mapping->assoc_mapping, index);
-		if (!opage)
-			return bh;
-
-		obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
-					     b_state);
-		if (buffer_uptodate(obh)) {
-			nilfs_copy_buffer(bh, obh);
-			if (buffer_dirty(obh)) {
-				nilfs_mark_buffer_dirty(bh);
-				if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
-					nilfs_mdt_mark_dirty(inode);
-			}
-		}
-		brelse(obh);
-		unlock_page(opage);
-		page_cache_release(opage);
-	}
 	return bh;
 }
 
@@ -129,6 +106,8 @@ void nilfs_forget_buffer(struct buffer_head *bh)
 
 	lock_buffer(bh);
 	clear_buffer_nilfs_volatile(bh);
+	clear_buffer_nilfs_checked(bh);
+	clear_buffer_nilfs_redirected(bh);
 	clear_buffer_dirty(bh);
 	if (nilfs_page_buffers_clean(page))
 		__nilfs_clear_page_dirty(page);
@@ -480,6 +459,8 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
 			lock_buffer(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
+			clear_buffer_nilfs_checked(bh);
+			clear_buffer_nilfs_redirected(bh);
 			clear_buffer_uptodate(bh);
 			clear_buffer_mapped(bh);
 			unlock_buffer(bh);
@@ -510,6 +491,31 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
 	}
 	return nc;
 }
+
+void nilfs_mapping_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+}
+
+void nilfs_mapping_init(struct address_space *mapping,
+			struct backing_dev_info *bdi,
+			const struct address_space_operations *aops)
+{
+	mapping->host = NULL;
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	mapping->assoc_mapping = NULL;
+	mapping->backing_dev_info = bdi;
+	mapping->a_ops = aops;
+}
 
 /*
  * NILFS2 needs clear_page_dirty() in the following two cases:
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 8abca4d1c1f..fb9e8a8a203 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -34,11 +34,15 @@ enum {
 	BH_NILFS_Allocated = BH_PrivateStart,
 	BH_NILFS_Node,
 	BH_NILFS_Volatile,
+	BH_NILFS_Checked,
+	BH_NILFS_Redirected,
 };
 
 BUFFER_FNS(NILFS_Allocated, nilfs_allocated)	/* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
+BUFFER_FNS(NILFS_Checked, nilfs_checked)	/* buffer is verified */
+BUFFER_FNS(NILFS_Redirected, nilfs_redirected)	/* redirected to a copy */
 
 
 void nilfs_mark_buffer_dirty(struct buffer_head *bh);
@@ -57,6 +61,10 @@ void nilfs_free_private_page(struct page *);
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
+void nilfs_mapping_init_once(struct address_space *mapping);
+void nilfs_mapping_init(struct address_space *mapping,
+			struct backing_dev_info *bdi,
+			const struct address_space_operations *aops);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 
 #define NILFS_PAGE_BUG(page, m, a...) \
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index bae2a516b4e..5d2711c28da 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -91,27 +91,9 @@ static int nilfs_warn_segment_error(int err)
91 return -EINVAL; 91 return -EINVAL;
92} 92}
93 93
94static void store_segsum_info(struct nilfs_segsum_info *ssi,
95 struct nilfs_segment_summary *sum,
96 unsigned int blocksize)
97{
98 ssi->flags = le16_to_cpu(sum->ss_flags);
99 ssi->seg_seq = le64_to_cpu(sum->ss_seq);
100 ssi->ctime = le64_to_cpu(sum->ss_create);
101 ssi->next = le64_to_cpu(sum->ss_next);
102 ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
103 ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
104 ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
105
106 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
107 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
108
109 /* need to verify ->ss_bytes field if read ->ss_cno */
110}
111
112/** 94/**
113 * calc_crc_cont - check CRC of blocks continuously 95 * nilfs_compute_checksum - compute checksum of blocks continuously
114 * @sbi: nilfs_sb_info 96 * @nilfs: nilfs object
115 * @bhs: buffer head of start block 97 * @bhs: buffer head of start block
116 * @sum: place to store result 98 * @sum: place to store result
117 * @offset: offset bytes in the first block 99 * @offset: offset bytes in the first block
@@ -119,23 +101,25 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
119 * @start: DBN of start block 101 * @start: DBN of start block
120 * @nblock: number of blocks to be checked 102 * @nblock: number of blocks to be checked
121 */ 103 */
122static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs, 104static int nilfs_compute_checksum(struct the_nilfs *nilfs,
123 u32 *sum, unsigned long offset, u64 check_bytes, 105 struct buffer_head *bhs, u32 *sum,
124 sector_t start, unsigned long nblock) 106 unsigned long offset, u64 check_bytes,
107 sector_t start, unsigned long nblock)
125{ 108{
126 unsigned long blocksize = sbi->s_super->s_blocksize; 109 unsigned int blocksize = nilfs->ns_blocksize;
127 unsigned long size; 110 unsigned long size;
128 u32 crc; 111 u32 crc;
129 112
130 BUG_ON(offset >= blocksize); 113 BUG_ON(offset >= blocksize);
131 check_bytes -= offset; 114 check_bytes -= offset;
132 size = min_t(u64, check_bytes, blocksize - offset); 115 size = min_t(u64, check_bytes, blocksize - offset);
133 crc = crc32_le(sbi->s_nilfs->ns_crc_seed, 116 crc = crc32_le(nilfs->ns_crc_seed,
134 (unsigned char *)bhs->b_data + offset, size); 117 (unsigned char *)bhs->b_data + offset, size);
135 if (--nblock > 0) { 118 if (--nblock > 0) {
136 do { 119 do {
137 struct buffer_head *bh 120 struct buffer_head *bh;
138 = sb_bread(sbi->s_super, ++start); 121
122 bh = __bread(nilfs->ns_bdev, ++start, blocksize);
139 if (!bh) 123 if (!bh)
140 return -EIO; 124 return -EIO;
141 check_bytes -= size; 125 check_bytes -= size;
@@ -150,12 +134,12 @@ static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
150 134
151/** 135/**
152 * nilfs_read_super_root_block - read super root block 136 * nilfs_read_super_root_block - read super root block
153 * @sb: super_block 137 * @nilfs: nilfs object
154 * @sr_block: disk block number of the super root block 138 * @sr_block: disk block number of the super root block
155 * @pbh: address of a buffer_head pointer to return super root buffer 139 * @pbh: address of a buffer_head pointer to return super root buffer
156 * @check: CRC check flag 140 * @check: CRC check flag
157 */ 141 */
158int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, 142int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
159 struct buffer_head **pbh, int check) 143 struct buffer_head **pbh, int check)
160{ 144{
161 struct buffer_head *bh_sr; 145 struct buffer_head *bh_sr;
@@ -164,7 +148,7 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
164 int ret; 148 int ret;
165 149
166 *pbh = NULL; 150 *pbh = NULL;
167 bh_sr = sb_bread(sb, sr_block); 151 bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize);
168 if (unlikely(!bh_sr)) { 152 if (unlikely(!bh_sr)) {
169 ret = NILFS_SEG_FAIL_IO; 153 ret = NILFS_SEG_FAIL_IO;
170 goto failed; 154 goto failed;
@@ -174,12 +158,13 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
174 if (check) { 158 if (check) {
175 unsigned bytes = le16_to_cpu(sr->sr_bytes); 159 unsigned bytes = le16_to_cpu(sr->sr_bytes);
176 160
177 if (bytes == 0 || bytes > sb->s_blocksize) { 161 if (bytes == 0 || bytes > nilfs->ns_blocksize) {
178 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; 162 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
179 goto failed_bh; 163 goto failed_bh;
180 } 164 }
181 if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc, 165 if (nilfs_compute_checksum(
182 sizeof(sr->sr_sum), bytes, sr_block, 1)) { 166 nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes,
167 sr_block, 1)) {
183 ret = NILFS_SEG_FAIL_IO; 168 ret = NILFS_SEG_FAIL_IO;
184 goto failed_bh; 169 goto failed_bh;
185 } 170 }
@@ -199,64 +184,76 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
199} 184}
200 185
201/** 186/**
202 * load_segment_summary - read segment summary of the specified partial segment 187 * nilfs_read_log_header - read summary header of the specified log
203 * @sbi: nilfs_sb_info 188 * @nilfs: nilfs object
204 * @pseg_start: start disk block number of partial segment 189 * @start_blocknr: start block number of the log
205 * @seg_seq: sequence number requested 190 * @sum: pointer to return segment summary structure
206 * @ssi: pointer to nilfs_segsum_info struct to store information
207 */ 191 */
208static int 192static struct buffer_head *
209load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, 193nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr,
210 u64 seg_seq, struct nilfs_segsum_info *ssi) 194 struct nilfs_segment_summary **sum)
211{ 195{
212 struct buffer_head *bh_sum; 196 struct buffer_head *bh_sum;
213 struct nilfs_segment_summary *sum; 197
198 bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
199 if (bh_sum)
200 *sum = (struct nilfs_segment_summary *)bh_sum->b_data;
201 return bh_sum;
202}
203
204/**
205 * nilfs_validate_log - verify consistency of log
206 * @nilfs: nilfs object
207 * @seg_seq: sequence number of segment
208 * @bh_sum: buffer head of summary block
209 * @sum: segment summary struct
210 */
211static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq,
212 struct buffer_head *bh_sum,
213 struct nilfs_segment_summary *sum)
214{
214 unsigned long nblock; 215 unsigned long nblock;
215 u32 crc; 216 u32 crc;
216 int ret = NILFS_SEG_FAIL_IO; 217 int ret;
217 218
218 bh_sum = sb_bread(sbi->s_super, pseg_start); 219 ret = NILFS_SEG_FAIL_MAGIC;
219 if (!bh_sum) 220 if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC)
220 goto out; 221 goto out;
221 222
222 sum = (struct nilfs_segment_summary *)bh_sum->b_data; 223 ret = NILFS_SEG_FAIL_SEQ;
223 224 if (le64_to_cpu(sum->ss_seq) != seg_seq)
224 /* Check consistency of segment summary */ 225 goto out;
225 if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
226 ret = NILFS_SEG_FAIL_MAGIC;
227 goto failed;
228 }
229 store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
230 if (seg_seq != ssi->seg_seq) {
231 ret = NILFS_SEG_FAIL_SEQ;
232 goto failed;
233 }
234 226
235 nblock = ssi->nblocks; 227 nblock = le32_to_cpu(sum->ss_nblocks);
236 if (unlikely(nblock == 0 || 228 ret = NILFS_SEG_FAIL_CONSISTENCY;
237 nblock > sbi->s_nilfs->ns_blocks_per_segment)) { 229 if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment))
238 /* This limits the number of blocks read in the CRC check */ 230 /* This limits the number of blocks read in the CRC check */
239 ret = NILFS_SEG_FAIL_CONSISTENCY; 231 goto out;
240 goto failed; 232
241 } 233 ret = NILFS_SEG_FAIL_IO;
242 if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum), 234 if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum),
243 ((u64)nblock << sbi->s_super->s_blocksize_bits), 235 ((u64)nblock << nilfs->ns_blocksize_bits),
244 pseg_start, nblock)) { 236 bh_sum->b_blocknr, nblock))
245 ret = NILFS_SEG_FAIL_IO; 237 goto out;
246 goto failed; 238
247 } 239 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
248 if (crc == le32_to_cpu(sum->ss_datasum)) 240 if (crc != le32_to_cpu(sum->ss_datasum))
249 ret = 0; 241 goto out;
250 else 242 ret = 0;
251 ret = NILFS_SEG_FAIL_CHECKSUM_FULL; 243out:
252 failed:
253 brelse(bh_sum);
254 out:
255 return ret; 244 return ret;
256} 245}
257 246
258static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, 247/**
259 unsigned int *offset, unsigned int bytes) 248 * nilfs_read_summary_info - read an item on summary blocks of a log
249 * @nilfs: nilfs object
250 * @pbh: the current buffer head on summary blocks [in, out]
251 * @offset: the current byte offset on summary blocks [in, out]
252 * @bytes: byte size of the item to be read
253 */
254static void *nilfs_read_summary_info(struct the_nilfs *nilfs,
255 struct buffer_head **pbh,
256 unsigned int *offset, unsigned int bytes)
260{ 257{
261 void *ptr; 258 void *ptr;
262 sector_t blocknr; 259 sector_t blocknr;
@@ -265,7 +262,8 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
265 if (bytes > (*pbh)->b_size - *offset) { 262 if (bytes > (*pbh)->b_size - *offset) {
266 blocknr = (*pbh)->b_blocknr; 263 blocknr = (*pbh)->b_blocknr;
267 brelse(*pbh); 264 brelse(*pbh);
268 *pbh = sb_bread(sb, blocknr + 1); 265 *pbh = __bread(nilfs->ns_bdev, blocknr + 1,
266 nilfs->ns_blocksize);
269 if (unlikely(!*pbh)) 267 if (unlikely(!*pbh))
270 return NULL; 268 return NULL;
271 *offset = 0; 269 *offset = 0;
@@ -275,9 +273,18 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
275 return ptr; 273 return ptr;
276} 274}
277 275
278static void segsum_skip(struct super_block *sb, struct buffer_head **pbh, 276/**
279 unsigned int *offset, unsigned int bytes, 277 * nilfs_skip_summary_info - skip items on summary blocks of a log
280 unsigned long count) 278 * @nilfs: nilfs object
279 * @pbh: the current buffer head on summary blocks [in, out]
280 * @offset: the current byte offset on summary blocks [in, out]
281 * @bytes: byte size of the item to be skipped
282 * @count: number of items to be skipped
283 */
284static void nilfs_skip_summary_info(struct the_nilfs *nilfs,
285 struct buffer_head **pbh,
286 unsigned int *offset, unsigned int bytes,
287 unsigned long count)
281{ 288{
282 unsigned int rest_item_in_current_block 289 unsigned int rest_item_in_current_block
283 = ((*pbh)->b_size - *offset) / bytes; 290 = ((*pbh)->b_size - *offset) / bytes;
@@ -294,36 +301,46 @@ static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
294 *offset = bytes * (count - (bcnt - 1) * nitem_per_block); 301 *offset = bytes * (count - (bcnt - 1) * nitem_per_block);
295 302
296 brelse(*pbh); 303 brelse(*pbh);
297 *pbh = sb_bread(sb, blocknr + bcnt); 304 *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt,
305 nilfs->ns_blocksize);
298 } 306 }
299} 307}
300 308
301static int 309/**
302collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, 310 * nilfs_scan_dsync_log - get block information of a log written for data sync
303 struct nilfs_segsum_info *ssi, 311 * @nilfs: nilfs object
304 struct list_head *head) 312 * @start_blocknr: start block number of the log
313 * @sum: log summary information
314 * @head: list head to add nilfs_recovery_block struct
315 */
316static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
317 struct nilfs_segment_summary *sum,
318 struct list_head *head)
305{ 319{
306 struct buffer_head *bh; 320 struct buffer_head *bh;
307 unsigned int offset; 321 unsigned int offset;
308 unsigned long nfinfo = ssi->nfinfo; 322 u32 nfinfo, sumbytes;
309 sector_t blocknr = sum_blocknr + ssi->nsumblk; 323 sector_t blocknr;
310 ino_t ino; 324 ino_t ino;
311 int err = -EIO; 325 int err = -EIO;
312 326
327 nfinfo = le32_to_cpu(sum->ss_nfinfo);
313 if (!nfinfo) 328 if (!nfinfo)
314 return 0; 329 return 0;
315 330
316 bh = sb_bread(sbi->s_super, sum_blocknr); 331 sumbytes = le32_to_cpu(sum->ss_sumbytes);
332 blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize);
333 bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
317 if (unlikely(!bh)) 334 if (unlikely(!bh))
318 goto out; 335 goto out;
319 336
320 offset = le16_to_cpu( 337 offset = le16_to_cpu(sum->ss_bytes);
321 ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
322 for (;;) { 338 for (;;) {
323 unsigned long nblocks, ndatablk, nnodeblk; 339 unsigned long nblocks, ndatablk, nnodeblk;
324 struct nilfs_finfo *finfo; 340 struct nilfs_finfo *finfo;
325 341
326 finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo)); 342 finfo = nilfs_read_summary_info(nilfs, &bh, &offset,
343 sizeof(*finfo));
327 if (unlikely(!finfo)) 344 if (unlikely(!finfo))
328 goto out; 345 goto out;
329 346
@@ -336,8 +353,8 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
336 struct nilfs_recovery_block *rb; 353 struct nilfs_recovery_block *rb;
337 struct nilfs_binfo_v *binfo; 354 struct nilfs_binfo_v *binfo;
338 355
339 binfo = segsum_get(sbi->s_super, &bh, &offset, 356 binfo = nilfs_read_summary_info(nilfs, &bh, &offset,
340 sizeof(*binfo)); 357 sizeof(*binfo));
341 if (unlikely(!binfo)) 358 if (unlikely(!binfo))
342 goto out; 359 goto out;
343 360
@@ -355,9 +372,9 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
355 } 372 }
356 if (--nfinfo == 0) 373 if (--nfinfo == 0)
357 break; 374 break;
358 blocknr += nnodeblk; /* always 0 for the data sync segments */ 375 blocknr += nnodeblk; /* always 0 for data sync logs */
359 segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64), 376 nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64),
360 nnodeblk); 377 nnodeblk);
361 if (unlikely(!bh)) 378 if (unlikely(!bh))
362 goto out; 379 goto out;
363 } 380 }
@@ -423,7 +440,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
423 segnum[2] = ri->ri_segnum; 440 segnum[2] = ri->ri_segnum;
424 segnum[3] = ri->ri_nextnum; 441 segnum[3] = ri->ri_nextnum;
425 442
426 nilfs_attach_writer(nilfs, sbi);
427 /* 443 /*
428 * Releasing the next segment of the latest super root. 444 * Releasing the next segment of the latest super root.
429 * The next segment is invalidated by this recovery. 445 * The next segment is invalidated by this recovery.
@@ -463,18 +479,17 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
463 479
464 failed: 480 failed:
465 /* No need to recover sufile because it will be destroyed on error */ 481 /* No need to recover sufile because it will be destroyed on error */
466 nilfs_detach_writer(nilfs, sbi);
467 return err; 482 return err;
468} 483}
469 484
470static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi, 485static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
471 struct nilfs_recovery_block *rb, 486 struct nilfs_recovery_block *rb,
472 struct page *page) 487 struct page *page)
473{ 488{
474 struct buffer_head *bh_org; 489 struct buffer_head *bh_org;
475 void *kaddr; 490 void *kaddr;
476 491
477 bh_org = sb_bread(sbi->s_super, rb->blocknr); 492 bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
478 if (unlikely(!bh_org)) 493 if (unlikely(!bh_org))
479 return -EIO; 494 return -EIO;
480 495
@@ -485,19 +500,21 @@ static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
485 return 0; 500 return 0;
486} 501}
487 502
488static int recover_dsync_blocks(struct nilfs_sb_info *sbi, 503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
489 struct list_head *head, 504 struct nilfs_sb_info *sbi,
490 unsigned long *nr_salvaged_blocks) 505 struct nilfs_root *root,
506 struct list_head *head,
507 unsigned long *nr_salvaged_blocks)
491{ 508{
492 struct inode *inode; 509 struct inode *inode;
493 struct nilfs_recovery_block *rb, *n; 510 struct nilfs_recovery_block *rb, *n;
494 unsigned blocksize = sbi->s_super->s_blocksize; 511 unsigned blocksize = nilfs->ns_blocksize;
495 struct page *page; 512 struct page *page;
496 loff_t pos; 513 loff_t pos;
497 int err = 0, err2 = 0; 514 int err = 0, err2 = 0;
498 515
499 list_for_each_entry_safe(rb, n, head, list) { 516 list_for_each_entry_safe(rb, n, head, list) {
500 inode = nilfs_iget(sbi->s_super, rb->ino); 517 inode = nilfs_iget(sbi->s_super, root, rb->ino);
501 if (IS_ERR(inode)) { 518 if (IS_ERR(inode)) {
502 err = PTR_ERR(inode); 519 err = PTR_ERR(inode);
503 inode = NULL; 520 inode = NULL;
@@ -505,13 +522,16 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
505 } 522 }
506 523
507 pos = rb->blkoff << inode->i_blkbits; 524 pos = rb->blkoff << inode->i_blkbits;
508 page = NULL; 525 err = block_write_begin(inode->i_mapping, pos, blocksize,
509 err = block_write_begin(NULL, inode->i_mapping, pos, blocksize, 526 0, &page, nilfs_get_block);
510 0, &page, NULL, nilfs_get_block); 527 if (unlikely(err)) {
511 if (unlikely(err)) 528 loff_t isize = inode->i_size;
529 if (pos + blocksize > isize)
530 vmtruncate(inode, isize);
512 goto failed_inode; 531 goto failed_inode;
532 }
513 533
514 err = nilfs_recovery_copy_block(sbi, rb, page); 534 err = nilfs_recovery_copy_block(nilfs, rb, page);
515 if (unlikely(err)) 535 if (unlikely(err))
516 goto failed_page; 536 goto failed_page;
517 537
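Note on the hunk above: it tracks a VFS API change. block_write_begin() lost its file and fsdata arguments and no longer truncates blocks instantiated past i_size on failure, so the caller now performs that cleanup itself, which is why the vmtruncate() call appears here. A minimal sketch of the new calling convention; write_one_block() is a made-up wrapper for illustration, not code from this patch:

	static int write_one_block(struct inode *inode, loff_t pos,
				   unsigned blocksize, get_block_t *get_block)
	{
		struct page *page;
		int err;

		err = block_write_begin(inode->i_mapping, pos, blocksize, 0,
					&page, get_block);
		if (err) {
			/* undo any block allocated beyond the old size */
			if (pos + blocksize > inode->i_size)
				vmtruncate(inode, inode->i_size);
			return err;
		}
		/* ... fill the page here ... */
		block_write_end(NULL, inode->i_mapping, pos, blocksize,
				blocksize, page, NULL);
		unlock_page(page);
		page_cache_release(page);
		return 0;
	}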
@@ -551,18 +571,21 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
551/** 571/**
552 * nilfs_do_roll_forward - salvage logical segments newer than the latest 572 * nilfs_do_roll_forward - salvage logical segments newer than the latest
553 * checkpoint 573 * checkpoint
574 * @nilfs: nilfs object
554 * @sbi: nilfs_sb_info 575 * @sbi: nilfs_sb_info
555 * @nilfs: the_nilfs
556 * @ri: pointer to a nilfs_recovery_info 576 * @ri: pointer to a nilfs_recovery_info
557 */ 577 */
558static int nilfs_do_roll_forward(struct the_nilfs *nilfs, 578static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
559 struct nilfs_sb_info *sbi, 579 struct nilfs_sb_info *sbi,
580 struct nilfs_root *root,
560 struct nilfs_recovery_info *ri) 581 struct nilfs_recovery_info *ri)
561{ 582{
562 struct nilfs_segsum_info ssi; 583 struct buffer_head *bh_sum = NULL;
584 struct nilfs_segment_summary *sum;
563 sector_t pseg_start; 585 sector_t pseg_start;
564 sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ 586 sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
565 unsigned long nsalvaged_blocks = 0; 587 unsigned long nsalvaged_blocks = 0;
588 unsigned int flags;
566 u64 seg_seq; 589 u64 seg_seq;
567 __u64 segnum, nextnum = 0; 590 __u64 segnum, nextnum = 0;
568 int empty_seg = 0; 591 int empty_seg = 0;
@@ -574,15 +597,20 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
574 }; 597 };
575 int state = RF_INIT_ST; 598 int state = RF_INIT_ST;
576 599
577 nilfs_attach_writer(nilfs, sbi);
578 pseg_start = ri->ri_lsegs_start; 600 pseg_start = ri->ri_lsegs_start;
579 seg_seq = ri->ri_lsegs_start_seq; 601 seg_seq = ri->ri_lsegs_start_seq;
580 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); 602 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
581 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); 603 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
582 604
583 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { 605 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
606 brelse(bh_sum);
607 bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
608 if (!bh_sum) {
609 err = -EIO;
610 goto failed;
611 }
584 612
585 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi); 613 ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
586 if (ret) { 614 if (ret) {
587 if (ret == NILFS_SEG_FAIL_IO) { 615 if (ret == NILFS_SEG_FAIL_IO) {
588 err = -EIO; 616 err = -EIO;
@@ -590,33 +618,38 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
590 } 618 }
591 goto strayed; 619 goto strayed;
592 } 620 }
593 if (unlikely(NILFS_SEG_HAS_SR(&ssi))) 621
622 flags = le16_to_cpu(sum->ss_flags);
623 if (flags & NILFS_SS_SR)
594 goto confused; 624 goto confused;
595 625
596 /* Found a valid partial segment; do recovery actions */ 626 /* Found a valid partial segment; do recovery actions */
597 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); 627 nextnum = nilfs_get_segnum_of_block(nilfs,
628 le64_to_cpu(sum->ss_next));
598 empty_seg = 0; 629 empty_seg = 0;
599 nilfs->ns_ctime = ssi.ctime; 630 nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
600 if (!(ssi.flags & NILFS_SS_GC)) 631 if (!(flags & NILFS_SS_GC))
601 nilfs->ns_nongc_ctime = ssi.ctime; 632 nilfs->ns_nongc_ctime = nilfs->ns_ctime;
602 633
603 switch (state) { 634 switch (state) {
604 case RF_INIT_ST: 635 case RF_INIT_ST:
605 if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi)) 636 if (!(flags & NILFS_SS_LOGBGN) ||
637 !(flags & NILFS_SS_SYNDT))
606 goto try_next_pseg; 638 goto try_next_pseg;
607 state = RF_DSYNC_ST; 639 state = RF_DSYNC_ST;
608 /* Fall through */ 640 /* Fall through */
609 case RF_DSYNC_ST: 641 case RF_DSYNC_ST:
610 if (!NILFS_SEG_DSYNC(&ssi)) 642 if (!(flags & NILFS_SS_SYNDT))
611 goto confused; 643 goto confused;
612 644
613 err = collect_blocks_from_segsum( 645 err = nilfs_scan_dsync_log(nilfs, pseg_start, sum,
614 sbi, pseg_start, &ssi, &dsync_blocks); 646 &dsync_blocks);
615 if (unlikely(err)) 647 if (unlikely(err))
616 goto failed; 648 goto failed;
617 if (NILFS_SEG_LOGEND(&ssi)) { 649 if (flags & NILFS_SS_LOGEND) {
618 err = recover_dsync_blocks( 650 err = nilfs_recover_dsync_blocks(
619 sbi, &dsync_blocks, &nsalvaged_blocks); 651 nilfs, sbi, root, &dsync_blocks,
652 &nsalvaged_blocks);
620 if (unlikely(err)) 653 if (unlikely(err))
621 goto failed; 654 goto failed;
622 state = RF_INIT_ST; 655 state = RF_INIT_ST;
@@ -627,7 +660,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
627 try_next_pseg: 660 try_next_pseg:
628 if (pseg_start == ri->ri_lsegs_end) 661 if (pseg_start == ri->ri_lsegs_end)
629 break; 662 break;
630 pseg_start += ssi.nblocks; 663 pseg_start += le32_to_cpu(sum->ss_nblocks);
631 if (pseg_start < seg_end) 664 if (pseg_start < seg_end)
632 continue; 665 continue;
633 goto feed_segment; 666 goto feed_segment;
@@ -652,8 +685,8 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
652 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; 685 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
653 } 686 }
654 out: 687 out:
688 brelse(bh_sum);
655 dispose_recovery_list(&dsync_blocks); 689 dispose_recovery_list(&dsync_blocks);
656 nilfs_detach_writer(sbi->s_nilfs, sbi);
657 return err; 690 return err;
658 691
659 confused: 692 confused:
@@ -667,7 +700,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
667} 700}
668 701
669static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, 702static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
670 struct nilfs_sb_info *sbi,
671 struct nilfs_recovery_info *ri) 703 struct nilfs_recovery_info *ri)
672{ 704{
673 struct buffer_head *bh; 705 struct buffer_head *bh;
@@ -677,7 +709,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
677 nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) 709 nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
678 return; 710 return;
679 711
680 bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start); 712 bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize);
681 BUG_ON(!bh); 713 BUG_ON(!bh);
682 memset(bh->b_data, 0, bh->b_size); 714 memset(bh->b_data, 0, bh->b_size);
683 set_buffer_dirty(bh); 715 set_buffer_dirty(bh);
@@ -690,9 +722,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
690} 722}
691 723
692/** 724/**
693 * nilfs_recover_logical_segments - salvage logical segments written after 725 * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
694 * the latest super root 726 * @nilfs: nilfs object
695 * @nilfs: the_nilfs
696 * @sbi: nilfs_sb_info 727 * @sbi: nilfs_sb_info
697 * @ri: pointer to a nilfs_recovery_info struct to store search results. 728 * @ri: pointer to a nilfs_recovery_info struct to store search results.
698 * 729 *
@@ -709,23 +740,24 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
709 * 740 *
710 * %-ENOMEM - Insufficient memory available. 741 * %-ENOMEM - Insufficient memory available.
711 */ 742 */
712int nilfs_recover_logical_segments(struct the_nilfs *nilfs, 743int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
713 struct nilfs_sb_info *sbi, 744 struct nilfs_sb_info *sbi,
714 struct nilfs_recovery_info *ri) 745 struct nilfs_recovery_info *ri)
715{ 746{
747 struct nilfs_root *root;
716 int err; 748 int err;
717 749
718 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) 750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
719 return 0; 751 return 0;
720 752
721 err = nilfs_attach_checkpoint(sbi, ri->ri_cno); 753 err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
722 if (unlikely(err)) { 754 if (unlikely(err)) {
723 printk(KERN_ERR 755 printk(KERN_ERR
724 "NILFS: error loading the latest checkpoint.\n"); 756 "NILFS: error loading the latest checkpoint.\n");
725 return err; 757 return err;
726 } 758 }
727 759
728 err = nilfs_do_roll_forward(nilfs, sbi, ri); 760 err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
729 if (unlikely(err)) 761 if (unlikely(err))
730 goto failed; 762 goto failed;
731 763
@@ -737,7 +769,7 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
737 goto failed; 769 goto failed;
738 } 770 }
739 771
740 err = nilfs_attach_segment_constructor(sbi); 772 err = nilfs_attach_segment_constructor(sbi, root);
741 if (unlikely(err)) 773 if (unlikely(err))
742 goto failed; 774 goto failed;
743 775
@@ -751,18 +783,17 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
751 goto failed; 783 goto failed;
752 } 784 }
753 785
754 nilfs_finish_roll_forward(nilfs, sbi, ri); 786 nilfs_finish_roll_forward(nilfs, ri);
755 } 787 }
756 788
757 failed: 789 failed:
758 nilfs_detach_checkpoint(sbi); 790 nilfs_put_root(root);
759 return err; 791 return err;
760} 792}
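In the rewritten function above, nilfs_attach_checkpoint() hands back a referenced nilfs_root instead of populating per-sb state, and the shared failed:/success exit drops that reference. The lifetime rule in outline; an illustration only, with the boolean argument taken as-is from the hunk above:

	struct nilfs_root *root;
	int err;

	err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
	if (err)
		return err;		/* no reference taken on failure */
	/* ... roll forward using root->ifile ... */
	nilfs_put_root(root);		/* release the attach-time reference */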
761 793
762/** 794/**
763 * nilfs_search_super_root - search the latest valid super root 795 * nilfs_search_super_root - search the latest valid super root
764 * @nilfs: the_nilfs 796 * @nilfs: the_nilfs
765 * @sbi: nilfs_sb_info
766 * @ri: pointer to a nilfs_recovery_info struct to store search results. 797 * @ri: pointer to a nilfs_recovery_info struct to store search results.
767 * 798 *
768 * nilfs_search_super_root() looks for the latest super-root from a partial 799 * nilfs_search_super_root() looks for the latest super-root from a partial
@@ -775,14 +806,19 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
775 * %-EINVAL - No valid segment found 806 * %-EINVAL - No valid segment found
776 * 807 *
777 * %-EIO - I/O error 808 * %-EIO - I/O error
809 *
810 * %-ENOMEM - Insufficient memory available.
778 */ 811 */
779int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, 812int nilfs_search_super_root(struct the_nilfs *nilfs,
780 struct nilfs_recovery_info *ri) 813 struct nilfs_recovery_info *ri)
781{ 814{
782 struct nilfs_segsum_info ssi; 815 struct buffer_head *bh_sum = NULL;
816 struct nilfs_segment_summary *sum;
783 sector_t pseg_start, pseg_end, sr_pseg_start = 0; 817 sector_t pseg_start, pseg_end, sr_pseg_start = 0;
784 sector_t seg_start, seg_end; /* range of full segment (block number) */ 818 sector_t seg_start, seg_end; /* range of full segment (block number) */
785 sector_t b, end; 819 sector_t b, end;
820 unsigned long nblocks;
821 unsigned int flags;
786 u64 seg_seq; 822 u64 seg_seq;
787 __u64 segnum, nextnum = 0; 823 __u64 segnum, nextnum = 0;
788 __u64 cno; 824 __u64 cno;
@@ -801,17 +837,24 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
801 /* Read ahead segment */ 837 /* Read ahead segment */
802 b = seg_start; 838 b = seg_start;
803 while (b <= seg_end) 839 while (b <= seg_end)
804 sb_breadahead(sbi->s_super, b++); 840 __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize);
805 841
806 for (;;) { 842 for (;;) {
807 /* Load segment summary */ 843 brelse(bh_sum);
808 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi); 844 ret = NILFS_SEG_FAIL_IO;
845 bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
846 if (!bh_sum)
847 goto failed;
848
849 ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
809 if (ret) { 850 if (ret) {
810 if (ret == NILFS_SEG_FAIL_IO) 851 if (ret == NILFS_SEG_FAIL_IO)
811 goto failed; 852 goto failed;
812 goto strayed; 853 goto strayed;
813 } 854 }
814 pseg_end = pseg_start + ssi.nblocks - 1; 855
856 nblocks = le32_to_cpu(sum->ss_nblocks);
857 pseg_end = pseg_start + nblocks - 1;
815 if (unlikely(pseg_end > seg_end)) { 858 if (unlikely(pseg_end > seg_end)) {
816 ret = NILFS_SEG_FAIL_CONSISTENCY; 859 ret = NILFS_SEG_FAIL_CONSISTENCY;
817 goto strayed; 860 goto strayed;
@@ -821,11 +864,13 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
821 ri->ri_pseg_start = pseg_start; 864 ri->ri_pseg_start = pseg_start;
822 ri->ri_seq = seg_seq; 865 ri->ri_seq = seg_seq;
823 ri->ri_segnum = segnum; 866 ri->ri_segnum = segnum;
824 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); 867 nextnum = nilfs_get_segnum_of_block(nilfs,
868 le64_to_cpu(sum->ss_next));
825 ri->ri_nextnum = nextnum; 869 ri->ri_nextnum = nextnum;
826 empty_seg = 0; 870 empty_seg = 0;
827 871
828 if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) { 872 flags = le16_to_cpu(sum->ss_flags);
873 if (!(flags & NILFS_SS_SR) && !scan_newer) {
829 /* This will never happen because a superblock 874 /* This will never happen because a superblock
830 (last_segment) always points to a pseg 875 (last_segment) always points to a pseg
831 having a super root. */ 876 having a super root. */
@@ -836,14 +881,15 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
836 if (pseg_start == seg_start) { 881 if (pseg_start == seg_start) {
837 nilfs_get_segment_range(nilfs, nextnum, &b, &end); 882 nilfs_get_segment_range(nilfs, nextnum, &b, &end);
838 while (b <= end) 883 while (b <= end)
839 sb_breadahead(sbi->s_super, b++); 884 __breadahead(nilfs->ns_bdev, b++,
885 nilfs->ns_blocksize);
840 } 886 }
841 if (!NILFS_SEG_HAS_SR(&ssi)) { 887 if (!(flags & NILFS_SS_SR)) {
842 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { 888 if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) {
843 ri->ri_lsegs_start = pseg_start; 889 ri->ri_lsegs_start = pseg_start;
844 ri->ri_lsegs_start_seq = seg_seq; 890 ri->ri_lsegs_start_seq = seg_seq;
845 } 891 }
846 if (NILFS_SEG_LOGEND(&ssi)) 892 if (flags & NILFS_SS_LOGEND)
847 ri->ri_lsegs_end = pseg_start; 893 ri->ri_lsegs_end = pseg_start;
848 goto try_next_pseg; 894 goto try_next_pseg;
849 } 895 }
@@ -854,12 +900,12 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
854 ri->ri_lsegs_start = ri->ri_lsegs_end = 0; 900 ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
855 901
856 nilfs_dispose_segment_list(&segments); 902 nilfs_dispose_segment_list(&segments);
857 nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start) 903 sr_pseg_start = pseg_start;
858 + ssi.nblocks - seg_start; 904 nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start;
859 nilfs->ns_seg_seq = seg_seq; 905 nilfs->ns_seg_seq = seg_seq;
860 nilfs->ns_segnum = segnum; 906 nilfs->ns_segnum = segnum;
861 nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ 907 nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */
862 nilfs->ns_ctime = ssi.ctime; 908 nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
863 nilfs->ns_nextnum = nextnum; 909 nilfs->ns_nextnum = nextnum;
864 910
865 if (scan_newer) 911 if (scan_newer)
@@ -870,15 +916,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
870 scan_newer = 1; 916 scan_newer = 1;
871 } 917 }
872 918
873 /* reset region for roll-forward */
874 pseg_start += ssi.nblocks;
875 if (pseg_start < seg_end)
876 continue;
877 goto feed_segment;
878
879 try_next_pseg: 919 try_next_pseg:
880 /* Standing on a course, or met an inconsistent state */ 920 /* Standing on a course, or met an inconsistent state */
881 pseg_start += ssi.nblocks; 921 pseg_start += nblocks;
882 if (pseg_start < seg_end) 922 if (pseg_start < seg_end)
883 continue; 923 continue;
884 goto feed_segment; 924 goto feed_segment;
@@ -909,6 +949,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
909 949
910 super_root_found: 950 super_root_found:
911 /* Updating pointers relating to the latest checkpoint */ 951 /* Updating pointers relating to the latest checkpoint */
952 brelse(bh_sum);
912 list_splice_tail(&segments, &ri->ri_used_segments); 953 list_splice_tail(&segments, &ri->ri_used_segments);
913 nilfs->ns_last_pseg = sr_pseg_start; 954 nilfs->ns_last_pseg = sr_pseg_start;
914 nilfs->ns_last_seq = nilfs->ns_seg_seq; 955 nilfs->ns_last_seq = nilfs->ns_seg_seq;
@@ -916,6 +957,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
916 return 0; 957 return 0;
917 958
918 failed: 959 failed:
960 brelse(bh_sum);
919 nilfs_dispose_segment_list(&segments); 961 nilfs_dispose_segment_list(&segments);
920 return (ret < 0) ? ret : nilfs_warn_segment_error(ret); 962 return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
921} 963}
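Throughout recovery.c, the nilfs_segsum_info middleman and the NILFS_SEG_* macros give way to reading the raw on-disk summary through a buffer head and testing ss_flags directly. The recurring shape, assembled from the hunks above with error paths trimmed:

	struct buffer_head *bh_sum;
	struct nilfs_segment_summary *sum;
	unsigned int flags;

	bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
	if (!bh_sum)
		return -EIO;

	flags = le16_to_cpu(sum->ss_flags);
	if (flags & NILFS_SS_SR) {
		/* this log carries a super root */
	}
	brelse(bh_sum);		/* pair every successful read with brelse */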
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 0776ccc2504..35a07157b98 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -42,11 +42,6 @@ struct nilfs_sc_info;
42 * NILFS super-block data in memory 42 * NILFS super-block data in memory
43 */ 43 */
44struct nilfs_sb_info { 44struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */ 45 /* Mount options */
51 unsigned long s_mount_opt; 46 unsigned long s_mount_opt;
52 uid_t s_resuid; 47 uid_t s_resuid;
@@ -59,8 +54,6 @@ struct nilfs_sb_info {
59 /* Fundamental members */ 54 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */ 55 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs; 56 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63 atomic_t s_count; /* reference count */
64 57
65 /* Segment constructor */ 58 /* Segment constructor */
66 struct list_head s_dirty_files; /* dirty files list */ 59 struct list_head s_dirty_files; /* dirty files list */
@@ -68,9 +61,6 @@ struct nilfs_sb_info {
68 spinlock_t s_inode_lock; /* Lock for the nilfs inode. 61 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
69 It covers s_dirty_files list */ 62 It covers s_dirty_files list */
70 63
71 /* Metadata files */
72 struct inode *s_ifile; /* index file inode */
73
74 /* Inode allocator */ 64 /* Inode allocator */
75 spinlock_t s_next_gen_lock; 65 spinlock_t s_next_gen_lock;
76 u32 s_next_generation; 66 u32 s_next_generation;
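The snapshot number, inode/block counters, and ifile dropped from nilfs_sb_info here reappear on the per-checkpoint nilfs_root object used throughout this patch. Roughly what that object carries, with field names inferred from the call sites in this series rather than copied from the real definition:

	struct nilfs_root {
		__u64 cno;		/* checkpoint number of this tree */
		struct inode *ifile;	/* index file inode */
		atomic_t inodes_count;
		atomic_t blocks_count;
		atomic_t count;		/* taken by nilfs_get_root() */
	};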
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 2e6a2723b8f..0f83e93935b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -371,7 +371,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
371 struct bio *bio = wi->bio; 371 struct bio *bio = wi->bio;
372 int err; 372 int err;
373 373
374 if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) { 374 if (segbuf->sb_nbio > 0 &&
375 bdi_write_congested(segbuf->sb_super->s_bdi)) {
375 wait_for_completion(&segbuf->sb_bio_event); 376 wait_for_completion(&segbuf->sb_bio_event);
376 segbuf->sb_nbio--; 377 segbuf->sb_nbio--;
377 if (unlikely(atomic_read(&segbuf->sb_err))) { 378 if (unlikely(atomic_read(&segbuf->sb_err))) {
@@ -508,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
508 * Last BIO is always sent through the following 509 * Last BIO is always sent through the following
509 * submission. 510 * submission.
510 */ 511 */
511 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 512 rw |= REQ_SYNC | REQ_UNPLUG;
512 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); 513 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
513 } 514 }
514 515
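The BIO_RW_SYNCIO/BIO_RW_UNPLUG bit shifts above are retired in favor of the REQ_* flags from linux/blk_types.h, OR'd into the rw argument directly. A sketch of the pattern; last_bio is a hypothetical condition standing in for the "last BIO of the segment" test above:

	int rw = WRITE;

	if (last_bio)
		rw |= REQ_SYNC | REQ_UNPLUG;	/* push the log out promptly */
	submit_bio(rw, bio);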
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 85fbb66455e..b04f08cc239 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -54,17 +54,6 @@ struct nilfs_segsum_info {
54 sector_t next; 54 sector_t next;
55}; 55};
56 56
57/* macro for the flags */
58#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR)
59#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN)
60#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND)
61#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT)
62#define NILFS_SEG_SIMPLEX(sum) \
63 (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
64 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
65
66#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk)
67
68/** 57/**
69 * struct nilfs_segment_buffer - Segment buffer 58 * struct nilfs_segment_buffer - Segment buffer
70 * @sb_super: back pointer to a superblock struct 59 * @sb_super: back pointer to a superblock struct
@@ -141,6 +130,19 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
141 struct buffer_head **); 130 struct buffer_head **);
142void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); 131void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
143 132
133static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf)
134{
135 unsigned int flags = segbuf->sb_sum.flags;
136
137 return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) ==
138 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND);
139}
140
141static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf)
142{
143 return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk;
144}
145
144static inline void 146static inline void
145nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, 147nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
146 struct buffer_head *bh) 148 struct buffer_head *bh)
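Replacing the NILFS_SEG_SIMPLEX and NILFS_SEG_EMPTY macros with static inlines buys type checking on the segbuf argument that the old token-pasting macros could not provide. Callers now read, for example (usage sketch):

	if (!nilfs_segbuf_simplex(segbuf)) {
		/* this log spans several partial segments */
	}
	if (nilfs_segbuf_empty(segbuf)) {
		/* nothing but summary blocks; skip the write */
	}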
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c9201649cc4..687d090cea3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -191,6 +191,8 @@ int nilfs_transaction_begin(struct super_block *sb,
191 if (ret > 0) 191 if (ret > 0)
192 return 0; 192 return 0;
193 193
194 vfs_check_frozen(sb, SB_FREEZE_WRITE);
195
194 sbi = NILFS_SB(sb); 196 sbi = NILFS_SB(sb);
195 nilfs = sbi->s_nilfs; 197 nilfs = sbi->s_nilfs;
196 down_read(&nilfs->ns_segctor_sem); 198 down_read(&nilfs->ns_segctor_sem);
@@ -366,8 +368,7 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
366 368
367 if (nilfs_doing_gc()) 369 if (nilfs_doing_gc())
368 flags = NILFS_SS_GC; 370 flags = NILFS_SS_GC;
369 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, 371 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno);
370 sci->sc_sbi->s_nilfs->ns_cno);
371 if (unlikely(err)) 372 if (unlikely(err))
372 return err; 373 return err;
373 374
@@ -440,17 +441,26 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
440 struct nilfs_finfo *finfo; 441 struct nilfs_finfo *finfo;
441 struct nilfs_inode_info *ii; 442 struct nilfs_inode_info *ii;
442 struct nilfs_segment_buffer *segbuf; 443 struct nilfs_segment_buffer *segbuf;
444 __u64 cno;
443 445
444 if (sci->sc_blk_cnt == 0) 446 if (sci->sc_blk_cnt == 0)
445 return; 447 return;
446 448
447 ii = NILFS_I(inode); 449 ii = NILFS_I(inode);
450
451 if (test_bit(NILFS_I_GCINODE, &ii->i_state))
452 cno = ii->i_cno;
453 else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
454 cno = 0;
455 else
456 cno = sci->sc_cno;
457
448 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, 458 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
449 sizeof(*finfo)); 459 sizeof(*finfo));
450 finfo->fi_ino = cpu_to_le64(inode->i_ino); 460 finfo->fi_ino = cpu_to_le64(inode->i_ino);
451 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); 461 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
452 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); 462 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
453 finfo->fi_cno = cpu_to_le64(ii->i_cno); 463 finfo->fi_cno = cpu_to_le64(cno);
454 464
455 segbuf = sci->sc_curseg; 465 segbuf = sci->sc_curseg;
456 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + 466 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
@@ -755,12 +765,12 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
755 } 765 }
756} 766}
757 767
758static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi) 768static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
769 struct nilfs_root *root)
759{ 770{
760 struct the_nilfs *nilfs = sbi->s_nilfs;
761 int ret = 0; 771 int ret = 0;
762 772
763 if (nilfs_mdt_fetch_dirty(sbi->s_ifile)) 773 if (nilfs_mdt_fetch_dirty(root->ifile))
764 ret++; 774 ret++;
765 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) 775 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
766 ret++; 776 ret++;
@@ -785,7 +795,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
785 struct nilfs_sb_info *sbi = sci->sc_sbi; 795 struct nilfs_sb_info *sbi = sci->sc_sbi;
786 int ret = 0; 796 int ret = 0;
787 797
788 if (nilfs_test_metadata_dirty(sbi)) 798 if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root))
789 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 799 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
790 800
791 spin_lock(&sbi->s_inode_lock); 801 spin_lock(&sbi->s_inode_lock);
@@ -801,7 +811,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
801 struct nilfs_sb_info *sbi = sci->sc_sbi; 811 struct nilfs_sb_info *sbi = sci->sc_sbi;
802 struct the_nilfs *nilfs = sbi->s_nilfs; 812 struct the_nilfs *nilfs = sbi->s_nilfs;
803 813
804 nilfs_mdt_clear_dirty(sbi->s_ifile); 814 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
805 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 815 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
806 nilfs_mdt_clear_dirty(nilfs->ns_sufile); 816 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
807 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); 817 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
@@ -848,9 +858,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
848 raw_cp->cp_snapshot_list.ssl_next = 0; 858 raw_cp->cp_snapshot_list.ssl_next = 0;
849 raw_cp->cp_snapshot_list.ssl_prev = 0; 859 raw_cp->cp_snapshot_list.ssl_prev = 0;
850 raw_cp->cp_inodes_count = 860 raw_cp->cp_inodes_count =
851 cpu_to_le64(atomic_read(&sbi->s_inodes_count)); 861 cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
852 raw_cp->cp_blocks_count = 862 raw_cp->cp_blocks_count =
853 cpu_to_le64(atomic_read(&sbi->s_blocks_count)); 863 cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
854 raw_cp->cp_nblk_inc = 864 raw_cp->cp_nblk_inc =
855 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); 865 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
856 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); 866 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
@@ -861,7 +871,8 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
861 else 871 else
862 nilfs_checkpoint_set_minor(raw_cp); 872 nilfs_checkpoint_set_minor(raw_cp);
863 873
864 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); 874 nilfs_write_inode_common(sci->sc_root->ifile,
875 &raw_cp->cp_ifile_inode, 1);
865 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); 876 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
866 return 0; 877 return 0;
867 878
@@ -886,13 +897,12 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
886 } 897 }
887} 898}
888 899
889static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, 900static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
890 struct inode *ifile)
891{ 901{
892 struct nilfs_inode_info *ii; 902 struct nilfs_inode_info *ii;
893 903
894 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { 904 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
895 nilfs_fill_in_file_bmap(ifile, ii); 905 nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii);
896 set_bit(NILFS_I_COLLECTED, &ii->i_state); 906 set_bit(NILFS_I_COLLECTED, &ii->i_state);
897 } 907 }
898} 908}
@@ -1135,7 +1145,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1135 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; 1145 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1136 /* Fall through */ 1146 /* Fall through */
1137 case NILFS_ST_IFILE: 1147 case NILFS_ST_IFILE:
1138 err = nilfs_segctor_scan_file(sci, sbi->s_ifile, 1148 err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
1139 &nilfs_sc_file_ops); 1149 &nilfs_sc_file_ops);
1140 if (unlikely(err)) 1150 if (unlikely(err))
1141 break; 1151 break;
@@ -1599,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1599 kunmap_atomic(kaddr, KM_USER0); 1609 kunmap_atomic(kaddr, KM_USER0);
1600 1610
1601 if (!TestSetPageWriteback(clone_page)) 1611 if (!TestSetPageWriteback(clone_page))
1602 inc_zone_page_state(clone_page, NR_WRITEBACK); 1612 account_page_writeback(clone_page);
1603 unlock_page(clone_page); 1613 unlock_page(clone_page);
1604 1614
1605 return 0; 1615 return 0;
@@ -1900,6 +1910,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1900 set_buffer_uptodate(bh); 1910 set_buffer_uptodate(bh);
1901 clear_buffer_dirty(bh); 1911 clear_buffer_dirty(bh);
1902 clear_buffer_nilfs_volatile(bh); 1912 clear_buffer_nilfs_volatile(bh);
1913 clear_buffer_nilfs_redirected(bh);
1903 if (bh == segbuf->sb_super_root) { 1914 if (bh == segbuf->sb_super_root) {
1904 if (bh->b_page != bd_page) { 1915 if (bh->b_page != bd_page) {
1905 end_page_writeback(bd_page); 1916 end_page_writeback(bd_page);
@@ -1914,12 +1925,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1914 } 1925 }
1915 } 1926 }
1916 1927
1917 if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) { 1928 if (!nilfs_segbuf_simplex(segbuf)) {
1918 if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) { 1929 if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) {
1919 set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); 1930 set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
1920 sci->sc_lseg_stime = jiffies; 1931 sci->sc_lseg_stime = jiffies;
1921 } 1932 }
1922 if (NILFS_SEG_LOGEND(&segbuf->sb_sum)) 1933 if (segbuf->sb_sum.flags & NILFS_SS_LOGEND)
1923 clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); 1934 clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
1924 } 1935 }
1925 } 1936 }
@@ -1936,11 +1947,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1936 1947
1937 nilfs_drop_collected_inodes(&sci->sc_dirty_files); 1948 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
1938 1949
1939 if (nilfs_doing_gc()) { 1950 if (nilfs_doing_gc())
1940 nilfs_drop_collected_inodes(&sci->sc_gc_inodes); 1951 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
1941 if (update_sr) 1952 else
1942 nilfs_commit_gcdat_inode(nilfs);
1943 } else
1944 nilfs->ns_nongc_ctime = sci->sc_seg_ctime; 1953 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
1945 1954
1946 sci->sc_nblk_inc += sci->sc_nblk_this_inc; 1955 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
@@ -1951,7 +1960,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1951 if (update_sr) { 1960 if (update_sr) {
1952 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, 1961 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
1953 segbuf->sb_sum.seg_seq, nilfs->ns_cno++); 1962 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
1954 set_nilfs_sb_dirty(nilfs);
1955 1963
1956 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 1964 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
1957 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); 1965 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -1977,7 +1985,7 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1977 struct nilfs_sb_info *sbi) 1985 struct nilfs_sb_info *sbi)
1978{ 1986{
1979 struct nilfs_inode_info *ii, *n; 1987 struct nilfs_inode_info *ii, *n;
1980 __u64 cno = sbi->s_nilfs->ns_cno; 1988 struct inode *ifile = sci->sc_root->ifile;
1981 1989
1982 spin_lock(&sbi->s_inode_lock); 1990 spin_lock(&sbi->s_inode_lock);
1983 retry: 1991 retry:
@@ -1988,14 +1996,14 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1988 1996
1989 spin_unlock(&sbi->s_inode_lock); 1997 spin_unlock(&sbi->s_inode_lock);
1990 err = nilfs_ifile_get_inode_block( 1998 err = nilfs_ifile_get_inode_block(
1991 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh); 1999 ifile, ii->vfs_inode.i_ino, &ibh);
1992 if (unlikely(err)) { 2000 if (unlikely(err)) {
1993 nilfs_warning(sbi->s_super, __func__, 2001 nilfs_warning(sbi->s_super, __func__,
1994 "failed to get inode block.\n"); 2002 "failed to get inode block.\n");
1995 return err; 2003 return err;
1996 } 2004 }
1997 nilfs_mdt_mark_buffer_dirty(ibh); 2005 nilfs_mdt_mark_buffer_dirty(ibh);
1998 nilfs_mdt_mark_dirty(sbi->s_ifile); 2006 nilfs_mdt_mark_dirty(ifile);
1999 spin_lock(&sbi->s_inode_lock); 2007 spin_lock(&sbi->s_inode_lock);
2000 if (likely(!ii->i_bh)) 2008 if (likely(!ii->i_bh))
2001 ii->i_bh = ibh; 2009 ii->i_bh = ibh;
@@ -2003,7 +2011,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2003 brelse(ibh); 2011 brelse(ibh);
2004 goto retry; 2012 goto retry;
2005 } 2013 }
2006 ii->i_cno = cno;
2007 2014
2008 clear_bit(NILFS_I_QUEUED, &ii->i_state); 2015 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2009 set_bit(NILFS_I_BUSY, &ii->i_state); 2016 set_bit(NILFS_I_BUSY, &ii->i_state);
@@ -2012,8 +2019,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2012 } 2019 }
2013 spin_unlock(&sbi->s_inode_lock); 2020 spin_unlock(&sbi->s_inode_lock);
2014 2021
2015 NILFS_I(sbi->s_ifile)->i_cno = cno;
2016
2017 return 0; 2022 return 0;
2018} 2023}
2019 2024
@@ -2022,19 +2027,13 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2022{ 2027{
2023 struct nilfs_transaction_info *ti = current->journal_info; 2028 struct nilfs_transaction_info *ti = current->journal_info;
2024 struct nilfs_inode_info *ii, *n; 2029 struct nilfs_inode_info *ii, *n;
2025 __u64 cno = sbi->s_nilfs->ns_cno;
2026 2030
2027 spin_lock(&sbi->s_inode_lock); 2031 spin_lock(&sbi->s_inode_lock);
2028 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 2032 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2029 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || 2033 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2030 test_bit(NILFS_I_DIRTY, &ii->i_state)) { 2034 test_bit(NILFS_I_DIRTY, &ii->i_state))
2031 /* The current checkpoint number (=nilfs->ns_cno) is
2032 changed between check-in and check-out only if the
2033 super root is written out. So, we can update i_cno
2034 for the inodes that remain in the dirty list. */
2035 ii->i_cno = cno;
2036 continue; 2035 continue;
2037 } 2036
2038 clear_bit(NILFS_I_BUSY, &ii->i_state); 2037 clear_bit(NILFS_I_BUSY, &ii->i_state);
2039 brelse(ii->i_bh); 2038 brelse(ii->i_bh);
2040 ii->i_bh = NULL; 2039 ii->i_bh = NULL;
@@ -2055,12 +2054,13 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2055 int err; 2054 int err;
2056 2055
2057 sci->sc_stage.scnt = NILFS_ST_INIT; 2056 sci->sc_stage.scnt = NILFS_ST_INIT;
2057 sci->sc_cno = nilfs->ns_cno;
2058 2058
2059 err = nilfs_segctor_check_in_files(sci, sbi); 2059 err = nilfs_segctor_check_in_files(sci, sbi);
2060 if (unlikely(err)) 2060 if (unlikely(err))
2061 goto out; 2061 goto out;
2062 2062
2063 if (nilfs_test_metadata_dirty(sbi)) 2063 if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
2064 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2064 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2065 2065
2066 if (nilfs_segctor_clean(sci)) 2066 if (nilfs_segctor_clean(sci))
@@ -2082,7 +2082,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2082 2082
2083 /* Avoid empty segment */ 2083 /* Avoid empty segment */
2084 if (sci->sc_stage.scnt == NILFS_ST_DONE && 2084 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2085 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { 2085 nilfs_segbuf_empty(sci->sc_curseg)) {
2086 nilfs_segctor_abort_construction(sci, nilfs, 1); 2086 nilfs_segctor_abort_construction(sci, nilfs, 1);
2087 goto out; 2087 goto out;
2088 } 2088 }
@@ -2092,7 +2092,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2092 goto failed; 2092 goto failed;
2093 2093
2094 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2094 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2095 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); 2095 nilfs_segctor_fill_in_file_bmap(sci);
2096 2096
2097 if (mode == SC_LSEG_SR && 2097 if (mode == SC_LSEG_SR &&
2098 sci->sc_stage.scnt >= NILFS_ST_CPFILE) { 2098 sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
@@ -2408,6 +2408,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2408{ 2408{
2409 struct nilfs_sb_info *sbi = sci->sc_sbi; 2409 struct nilfs_sb_info *sbi = sci->sc_sbi;
2410 struct the_nilfs *nilfs = sbi->s_nilfs; 2410 struct the_nilfs *nilfs = sbi->s_nilfs;
2411 struct nilfs_super_block **sbp;
2411 int err = 0; 2412 int err = 0;
2412 2413
2413 nilfs_segctor_accept(sci); 2414 nilfs_segctor_accept(sci);
@@ -2423,8 +2424,13 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2423 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2424 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2424 nilfs_discontinued(nilfs)) { 2425 nilfs_discontinued(nilfs)) {
2425 down_write(&nilfs->ns_sem); 2426 down_write(&nilfs->ns_sem);
2426 err = nilfs_commit_super( 2427 err = -EIO;
2427 sbi, nilfs_altsb_need_update(nilfs)); 2428 sbp = nilfs_prepare_super(sbi,
2429 nilfs_sb_will_flip(nilfs));
2430 if (likely(sbp)) {
2431 nilfs_set_log_cursor(sbp[0], nilfs);
2432 err = nilfs_commit_super(sbi, NILFS_SB_COMMIT);
2433 }
2428 up_write(&nilfs->ns_sem); 2434 up_write(&nilfs->ns_sem);
2429 } 2435 }
2430 } 2436 }
@@ -2447,9 +2453,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2447 list_for_each_entry_safe(ii, n, head, i_dirty) { 2453 list_for_each_entry_safe(ii, n, head, i_dirty) {
2448 if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) 2454 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2449 continue; 2455 continue;
2450 hlist_del_init(&ii->vfs_inode.i_hash);
2451 list_del_init(&ii->i_dirty); 2456 list_del_init(&ii->i_dirty);
2452 nilfs_clear_gcinode(&ii->vfs_inode); 2457 iput(&ii->vfs_inode);
2453 } 2458 }
2454} 2459}
2455 2460
@@ -2467,13 +2472,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2467 2472
2468 nilfs_transaction_lock(sbi, &ti, 1); 2473 nilfs_transaction_lock(sbi, &ti, 1);
2469 2474
2470 err = nilfs_init_gcdat_inode(nilfs); 2475 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
2471 if (unlikely(err)) 2476 if (unlikely(err))
2472 goto out_unlock; 2477 goto out_unlock;
2473 2478
2474 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); 2479 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
2475 if (unlikely(err)) 2480 if (unlikely(err)) {
2481 nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
2476 goto out_unlock; 2482 goto out_unlock;
2483 }
2477 2484
2478 sci->sc_freesegs = kbufs[4]; 2485 sci->sc_freesegs = kbufs[4];
2479 sci->sc_nfreesegs = argv[4].v_nmembs; 2486 sci->sc_nfreesegs = argv[4].v_nmembs;
@@ -2505,7 +2512,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2505 out_unlock: 2512 out_unlock:
2506 sci->sc_freesegs = NULL; 2513 sci->sc_freesegs = NULL;
2507 sci->sc_nfreesegs = 0; 2514 sci->sc_nfreesegs = 0;
2508 nilfs_clear_gcdat_inode(nilfs); 2515 nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
2509 nilfs_transaction_unlock(sbi); 2516 nilfs_transaction_unlock(sbi);
2510 return err; 2517 return err;
2511} 2518}
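nilfs_clean_segments() above swaps the dedicated gcdat inode for a shadow map on the DAT: state is snapshotted before GC preparation, restored if preparation fails, and the snapshot is dropped on every exit path. The protocol in outline; do_gc_passes() is a placeholder for the construction loop, not a real helper:

	err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
	if (err)
		goto out;
	err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
	if (err) {
		nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
		goto out;
	}
	err = do_gc_passes(sci);			/* placeholder */
 out:
	nilfs_mdt_clear_shadow_map(nilfs->ns_dat);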
@@ -2667,6 +2674,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2667} 2674}
2668 2675
2669static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) 2676static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2677 __acquires(&sci->sc_state_lock)
2678 __releases(&sci->sc_state_lock)
2670{ 2679{
2671 sci->sc_state |= NILFS_SEGCTOR_QUIT; 2680 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2672 2681
@@ -2681,7 +2690,8 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2681/* 2690/*
2682 * Setup & clean-up functions 2691 * Setup & clean-up functions
2683 */ 2692 */
2684static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) 2693static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
2694 struct nilfs_root *root)
2685{ 2695{
2686 struct nilfs_sc_info *sci; 2696 struct nilfs_sc_info *sci;
2687 2697
@@ -2692,6 +2702,9 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2692 sci->sc_sbi = sbi; 2702 sci->sc_sbi = sbi;
2693 sci->sc_super = sbi->s_super; 2703 sci->sc_super = sbi->s_super;
2694 2704
2705 nilfs_get_root(root);
2706 sci->sc_root = root;
2707
2695 init_waitqueue_head(&sci->sc_wait_request); 2708 init_waitqueue_head(&sci->sc_wait_request);
2696 init_waitqueue_head(&sci->sc_wait_daemon); 2709 init_waitqueue_head(&sci->sc_wait_daemon);
2697 init_waitqueue_head(&sci->sc_wait_task); 2710 init_waitqueue_head(&sci->sc_wait_task);
@@ -2766,6 +2779,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2766 WARN_ON(!list_empty(&sci->sc_segbufs)); 2779 WARN_ON(!list_empty(&sci->sc_segbufs));
2767 WARN_ON(!list_empty(&sci->sc_write_logs)); 2780 WARN_ON(!list_empty(&sci->sc_write_logs));
2768 2781
2782 nilfs_put_root(sci->sc_root);
2783
2769 down_write(&sbi->s_nilfs->ns_segctor_sem); 2784 down_write(&sbi->s_nilfs->ns_segctor_sem);
2770 2785
2771 del_timer_sync(&sci->sc_timer); 2786 del_timer_sync(&sci->sc_timer);
@@ -2775,6 +2790,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2775/** 2790/**
2776 * nilfs_attach_segment_constructor - attach a segment constructor 2791 * nilfs_attach_segment_constructor - attach a segment constructor
2777 * @sbi: nilfs_sb_info 2792 * @sbi: nilfs_sb_info
2793 * @root: root object of the current filesystem tree
2778 * 2794 *
2779 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2795 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2780 * initializes it, and starts the segment constructor. 2796 * initializes it, and starts the segment constructor.
@@ -2784,9 +2800,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2784 * 2800 *
2785 * %-ENOMEM - Insufficient memory available. 2801 * %-ENOMEM - Insufficient memory available.
2786 */ 2802 */
2787int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) 2803int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
2804 struct nilfs_root *root)
2788{ 2805{
2789 struct the_nilfs *nilfs = sbi->s_nilfs;
2790 int err; 2806 int err;
2791 2807
2792 if (NILFS_SC(sbi)) { 2808 if (NILFS_SC(sbi)) {
@@ -2798,14 +2814,12 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2798 nilfs_detach_segment_constructor(sbi); 2814 nilfs_detach_segment_constructor(sbi);
2799 } 2815 }
2800 2816
2801 sbi->s_sc_info = nilfs_segctor_new(sbi); 2817 sbi->s_sc_info = nilfs_segctor_new(sbi, root);
2802 if (!sbi->s_sc_info) 2818 if (!sbi->s_sc_info)
2803 return -ENOMEM; 2819 return -ENOMEM;
2804 2820
2805 nilfs_attach_writer(nilfs, sbi);
2806 err = nilfs_segctor_start_thread(NILFS_SC(sbi)); 2821 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2807 if (err) { 2822 if (err) {
2808 nilfs_detach_writer(nilfs, sbi);
2809 kfree(sbi->s_sc_info); 2823 kfree(sbi->s_sc_info);
2810 sbi->s_sc_info = NULL; 2824 sbi->s_sc_info = NULL;
2811 } 2825 }
@@ -2842,5 +2856,4 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2842 up_write(&nilfs->ns_segctor_sem); 2856 up_write(&nilfs->ns_segctor_sem);
2843 2857
2844 nilfs_dispose_list(sbi, &garbage_list, 1); 2858 nilfs_dispose_list(sbi, &garbage_list, 1);
2845 nilfs_detach_writer(nilfs, sbi);
2846} 2859}
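A thread running through this file: the constructor now latches the checkpoint number once (sc_cno = nilfs->ns_cno at the start of do_construct), presumably so a single construction pass stays self-consistent even if ns_cno advances, and nilfs_segctor_end_finfo() picks the cno to record per file. The selection logic, as it appears in the hunks above:

	__u64 cno;

	if (test_bit(NILFS_I_GCINODE, &ii->i_state))
		cno = ii->i_cno;	/* GC inode: cno it was read from */
	else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
		cno = 0;		/* metadata files carry no cno */
	else
		cno = sci->sc_cno;	/* latched at construction start */
	finfo->fi_cno = cpu_to_le64(cno);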
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 01e20dbb217..cd8056e7cbe 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -29,6 +29,8 @@
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "sb.h" 30#include "sb.h"
31 31
32struct nilfs_root;
33
32/** 34/**
33 * struct nilfs_recovery_info - Recovery information 35 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 36 * @ri_need_recovery: Recovery status
@@ -87,6 +89,7 @@ struct nilfs_segsum_pointer {
87 * struct nilfs_sc_info - Segment constructor information 89 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct 90 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct 91 * @sc_sbi: Back pointer to nilfs_sb_info struct
92 * @sc_root: root object of the current filesystem tree
90 * @sc_nblk_inc: Block count of current generation 93 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written 94 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written 95 * @sc_gc_inodes: List of GC inodes having blocks to be written
@@ -107,6 +110,7 @@ struct nilfs_segsum_pointer {
107 * @sc_datablk_cnt: Data block count of a file 110 * @sc_datablk_cnt: Data block count of a file
108 * @sc_nblk_this_inc: Number of blocks included in the current logical segment 111 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
109 * @sc_seg_ctime: Creation time 112 * @sc_seg_ctime: Creation time
113 * @sc_cno: checkpoint number of current log
110 * @sc_flags: Internal flags 114 * @sc_flags: Internal flags
111 * @sc_state_lock: spinlock for sc_state and so on 115 * @sc_state_lock: spinlock for sc_state and so on
112 * @sc_state: Segctord state flags 116 * @sc_state: Segctord state flags
@@ -128,6 +132,7 @@ struct nilfs_segsum_pointer {
128struct nilfs_sc_info { 132struct nilfs_sc_info {
129 struct super_block *sc_super; 133 struct super_block *sc_super;
130 struct nilfs_sb_info *sc_sbi; 134 struct nilfs_sb_info *sc_sbi;
135 struct nilfs_root *sc_root;
131 136
132 unsigned long sc_nblk_inc; 137 unsigned long sc_nblk_inc;
133 138
@@ -156,7 +161,7 @@ struct nilfs_sc_info {
156 unsigned long sc_datablk_cnt; 161 unsigned long sc_datablk_cnt;
157 unsigned long sc_nblk_this_inc; 162 unsigned long sc_nblk_this_inc;
158 time_t sc_seg_ctime; 163 time_t sc_seg_ctime;
159 164 __u64 sc_cno;
160 unsigned long sc_flags; 165 unsigned long sc_flags;
161 166
162 spinlock_t sc_state_lock; 167 spinlock_t sc_state_lock;
@@ -230,17 +235,18 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
230extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 235extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
231 void **); 236 void **);
232 237
233extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); 238int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
239 struct nilfs_root *root);
234extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); 240extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
235 241
236/* recovery.c */ 242/* recovery.c */
237extern int nilfs_read_super_root_block(struct super_block *, sector_t, 243extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
238 struct buffer_head **, int); 244 struct buffer_head **, int);
239extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *, 245extern int nilfs_search_super_root(struct the_nilfs *,
240 struct nilfs_recovery_info *); 246 struct nilfs_recovery_info *);
241extern int nilfs_recover_logical_segments(struct the_nilfs *, 247extern int nilfs_salvage_orphan_logs(struct the_nilfs *,
242 struct nilfs_sb_info *, 248 struct nilfs_sb_info *,
243 struct nilfs_recovery_info *); 249 struct nilfs_recovery_info *);
244extern void nilfs_dispose_segment_list(struct list_head *); 250extern void nilfs_dispose_segment_list(struct list_head *);
245 251
246#endif /* _NILFS_SEGMENT_H */ 252#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3c6cc6005c2..1d6f488ccae 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -505,7 +505,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
505{ 505{
506 struct buffer_head *header_bh; 506 struct buffer_head *header_bh;
507 struct nilfs_sufile_header *header; 507 struct nilfs_sufile_header *header;
508 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 508 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
509 void *kaddr; 509 void *kaddr;
510 int ret; 510 int ret;
511 511
@@ -583,7 +583,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
583 struct nilfs_segment_usage *su; 583 struct nilfs_segment_usage *su;
584 struct nilfs_suinfo *si = buf; 584 struct nilfs_suinfo *si = buf;
585 size_t susz = NILFS_MDT(sufile)->mi_entry_size; 585 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
586 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 586 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
587 void *kaddr; 587 void *kaddr;
588 unsigned long nsegs, segusages_per_block; 588 unsigned long nsegs, segusages_per_block;
589 ssize_t n; 589 ssize_t n;
@@ -635,46 +635,55 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
635} 635}
636 636
637/** 637/**
638 * nilfs_sufile_read - read sufile inode 638 * nilfs_sufile_read - read or get sufile inode
639 * @sufile: sufile inode 639 * @sb: super block instance
640 * @susize: size of a segment usage entry
640 * @raw_inode: on-disk sufile inode 641 * @raw_inode: on-disk sufile inode
642 * @inodep: buffer to store the inode
641 */ 643 */
642int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode) 644int nilfs_sufile_read(struct super_block *sb, size_t susize,
645 struct nilfs_inode *raw_inode, struct inode **inodep)
643{ 646{
644 struct nilfs_sufile_info *sui = NILFS_SUI(sufile); 647 struct inode *sufile;
648 struct nilfs_sufile_info *sui;
645 struct buffer_head *header_bh; 649 struct buffer_head *header_bh;
646 struct nilfs_sufile_header *header; 650 struct nilfs_sufile_header *header;
647 void *kaddr; 651 void *kaddr;
648 int ret; 652 int err;
649 653
650 ret = nilfs_read_inode_common(sufile, raw_inode); 654 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
651 if (ret < 0) 655 if (unlikely(!sufile))
652 return ret; 656 return -ENOMEM;
657 if (!(sufile->i_state & I_NEW))
658 goto out;
653 659
654 ret = nilfs_sufile_get_header_block(sufile, &header_bh); 660 err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
655 if (!ret) { 661 if (err)
656 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 662 goto failed;
657 header = kaddr + bh_offset(header_bh);
658 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
659 kunmap_atomic(kaddr, KM_USER0);
660 brelse(header_bh);
661 }
662 return ret;
663}
664 663
665/** 664 nilfs_mdt_set_entry_size(sufile, susize,
666 * nilfs_sufile_new - create sufile 665 sizeof(struct nilfs_sufile_header));
667 * @nilfs: nilfs object 666
668 * @susize: size of a segment usage entry 667 err = nilfs_read_inode_common(sufile, raw_inode);
669 */ 668 if (err)
670struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize) 669 goto failed;
671{ 670
672 struct inode *sufile; 671 err = nilfs_sufile_get_header_block(sufile, &header_bh);
672 if (err)
673 goto failed;
673 674
674 sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO, 675 sui = NILFS_SUI(sufile);
675 sizeof(struct nilfs_sufile_info)); 676 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
676 if (sufile) 677 header = kaddr + bh_offset(header_bh);
677 nilfs_mdt_set_entry_size(sufile, susize, 678 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
678 sizeof(struct nilfs_sufile_header)); 679 kunmap_atomic(kaddr, KM_USER0);
679 return sufile; 680 brelse(header_bh);
681
682 unlock_new_inode(sufile);
683 out:
684 *inodep = sufile;
685 return 0;
686 failed:
687 iget_failed(sufile);
688 return err;
680} 689}
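nilfs_sufile_read() now follows the read-or-get idiom used for metadata files elsewhere in this series: look the inode up with iget_locked semantics, short-circuit if it was already initialized, and tear down with iget_failed() on error. The skeleton; setup_and_read() is a placeholder for the mdt init plus header read done above:

	inode = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
	if (!inode)
		return -ENOMEM;
	if (!(inode->i_state & I_NEW))
		goto out;		/* already live, just return it */

	err = setup_and_read(inode);	/* placeholder */
	if (err) {
		iget_failed(inode);	/* unhash, wake I_NEW waiters */
		return err;
	}
	unlock_new_inode(inode);
 out:
	*inodep = inode;
	return 0;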
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 15163b8aff7..a943fbacb45 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,7 +31,7 @@
31 31
32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) 32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
33{ 33{
34 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; 34 return NILFS_I_NILFS(sufile)->ns_nsegments;
35} 35}
36 36
37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); 37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
@@ -61,8 +61,8 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, 61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
62 struct buffer_head *); 62 struct buffer_head *);
63 63
64int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode); 64int nilfs_sufile_read(struct super_block *sb, size_t susize,
65struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize); 65 struct nilfs_inode *raw_inode, struct inode **inodep);
66 66
67/** 67/**
68 * nilfs_sufile_scrap - make a segment garbage 68 * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 414ef68931c..f804d41ec9d 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -45,16 +45,17 @@
45#include <linux/parser.h> 45#include <linux/parser.h>
46#include <linux/random.h> 46#include <linux/random.h>
47#include <linux/crc32.h> 47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h> 48#include <linux/vfs.h>
50#include <linux/writeback.h> 49#include <linux/writeback.h>
51#include <linux/kobject.h> 50#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include <linux/seq_file.h> 51#include <linux/seq_file.h>
54#include <linux/mount.h> 52#include <linux/mount.h>
55#include "nilfs.h" 53#include "nilfs.h"
54#include "export.h"
56#include "mdt.h" 55#include "mdt.h"
57#include "alloc.h" 56#include "alloc.h"
57#include "btree.h"
58#include "btnode.h"
58#include "page.h" 59#include "page.h"
59#include "cpfile.h" 60#include "cpfile.h"
60#include "ifile.h" 61#include "ifile.h"
@@ -67,13 +68,33 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
67 "(NILFS)"); 68 "(NILFS)");
68MODULE_LICENSE("GPL"); 69MODULE_LICENSE("GPL");
69 70
70struct kmem_cache *nilfs_inode_cachep; 71static struct kmem_cache *nilfs_inode_cachep;
71struct kmem_cache *nilfs_transaction_cachep; 72struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep; 73struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache; 74struct kmem_cache *nilfs_btree_path_cache;
74 75
76static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount);
75static int nilfs_remount(struct super_block *sb, int *flags, char *data); 77static int nilfs_remount(struct super_block *sb, int *flags, char *data);
76 78
79static void nilfs_set_error(struct nilfs_sb_info *sbi)
80{
81 struct the_nilfs *nilfs = sbi->s_nilfs;
82 struct nilfs_super_block **sbp;
83
84 down_write(&nilfs->ns_sem);
85 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
86 nilfs->ns_mount_state |= NILFS_ERROR_FS;
87 sbp = nilfs_prepare_super(sbi, 0);
88 if (likely(sbp)) {
89 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
90 if (sbp[1])
91 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
92 nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
93 }
94 }
95 up_write(&nilfs->ns_sem);
96}
97
77/** 98/**
78 * nilfs_error() - report failure condition on a filesystem 99 * nilfs_error() - report failure condition on a filesystem
79 * 100 *
@@ -99,16 +120,7 @@ void nilfs_error(struct super_block *sb, const char *function,
99 va_end(args); 120 va_end(args);
100 121
101 if (!(sb->s_flags & MS_RDONLY)) { 122 if (!(sb->s_flags & MS_RDONLY)) {
102 struct the_nilfs *nilfs = sbi->s_nilfs; 123 nilfs_set_error(sbi);
103
104 down_write(&nilfs->ns_sem);
105 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
106 nilfs->ns_mount_state |= NILFS_ERROR_FS;
107 nilfs->ns_sbp[0]->s_state |=
108 cpu_to_le16(NILFS_ERROR_FS);
109 nilfs_commit_super(sbi, 1);
110 }
111 up_write(&nilfs->ns_sem);
112 124
113 if (nilfs_test_opt(sbi, ERRORS_RO)) { 125 if (nilfs_test_opt(sbi, ERRORS_RO)) {
114 printk(KERN_CRIT "Remounting filesystem read-only\n"); 126 printk(KERN_CRIT "Remounting filesystem read-only\n");
@@ -135,7 +147,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
135} 147}
136 148
137 149
138struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) 150struct inode *nilfs_alloc_inode(struct super_block *sb)
139{ 151{
140 struct nilfs_inode_info *ii; 152 struct nilfs_inode_info *ii;
141 153
@@ -144,70 +156,55 @@ struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
144 return NULL; 156 return NULL;
145 ii->i_bh = NULL; 157 ii->i_bh = NULL;
146 ii->i_state = 0; 158 ii->i_state = 0;
159 ii->i_cno = 0;
147 ii->vfs_inode.i_version = 1; 160 ii->vfs_inode.i_version = 1;
148 nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi); 161 nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
149 return &ii->vfs_inode; 162 return &ii->vfs_inode;
150} 163}
151 164
152struct inode *nilfs_alloc_inode(struct super_block *sb)
153{
154 return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
155}
156
157void nilfs_destroy_inode(struct inode *inode) 165void nilfs_destroy_inode(struct inode *inode)
158{ 166{
159 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 167 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
160}
161
162static void nilfs_clear_inode(struct inode *inode)
163{
164 struct nilfs_inode_info *ii = NILFS_I(inode);
165
166 /*
167 * Free resources allocated in nilfs_read_inode(), here.
168 */
169 BUG_ON(!list_empty(&ii->i_dirty));
170 brelse(ii->i_bh);
171 ii->i_bh = NULL;
172
173 if (test_bit(NILFS_I_BMAP, &ii->i_state))
174 nilfs_bmap_clear(ii->i_bmap);
175 168
176 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 169 if (mdi) {
170 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
171 kfree(mdi);
172 }
173 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
177} 174}
178 175
179static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) 176static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
180{ 177{
181 struct the_nilfs *nilfs = sbi->s_nilfs; 178 struct the_nilfs *nilfs = sbi->s_nilfs;
182 int err; 179 int err;
183 int barrier_done = 0;
184 180
185 if (nilfs_test_opt(sbi, BARRIER)) {
186 set_buffer_ordered(nilfs->ns_sbh[0]);
187 barrier_done = 1;
188 }
189 retry: 181 retry:
190 set_buffer_dirty(nilfs->ns_sbh[0]); 182 set_buffer_dirty(nilfs->ns_sbh[0]);
191 err = sync_dirty_buffer(nilfs->ns_sbh[0]); 183 if (nilfs_test_opt(sbi, BARRIER)) {
192 if (err == -EOPNOTSUPP && barrier_done) { 184 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
193 nilfs_warning(sbi->s_super, __func__, 185 WRITE_SYNC | WRITE_FLUSH_FUA);
194 "barrier-based sync failed. " 186 } else {
195 "disabling barriers\n"); 187 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
196 nilfs_clear_opt(sbi, BARRIER);
197 barrier_done = 0;
198 clear_buffer_ordered(nilfs->ns_sbh[0]);
199 goto retry;
200 } 188 }
189
201 if (unlikely(err)) { 190 if (unlikely(err)) {
202 printk(KERN_ERR 191 printk(KERN_ERR
203 "NILFS: unable to write superblock (err=%d)\n", err); 192 "NILFS: unable to write superblock (err=%d)\n", err);
204 if (err == -EIO && nilfs->ns_sbh[1]) { 193 if (err == -EIO && nilfs->ns_sbh[1]) {
194 /*
 195				 * sbp[0] points to a newer log than sbp[1], so copy
 196				 * sbp[0] to sbp[1] to let the spare take over its role.
197 */
198 memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0],
199 nilfs->ns_sbsize);
205 nilfs_fall_back_super_block(nilfs); 200 nilfs_fall_back_super_block(nilfs);
206 goto retry; 201 goto retry;
207 } 202 }
208 } else { 203 } else {
209 struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; 204 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
210 205
206 nilfs->ns_sbwcount++;
207
211 /* 208 /*
212 * The latest segment becomes trailable from the position 209 * The latest segment becomes trailable from the position
213 * written in superblock. 210 * written in superblock.
@@ -216,66 +213,122 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
216 213
217 /* update GC protection for recent segments */ 214 /* update GC protection for recent segments */
218 if (nilfs->ns_sbh[1]) { 215 if (nilfs->ns_sbh[1]) {
219 sbp = NULL; 216 if (flag == NILFS_SB_COMMIT_ALL) {
220 if (dupsb) {
221 set_buffer_dirty(nilfs->ns_sbh[1]); 217 set_buffer_dirty(nilfs->ns_sbh[1]);
222 if (!sync_dirty_buffer(nilfs->ns_sbh[1])) 218 if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0)
223 sbp = nilfs->ns_sbp[1]; 219 goto out;
224 } 220 }
221 if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) <
222 le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno))
223 sbp = nilfs->ns_sbp[1];
225 } 224 }
226 if (sbp) {
227 spin_lock(&nilfs->ns_last_segment_lock);
228 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
229 spin_unlock(&nilfs->ns_last_segment_lock);
230 }
231 }
232 225
226 spin_lock(&nilfs->ns_last_segment_lock);
227 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
228 spin_unlock(&nilfs->ns_last_segment_lock);
229 }
230 out:
233 return err; 231 return err;
234} 232}
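
For context: the rewritten barrier path above no longer needs the
set_buffer_ordered()/-EOPNOTSUPP retry dance, because after the 2.6.37
block-layer flush rework a flush+FUA request is emulated by the block
layer when the device lacks native support, so that error never reaches
the filesystem.  A sketch of what the single call asks for (the flag
expansion is approximate and informational only, not part of this patch):

	/*
	 * One superblock write, preceded by a volatile-cache flush
	 * (REQ_FLUSH) and forced to stable media (REQ_FUA):
	 *
	 *	WRITE_FLUSH_FUA ~= WRITE_SYNC | REQ_FLUSH | REQ_FUA
	 */
	err = __sync_dirty_buffer(nilfs->ns_sbh[0],
				  WRITE_SYNC | WRITE_FLUSH_FUA);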
235 233
236int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) 234void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
235 struct the_nilfs *nilfs)
236{
237 sector_t nfreeblocks;
238
239 /* nilfs->ns_sem must be locked by the caller. */
240 nilfs_count_free_blocks(nilfs, &nfreeblocks);
241 sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks);
242
243 spin_lock(&nilfs->ns_last_segment_lock);
244 sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
245 sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
246 sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
247 spin_unlock(&nilfs->ns_last_segment_lock);
248}
249
250struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
251 int flip)
237{ 252{
238 struct the_nilfs *nilfs = sbi->s_nilfs; 253 struct the_nilfs *nilfs = sbi->s_nilfs;
239 struct nilfs_super_block **sbp = nilfs->ns_sbp; 254 struct nilfs_super_block **sbp = nilfs->ns_sbp;
240 sector_t nfreeblocks;
241 time_t t;
242 int err;
243 255
244 /* nilfs->sem must be locked by the caller. */ 256 /* nilfs->ns_sem must be locked by the caller. */
245 if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { 257 if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
246 if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) 258 if (sbp[1] &&
247 nilfs_swap_super_block(nilfs); 259 sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
248 else { 260 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
261 } else {
249 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 262 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
250 sbi->s_super->s_id); 263 sbi->s_super->s_id);
251 return -EIO; 264 return NULL;
252 } 265 }
266 } else if (sbp[1] &&
267 sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
268 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
253 } 269 }
254 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
255 if (unlikely(err)) {
256 printk(KERN_ERR "NILFS: failed to count free blocks\n");
257 return err;
258 }
259 spin_lock(&nilfs->ns_last_segment_lock);
260 sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
261 sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
262 sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
263 spin_unlock(&nilfs->ns_last_segment_lock);
264 270
271 if (flip && sbp[1])
272 nilfs_swap_super_block(nilfs);
273
274 return sbp;
275}
276
277int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
278{
279 struct the_nilfs *nilfs = sbi->s_nilfs;
280 struct nilfs_super_block **sbp = nilfs->ns_sbp;
281 time_t t;
282
283 /* nilfs->ns_sem must be locked by the caller. */
265 t = get_seconds(); 284 t = get_seconds();
266 nilfs->ns_sbwtime[0] = t; 285 nilfs->ns_sbwtime = t;
267 sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
268 sbp[0]->s_wtime = cpu_to_le64(t); 286 sbp[0]->s_wtime = cpu_to_le64(t);
269 sbp[0]->s_sum = 0; 287 sbp[0]->s_sum = 0;
270 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, 288 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
271 (unsigned char *)sbp[0], 289 (unsigned char *)sbp[0],
272 nilfs->ns_sbsize)); 290 nilfs->ns_sbsize));
273 if (dupsb && sbp[1]) { 291 if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) {
274 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 292 sbp[1]->s_wtime = sbp[0]->s_wtime;
275 nilfs->ns_sbwtime[1] = t; 293 sbp[1]->s_sum = 0;
294 sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
295 (unsigned char *)sbp[1],
296 nilfs->ns_sbsize));
276 } 297 }
277 clear_nilfs_sb_dirty(nilfs); 298 clear_nilfs_sb_dirty(nilfs);
278 return nilfs_sync_super(sbi, dupsb); 299 return nilfs_sync_super(sbi, flag);
300}
301
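With nilfs_set_log_cursor(), nilfs_prepare_super() and
nilfs_commit_super() in place, superblock updates follow a single
convention: prepare the two on-disk copies (repairing a broken one and
optionally flipping them), fill in the log cursor, then commit either
the primary copy (NILFS_SB_COMMIT) or both copies (NILFS_SB_COMMIT_ALL).
A condensed sketch of the calling pattern, matching the nilfs_sync_fs()
hunk later in this patch; ns_sem must be held for writing:

	down_write(&nilfs->ns_sem);
	if (nilfs_sb_dirty(nilfs)) {
		sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs));
		if (likely(sbp)) {
			nilfs_set_log_cursor(sbp[0], nilfs);
			nilfs_commit_super(sbi, NILFS_SB_COMMIT);
		}
	}
	up_write(&nilfs->ns_sem);
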
302/**
303 * nilfs_cleanup_super() - write filesystem state for cleanup
304 * @sbi: nilfs_sb_info to be unmounted or degraded to read-only
305 *
306 * This function restores state flags in the on-disk super block.
 307 * This will set the "clean" flag (i.e. NILFS_VALID_FS) unless the
 308 * filesystem was unclean when it was mounted.
309 */
310int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
311{
312 struct nilfs_super_block **sbp;
313 int flag = NILFS_SB_COMMIT;
314 int ret = -EIO;
315
316 sbp = nilfs_prepare_super(sbi, 0);
317 if (sbp) {
318 sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state);
319 nilfs_set_log_cursor(sbp[0], sbi->s_nilfs);
320 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
321 /*
 322			 * propagate the "clean" flag to the opposite
 323			 * super block as well if both super blocks
 324			 * point to the same checkpoint.
325 */
326 sbp[1]->s_state = sbp[0]->s_state;
327 flag = NILFS_SB_COMMIT_ALL;
328 }
329 ret = nilfs_commit_super(sbi, flag);
330 }
331 return ret;
279} 332}
280 333
281static void nilfs_put_super(struct super_block *sb) 334static void nilfs_put_super(struct super_block *sb)
@@ -283,34 +336,29 @@ static void nilfs_put_super(struct super_block *sb)
283 struct nilfs_sb_info *sbi = NILFS_SB(sb); 336 struct nilfs_sb_info *sbi = NILFS_SB(sb);
284 struct the_nilfs *nilfs = sbi->s_nilfs; 337 struct the_nilfs *nilfs = sbi->s_nilfs;
285 338
286 lock_kernel();
287
288 nilfs_detach_segment_constructor(sbi); 339 nilfs_detach_segment_constructor(sbi);
289 340
290 if (!(sb->s_flags & MS_RDONLY)) { 341 if (!(sb->s_flags & MS_RDONLY)) {
291 down_write(&nilfs->ns_sem); 342 down_write(&nilfs->ns_sem);
292 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); 343 nilfs_cleanup_super(sbi);
293 nilfs_commit_super(sbi, 1);
294 up_write(&nilfs->ns_sem); 344 up_write(&nilfs->ns_sem);
295 } 345 }
296 down_write(&nilfs->ns_super_sem);
297 if (nilfs->ns_current == sbi)
298 nilfs->ns_current = NULL;
299 up_write(&nilfs->ns_super_sem);
300 346
301 nilfs_detach_checkpoint(sbi); 347 iput(nilfs->ns_sufile);
302 put_nilfs(sbi->s_nilfs); 348 iput(nilfs->ns_cpfile);
349 iput(nilfs->ns_dat);
350
351 destroy_nilfs(nilfs);
303 sbi->s_super = NULL; 352 sbi->s_super = NULL;
304 sb->s_fs_info = NULL; 353 sb->s_fs_info = NULL;
305 nilfs_put_sbinfo(sbi); 354 kfree(sbi);
306
307 unlock_kernel();
308} 355}
309 356
310static int nilfs_sync_fs(struct super_block *sb, int wait) 357static int nilfs_sync_fs(struct super_block *sb, int wait)
311{ 358{
312 struct nilfs_sb_info *sbi = NILFS_SB(sb); 359 struct nilfs_sb_info *sbi = NILFS_SB(sb);
313 struct the_nilfs *nilfs = sbi->s_nilfs; 360 struct the_nilfs *nilfs = sbi->s_nilfs;
361 struct nilfs_super_block **sbp;
314 int err = 0; 362 int err = 0;
315 363
316 /* This function is called when super block should be written back */ 364 /* This function is called when super block should be written back */
@@ -318,27 +366,34 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
318 err = nilfs_construct_segment(sb); 366 err = nilfs_construct_segment(sb);
319 367
320 down_write(&nilfs->ns_sem); 368 down_write(&nilfs->ns_sem);
321 if (nilfs_sb_dirty(nilfs)) 369 if (nilfs_sb_dirty(nilfs)) {
322 nilfs_commit_super(sbi, 1); 370 sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs));
371 if (likely(sbp)) {
372 nilfs_set_log_cursor(sbp[0], nilfs);
373 nilfs_commit_super(sbi, NILFS_SB_COMMIT);
374 }
375 }
323 up_write(&nilfs->ns_sem); 376 up_write(&nilfs->ns_sem);
324 377
325 return err; 378 return err;
326} 379}
327 380
328int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) 381int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
382 struct nilfs_root **rootp)
329{ 383{
330 struct the_nilfs *nilfs = sbi->s_nilfs; 384 struct the_nilfs *nilfs = sbi->s_nilfs;
385 struct nilfs_root *root;
331 struct nilfs_checkpoint *raw_cp; 386 struct nilfs_checkpoint *raw_cp;
332 struct buffer_head *bh_cp; 387 struct buffer_head *bh_cp;
333 int err; 388 int err = -ENOMEM;
334 389
335 down_write(&nilfs->ns_super_sem); 390 root = nilfs_find_or_create_root(
336 list_add(&sbi->s_list, &nilfs->ns_supers); 391 nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno);
337 up_write(&nilfs->ns_super_sem); 392 if (!root)
393 return err;
338 394
339 sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); 395 if (root->ifile)
340 if (!sbi->s_ifile) 396 goto reuse; /* already attached checkpoint */
341 return -ENOMEM;
342 397
343 down_read(&nilfs->ns_segctor_sem); 398 down_read(&nilfs->ns_segctor_sem);
344 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 399 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -354,44 +409,64 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
354 } 409 }
355 goto failed; 410 goto failed;
356 } 411 }
357 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); 412
358 if (unlikely(err)) 413 err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size,
414 &raw_cp->cp_ifile_inode, &root->ifile);
415 if (err)
359 goto failed_bh; 416 goto failed_bh;
360 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); 417
361 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); 418 atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
419 atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
362 420
363 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 421 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
422
423 reuse:
424 *rootp = root;
364 return 0; 425 return 0;
365 426
366 failed_bh: 427 failed_bh:
367 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 428 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
368 failed: 429 failed:
369 nilfs_mdt_destroy(sbi->s_ifile); 430 nilfs_put_root(root);
370 sbi->s_ifile = NULL;
371 431
372 down_write(&nilfs->ns_super_sem); 432 return err;
373 list_del_init(&sbi->s_list); 433}
374 up_write(&nilfs->ns_super_sem);
375 434
435static int nilfs_freeze(struct super_block *sb)
436{
437 struct nilfs_sb_info *sbi = NILFS_SB(sb);
438 struct the_nilfs *nilfs = sbi->s_nilfs;
439 int err;
440
441 if (sb->s_flags & MS_RDONLY)
442 return 0;
443
444 /* Mark super block clean */
445 down_write(&nilfs->ns_sem);
446 err = nilfs_cleanup_super(sbi);
447 up_write(&nilfs->ns_sem);
376 return err; 448 return err;
377} 449}
378 450
379void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) 451static int nilfs_unfreeze(struct super_block *sb)
380{ 452{
453 struct nilfs_sb_info *sbi = NILFS_SB(sb);
381 struct the_nilfs *nilfs = sbi->s_nilfs; 454 struct the_nilfs *nilfs = sbi->s_nilfs;
382 455
383 nilfs_mdt_destroy(sbi->s_ifile); 456 if (sb->s_flags & MS_RDONLY)
384 sbi->s_ifile = NULL; 457 return 0;
385 down_write(&nilfs->ns_super_sem); 458
386 list_del_init(&sbi->s_list); 459 down_write(&nilfs->ns_sem);
387 up_write(&nilfs->ns_super_sem); 460 nilfs_setup_super(sbi, false);
461 up_write(&nilfs->ns_sem);
462 return 0;
388} 463}
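
nilfs_freeze() and nilfs_unfreeze() are reached through the
freeze_fs/unfreeze_fs hooks wired into nilfs_sops below; from user space
the usual trigger is the FIFREEZE/FITHAW ioctl pair.  A minimal
user-space sketch (the mount point is a placeholder):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int freeze_thaw(const char *mntpoint)
	{
		int fd = open(mntpoint, O_RDONLY);
		int err;

		if (fd < 0)
			return -1;
		err = ioctl(fd, FIFREEZE, 0);	/* ->freeze_fs: sb marked clean */
		if (!err)
			err = ioctl(fd, FITHAW, 0); /* ->unfreeze_fs: clean flag dropped */
		close(fd);
		return err;
	}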
389 464
390static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) 465static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
391{ 466{
392 struct super_block *sb = dentry->d_sb; 467 struct super_block *sb = dentry->d_sb;
393 struct nilfs_sb_info *sbi = NILFS_SB(sb); 468 struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
394 struct the_nilfs *nilfs = sbi->s_nilfs; 469 struct the_nilfs *nilfs = root->nilfs;
395 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 470 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
396 unsigned long long blocks; 471 unsigned long long blocks;
397 unsigned long overhead; 472 unsigned long overhead;
@@ -427,7 +502,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
427 buf->f_bfree = nfreeblocks; 502 buf->f_bfree = nfreeblocks;
428 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? 503 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
429 (buf->f_bfree - nrsvblocks) : 0; 504 (buf->f_bfree - nrsvblocks) : 0;
430 buf->f_files = atomic_read(&sbi->s_inodes_count); 505 buf->f_files = atomic_read(&root->inodes_count);
431 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ 506 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
432 buf->f_namelen = NILFS_NAME_LEN; 507 buf->f_namelen = NILFS_NAME_LEN;
433 buf->f_fsid.val[0] = (u32)id; 508 buf->f_fsid.val[0] = (u32)id;
@@ -440,22 +515,22 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
440{ 515{
441 struct super_block *sb = vfs->mnt_sb; 516 struct super_block *sb = vfs->mnt_sb;
442 struct nilfs_sb_info *sbi = NILFS_SB(sb); 517 struct nilfs_sb_info *sbi = NILFS_SB(sb);
518 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
443 519
444 if (!nilfs_test_opt(sbi, BARRIER)) 520 if (!nilfs_test_opt(sbi, BARRIER))
445 seq_printf(seq, ",nobarrier"); 521 seq_puts(seq, ",nobarrier");
446 if (nilfs_test_opt(sbi, SNAPSHOT)) 522 if (root->cno != NILFS_CPTREE_CURRENT_CNO)
447 seq_printf(seq, ",cp=%llu", 523 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
448 (unsigned long long int)sbi->s_snapshot_cno);
449 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 524 if (nilfs_test_opt(sbi, ERRORS_PANIC))
450 seq_printf(seq, ",errors=panic"); 525 seq_puts(seq, ",errors=panic");
451 if (nilfs_test_opt(sbi, ERRORS_CONT)) 526 if (nilfs_test_opt(sbi, ERRORS_CONT))
452 seq_printf(seq, ",errors=continue"); 527 seq_puts(seq, ",errors=continue");
453 if (nilfs_test_opt(sbi, STRICT_ORDER)) 528 if (nilfs_test_opt(sbi, STRICT_ORDER))
454 seq_printf(seq, ",order=strict"); 529 seq_puts(seq, ",order=strict");
455 if (nilfs_test_opt(sbi, NORECOVERY)) 530 if (nilfs_test_opt(sbi, NORECOVERY))
456 seq_printf(seq, ",norecovery"); 531 seq_puts(seq, ",norecovery");
457 if (nilfs_test_opt(sbi, DISCARD)) 532 if (nilfs_test_opt(sbi, DISCARD))
458 seq_printf(seq, ",discard"); 533 seq_puts(seq, ",discard");
459 534
460 return 0; 535 return 0;
461} 536}
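
The options printed above are the same set that parse_options() below
accepts at mount time; note that "cp=" (a snapshot mount) is only valid
together with a read-only mount.  An illustrative call via mount(2),
with hypothetical device, mount point and checkpoint number:

	#include <sys/mount.h>

	/* Mount checkpoint 5 of a nilfs2 volume as a read-only
	 * snapshot; "/dev/sdb1" and "/mnt/snap" are placeholders. */
	int mount_snapshot(void)
	{
		return mount("/dev/sdb1", "/mnt/snap", "nilfs2",
			     MS_RDONLY, "cp=5,nobarrier");
	}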
@@ -467,85 +542,45 @@ static const struct super_operations nilfs_sops = {
467 /* .write_inode = nilfs_write_inode, */ 542 /* .write_inode = nilfs_write_inode, */
468 /* .put_inode = nilfs_put_inode, */ 543 /* .put_inode = nilfs_put_inode, */
469 /* .drop_inode = nilfs_drop_inode, */ 544 /* .drop_inode = nilfs_drop_inode, */
470 .delete_inode = nilfs_delete_inode, 545 .evict_inode = nilfs_evict_inode,
471 .put_super = nilfs_put_super, 546 .put_super = nilfs_put_super,
472 /* .write_super = nilfs_write_super, */ 547 /* .write_super = nilfs_write_super, */
473 .sync_fs = nilfs_sync_fs, 548 .sync_fs = nilfs_sync_fs,
549 .freeze_fs = nilfs_freeze,
550 .unfreeze_fs = nilfs_unfreeze,
474 /* .write_super_lockfs */ 551 /* .write_super_lockfs */
475 /* .unlockfs */ 552 /* .unlockfs */
476 .statfs = nilfs_statfs, 553 .statfs = nilfs_statfs,
477 .remount_fs = nilfs_remount, 554 .remount_fs = nilfs_remount,
478 .clear_inode = nilfs_clear_inode,
479 /* .umount_begin */ 555 /* .umount_begin */
480 .show_options = nilfs_show_options 556 .show_options = nilfs_show_options
481}; 557};
482 558
483static struct inode *
484nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
485{
486 struct inode *inode;
487
488 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
489 ino != NILFS_SKETCH_INO)
490 return ERR_PTR(-ESTALE);
491
492 inode = nilfs_iget(sb, ino);
493 if (IS_ERR(inode))
494 return ERR_CAST(inode);
495 if (generation && inode->i_generation != generation) {
496 iput(inode);
497 return ERR_PTR(-ESTALE);
498 }
499
500 return inode;
501}
502
503static struct dentry *
504nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
505 int fh_type)
506{
507 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
508 nilfs_nfs_get_inode);
509}
510
511static struct dentry *
512nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
513 int fh_type)
514{
515 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
516 nilfs_nfs_get_inode);
517}
518
519static const struct export_operations nilfs_export_ops = {
520 .fh_to_dentry = nilfs_fh_to_dentry,
521 .fh_to_parent = nilfs_fh_to_parent,
522 .get_parent = nilfs_get_parent,
523};
524
525enum { 559enum {
526 Opt_err_cont, Opt_err_panic, Opt_err_ro, 560 Opt_err_cont, Opt_err_panic, Opt_err_ro,
527 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 561 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
528 Opt_discard, Opt_err, 562 Opt_discard, Opt_nodiscard, Opt_err,
529}; 563};
530 564
531static match_table_t tokens = { 565static match_table_t tokens = {
532 {Opt_err_cont, "errors=continue"}, 566 {Opt_err_cont, "errors=continue"},
533 {Opt_err_panic, "errors=panic"}, 567 {Opt_err_panic, "errors=panic"},
534 {Opt_err_ro, "errors=remount-ro"}, 568 {Opt_err_ro, "errors=remount-ro"},
569 {Opt_barrier, "barrier"},
535 {Opt_nobarrier, "nobarrier"}, 570 {Opt_nobarrier, "nobarrier"},
536 {Opt_snapshot, "cp=%u"}, 571 {Opt_snapshot, "cp=%u"},
537 {Opt_order, "order=%s"}, 572 {Opt_order, "order=%s"},
538 {Opt_norecovery, "norecovery"}, 573 {Opt_norecovery, "norecovery"},
539 {Opt_discard, "discard"}, 574 {Opt_discard, "discard"},
575 {Opt_nodiscard, "nodiscard"},
540 {Opt_err, NULL} 576 {Opt_err, NULL}
541}; 577};
542 578
543static int parse_options(char *options, struct super_block *sb) 579static int parse_options(char *options, struct super_block *sb, int is_remount)
544{ 580{
545 struct nilfs_sb_info *sbi = NILFS_SB(sb); 581 struct nilfs_sb_info *sbi = NILFS_SB(sb);
546 char *p; 582 char *p;
547 substring_t args[MAX_OPT_ARGS]; 583 substring_t args[MAX_OPT_ARGS];
548 int option;
549 584
550 if (!options) 585 if (!options)
551 return 1; 586 return 1;
@@ -557,6 +592,9 @@ static int parse_options(char *options, struct super_block *sb)
557 592
558 token = match_token(p, tokens, args); 593 token = match_token(p, tokens, args);
559 switch (token) { 594 switch (token) {
595 case Opt_barrier:
596 nilfs_set_opt(sbi, BARRIER);
597 break;
560 case Opt_nobarrier: 598 case Opt_nobarrier:
561 nilfs_clear_opt(sbi, BARRIER); 599 nilfs_clear_opt(sbi, BARRIER);
562 break; 600 break;
@@ -580,12 +618,12 @@ static int parse_options(char *options, struct super_block *sb)
580 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); 618 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
581 break; 619 break;
582 case Opt_snapshot: 620 case Opt_snapshot:
583 if (match_int(&args[0], &option) || option <= 0) 621 if (is_remount) {
584 return 0; 622 printk(KERN_ERR
585 if (!(sb->s_flags & MS_RDONLY)) 623 "NILFS: \"%s\" option is invalid "
624 "for remount.\n", p);
586 return 0; 625 return 0;
587 sbi->s_snapshot_cno = option; 626 }
588 nilfs_set_opt(sbi, SNAPSHOT);
589 break; 627 break;
590 case Opt_norecovery: 628 case Opt_norecovery:
591 nilfs_set_opt(sbi, NORECOVERY); 629 nilfs_set_opt(sbi, NORECOVERY);
@@ -593,6 +631,9 @@ static int parse_options(char *options, struct super_block *sb)
593 case Opt_discard: 631 case Opt_discard:
594 nilfs_set_opt(sbi, DISCARD); 632 nilfs_set_opt(sbi, DISCARD);
595 break; 633 break;
634 case Opt_nodiscard:
635 nilfs_clear_opt(sbi, DISCARD);
636 break;
596 default: 637 default:
597 printk(KERN_ERR 638 printk(KERN_ERR
598 "NILFS: Unrecognized mount option \"%s\"\n", p); 639 "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -610,14 +651,24 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
610 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; 651 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
611} 652}
612 653
613static int nilfs_setup_super(struct nilfs_sb_info *sbi) 654static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount)
614{ 655{
615 struct the_nilfs *nilfs = sbi->s_nilfs; 656 struct the_nilfs *nilfs = sbi->s_nilfs;
616 struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; 657 struct nilfs_super_block **sbp;
617 int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count); 658 int max_mnt_count;
618 int mnt_count = le16_to_cpu(sbp->s_mnt_count); 659 int mnt_count;
660
661 /* nilfs->ns_sem must be locked by the caller. */
662 sbp = nilfs_prepare_super(sbi, 0);
663 if (!sbp)
664 return -EIO;
665
666 if (!is_mount)
667 goto skip_mount_setup;
668
669 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
670 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
619 671
620 /* nilfs->sem must be locked by the caller. */
621 if (nilfs->ns_mount_state & NILFS_ERROR_FS) { 672 if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
622 printk(KERN_WARNING 673 printk(KERN_WARNING
623 "NILFS warning: mounting fs with errors\n"); 674 "NILFS warning: mounting fs with errors\n");
@@ -628,12 +679,17 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
628#endif 679#endif
629 } 680 }
630 if (!max_mnt_count) 681 if (!max_mnt_count)
631 sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); 682 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
683
684 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
685 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
632 686
633 sbp->s_mnt_count = cpu_to_le16(mnt_count + 1); 687skip_mount_setup:
634 sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS); 688 sbp[0]->s_state =
635 sbp->s_mtime = cpu_to_le64(get_seconds()); 689 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
636 return nilfs_commit_super(sbi, 1); 690 /* synchronize sbp[1] with sbp[0] */
691 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
692 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
637} 693}
638 694
639struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, 695struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -670,7 +726,165 @@ int nilfs_store_magic_and_option(struct super_block *sb,
670 sbi->s_interval = le32_to_cpu(sbp->s_c_interval); 726 sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
671 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); 727 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
672 728
 673	return !parse_options(data, sb) ? -EINVAL : 0; 729	return !parse_options(data, sb, 0) ? -EINVAL : 0;
730}
731
732int nilfs_check_feature_compatibility(struct super_block *sb,
733 struct nilfs_super_block *sbp)
734{
735 __u64 features;
736
737 features = le64_to_cpu(sbp->s_feature_incompat) &
738 ~NILFS_FEATURE_INCOMPAT_SUPP;
739 if (features) {
740 printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
741 "optional features (%llx)\n",
742 (unsigned long long)features);
743 return -EINVAL;
744 }
745 features = le64_to_cpu(sbp->s_feature_compat_ro) &
746 ~NILFS_FEATURE_COMPAT_RO_SUPP;
747 if (!(sb->s_flags & MS_RDONLY) && features) {
748 printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
749 "unsupported optional features (%llx)\n",
750 (unsigned long long)features);
751 return -EINVAL;
752 }
753 return 0;
754}
755
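
The masking in nilfs_check_feature_compatibility() refuses a mount only
for feature bits this kernel does not know about; a worked example with
hypothetical bit values:

	/*
	 *   s_feature_incompat          = 0x0003  (on disk)
	 *   NILFS_FEATURE_INCOMPAT_SUPP = 0x0001  (known to this kernel)
	 *   0x0003 & ~0x0001            = 0x0002  -> unknown bit set,
	 *                                           mount fails with -EINVAL
	 */
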
756static int nilfs_get_root_dentry(struct super_block *sb,
757 struct nilfs_root *root,
758 struct dentry **root_dentry)
759{
760 struct inode *inode;
761 struct dentry *dentry;
762 int ret = 0;
763
764 inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
765 if (IS_ERR(inode)) {
766 printk(KERN_ERR "NILFS: get root inode failed\n");
767 ret = PTR_ERR(inode);
768 goto out;
769 }
770 if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
771 iput(inode);
772 printk(KERN_ERR "NILFS: corrupt root inode.\n");
773 ret = -EINVAL;
774 goto out;
775 }
776
777 if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
778 dentry = d_find_alias(inode);
779 if (!dentry) {
780 dentry = d_alloc_root(inode);
781 if (!dentry) {
782 iput(inode);
783 ret = -ENOMEM;
784 goto failed_dentry;
785 }
786 } else {
787 iput(inode);
788 }
789 } else {
790 dentry = d_obtain_alias(inode);
791 if (IS_ERR(dentry)) {
792 ret = PTR_ERR(dentry);
793 goto failed_dentry;
794 }
795 }
796 *root_dentry = dentry;
797 out:
798 return ret;
799
800 failed_dentry:
801 printk(KERN_ERR "NILFS: get root dentry failed\n");
802 goto out;
803}
804
805static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
806 struct dentry **root_dentry)
807{
808 struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs;
809 struct nilfs_root *root;
810 int ret;
811
812 down_read(&nilfs->ns_segctor_sem);
813 ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno);
814 up_read(&nilfs->ns_segctor_sem);
815 if (ret < 0) {
816 ret = (ret == -ENOENT) ? -EINVAL : ret;
817 goto out;
818 } else if (!ret) {
819 printk(KERN_ERR "NILFS: The specified checkpoint is "
820 "not a snapshot (checkpoint number=%llu).\n",
821 (unsigned long long)cno);
822 ret = -EINVAL;
823 goto out;
824 }
825
826 ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root);
827 if (ret) {
828 printk(KERN_ERR "NILFS: error loading snapshot "
829 "(checkpoint number=%llu).\n",
830 (unsigned long long)cno);
831 goto out;
832 }
833 ret = nilfs_get_root_dentry(s, root, root_dentry);
834 nilfs_put_root(root);
835 out:
836 return ret;
837}
838
839static int nilfs_tree_was_touched(struct dentry *root_dentry)
840{
841 return atomic_read(&root_dentry->d_count) > 1;
842}
843
844/**
845 * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint
846 * @root_dentry: root dentry of the tree to be shrunk
847 *
 848 * This function returns true if the tree is still in use.
849 */
850static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
851{
852 if (have_submounts(root_dentry))
853 return true;
854 shrink_dcache_parent(root_dentry);
855 return nilfs_tree_was_touched(root_dentry);
856}
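
Both helpers rely on one reference-count invariant: an idle tree's root
dentry holds exactly one long-lived reference (the one that pins it),
so a d_count above 1 means cached child dentries, an existing mount, or
other active users under that root; shrink_dcache_parent() discards the
cached-only ones so that an elevated count afterwards indicates a
genuine user.  This is how nilfs_mount() applies the pair later in this
patch:

	/* busy stays false when the extra references were cache only */
	int busy = false;

	if (nilfs_tree_was_touched(s->s_root))	/* d_count > 1 ? */
		busy = nilfs_try_to_shrink_tree(s->s_root);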
857
858int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
859{
860 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
861 struct nilfs_root *root;
862 struct inode *inode;
863 struct dentry *dentry;
864 int ret;
865
866 if (cno < 0 || cno > nilfs->ns_cno)
867 return false;
868
869 if (cno >= nilfs_last_cno(nilfs))
870 return true; /* protect recent checkpoints */
871
872 ret = false;
873 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
874 if (root) {
875 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
876 if (inode) {
877 dentry = d_find_alias(inode);
878 if (dentry) {
879 if (nilfs_tree_was_touched(dentry))
880 ret = nilfs_try_to_shrink_tree(dentry);
881 dput(dentry);
882 }
883 iput(inode);
884 }
885 nilfs_put_root(root);
886 }
887 return ret;
674} 888}
675 889
676/** 890/**
@@ -678,17 +892,17 @@ int nilfs_store_magic_and_option(struct super_block *sb,
678 * @sb: super_block 892 * @sb: super_block
679 * @data: mount options 893 * @data: mount options
680 * @silent: silent mode flag 894 * @silent: silent mode flag
681 * @nilfs: the_nilfs struct
682 * 895 *
 683 * This function runs exclusively under nilfs->ns_mount_mutex, so the 896 * This function runs exclusively under nilfs->ns_mount_mutex, so the
 684 * recovery process is protected from other simultaneous mounts. 897 * recovery process is protected from other simultaneous mounts.
685 */ 898 */
686static int 899static int
687nilfs_fill_super(struct super_block *sb, void *data, int silent, 900nilfs_fill_super(struct super_block *sb, void *data, int silent)
688 struct the_nilfs *nilfs)
689{ 901{
902 struct the_nilfs *nilfs;
690 struct nilfs_sb_info *sbi; 903 struct nilfs_sb_info *sbi;
691 struct inode *root; 904 struct nilfs_root *fsroot;
905 struct backing_dev_info *bdi;
692 __u64 cno; 906 __u64 cno;
693 int err; 907 int err;
694 908
@@ -697,19 +911,21 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
697 return -ENOMEM; 911 return -ENOMEM;
698 912
699 sb->s_fs_info = sbi; 913 sb->s_fs_info = sbi;
914 sbi->s_super = sb;
700 915
701 get_nilfs(nilfs); 916 nilfs = alloc_nilfs(sb->s_bdev);
917 if (!nilfs) {
918 err = -ENOMEM;
919 goto failed_sbi;
920 }
702 sbi->s_nilfs = nilfs; 921 sbi->s_nilfs = nilfs;
703 sbi->s_super = sb;
704 atomic_set(&sbi->s_count, 1);
705 922
706 err = init_nilfs(nilfs, sbi, (char *)data); 923 err = init_nilfs(nilfs, sbi, (char *)data);
707 if (err) 924 if (err)
708 goto failed_sbi; 925 goto failed_nilfs;
709 926
710 spin_lock_init(&sbi->s_inode_lock); 927 spin_lock_init(&sbi->s_inode_lock);
711 INIT_LIST_HEAD(&sbi->s_dirty_files); 928 INIT_LIST_HEAD(&sbi->s_dirty_files);
712 INIT_LIST_HEAD(&sbi->s_list);
713 929
714 /* 930 /*
715 * Following initialization is overlapped because 931 * Following initialization is overlapped because
@@ -725,141 +941,80 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
725 sb->s_export_op = &nilfs_export_ops; 941 sb->s_export_op = &nilfs_export_ops;
726 sb->s_root = NULL; 942 sb->s_root = NULL;
727 sb->s_time_gran = 1; 943 sb->s_time_gran = 1;
728 sb->s_bdi = nilfs->ns_bdi; 944
945 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
946 sb->s_bdi = bdi ? : &default_backing_dev_info;
729 947
730 err = load_nilfs(nilfs, sbi); 948 err = load_nilfs(nilfs, sbi);
731 if (err) 949 if (err)
732 goto failed_sbi; 950 goto failed_nilfs;
733 951
734 cno = nilfs_last_cno(nilfs); 952 cno = nilfs_last_cno(nilfs);
735 953 err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
736 if (sb->s_flags & MS_RDONLY) {
737 if (nilfs_test_opt(sbi, SNAPSHOT)) {
738 down_read(&nilfs->ns_segctor_sem);
739 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
740 sbi->s_snapshot_cno);
741 up_read(&nilfs->ns_segctor_sem);
742 if (err < 0) {
743 if (err == -ENOENT)
744 err = -EINVAL;
745 goto failed_sbi;
746 }
747 if (!err) {
748 printk(KERN_ERR
749 "NILFS: The specified checkpoint is "
750 "not a snapshot "
751 "(checkpoint number=%llu).\n",
752 (unsigned long long)sbi->s_snapshot_cno);
753 err = -EINVAL;
754 goto failed_sbi;
755 }
756 cno = sbi->s_snapshot_cno;
757 }
758 }
759
760 err = nilfs_attach_checkpoint(sbi, cno);
761 if (err) { 954 if (err) {
762 printk(KERN_ERR "NILFS: error loading a checkpoint" 955 printk(KERN_ERR "NILFS: error loading last checkpoint "
763 " (checkpoint number=%llu).\n", (unsigned long long)cno); 956 "(checkpoint number=%llu).\n", (unsigned long long)cno);
764 goto failed_sbi; 957 goto failed_unload;
765 } 958 }
766 959
767 if (!(sb->s_flags & MS_RDONLY)) { 960 if (!(sb->s_flags & MS_RDONLY)) {
768 err = nilfs_attach_segment_constructor(sbi); 961 err = nilfs_attach_segment_constructor(sbi, fsroot);
769 if (err) 962 if (err)
770 goto failed_checkpoint; 963 goto failed_checkpoint;
771 } 964 }
772 965
773 root = nilfs_iget(sb, NILFS_ROOT_INO); 966 err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root);
774 if (IS_ERR(root)) { 967 if (err)
775 printk(KERN_ERR "NILFS: get root inode failed\n");
776 err = PTR_ERR(root);
777 goto failed_segctor;
778 }
779 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
780 iput(root);
781 printk(KERN_ERR "NILFS: corrupt root inode.\n");
782 err = -EINVAL;
783 goto failed_segctor;
784 }
785 sb->s_root = d_alloc_root(root);
786 if (!sb->s_root) {
787 iput(root);
788 printk(KERN_ERR "NILFS: get root dentry failed\n");
789 err = -ENOMEM;
790 goto failed_segctor; 968 goto failed_segctor;
791 } 969
970 nilfs_put_root(fsroot);
792 971
793 if (!(sb->s_flags & MS_RDONLY)) { 972 if (!(sb->s_flags & MS_RDONLY)) {
794 down_write(&nilfs->ns_sem); 973 down_write(&nilfs->ns_sem);
795 nilfs_setup_super(sbi); 974 nilfs_setup_super(sbi, true);
796 up_write(&nilfs->ns_sem); 975 up_write(&nilfs->ns_sem);
797 } 976 }
798 977
799 down_write(&nilfs->ns_super_sem);
800 if (!nilfs_test_opt(sbi, SNAPSHOT))
801 nilfs->ns_current = sbi;
802 up_write(&nilfs->ns_super_sem);
803
804 return 0; 978 return 0;
805 979
806 failed_segctor: 980 failed_segctor:
807 nilfs_detach_segment_constructor(sbi); 981 nilfs_detach_segment_constructor(sbi);
808 982
809 failed_checkpoint: 983 failed_checkpoint:
810 nilfs_detach_checkpoint(sbi); 984 nilfs_put_root(fsroot);
985
986 failed_unload:
987 iput(nilfs->ns_sufile);
988 iput(nilfs->ns_cpfile);
989 iput(nilfs->ns_dat);
990
991 failed_nilfs:
992 destroy_nilfs(nilfs);
811 993
812 failed_sbi: 994 failed_sbi:
813 put_nilfs(nilfs);
814 sb->s_fs_info = NULL; 995 sb->s_fs_info = NULL;
815 nilfs_put_sbinfo(sbi); 996 kfree(sbi);
816 return err; 997 return err;
817} 998}
818 999
819static int nilfs_remount(struct super_block *sb, int *flags, char *data) 1000static int nilfs_remount(struct super_block *sb, int *flags, char *data)
820{ 1001{
821 struct nilfs_sb_info *sbi = NILFS_SB(sb); 1002 struct nilfs_sb_info *sbi = NILFS_SB(sb);
822 struct nilfs_super_block *sbp;
823 struct the_nilfs *nilfs = sbi->s_nilfs; 1003 struct the_nilfs *nilfs = sbi->s_nilfs;
824 unsigned long old_sb_flags; 1004 unsigned long old_sb_flags;
825 struct nilfs_mount_options old_opts; 1005 struct nilfs_mount_options old_opts;
826 int was_snapshot, err; 1006 int err;
827
828 lock_kernel();
829 1007
830 down_write(&nilfs->ns_super_sem);
831 old_sb_flags = sb->s_flags; 1008 old_sb_flags = sb->s_flags;
832 old_opts.mount_opt = sbi->s_mount_opt; 1009 old_opts.mount_opt = sbi->s_mount_opt;
833 old_opts.snapshot_cno = sbi->s_snapshot_cno;
834 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
835 1010
836 if (!parse_options(data, sb)) { 1011 if (!parse_options(data, sb, 1)) {
837 err = -EINVAL; 1012 err = -EINVAL;
838 goto restore_opts; 1013 goto restore_opts;
839 } 1014 }
840 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 1015 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
841 1016
842 err = -EINVAL; 1017 err = -EINVAL;
843 if (was_snapshot) {
844 if (!(*flags & MS_RDONLY)) {
845 printk(KERN_ERR "NILFS (device %s): cannot remount "
846 "snapshot read/write.\n",
847 sb->s_id);
848 goto restore_opts;
849 } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
850 printk(KERN_ERR "NILFS (device %s): cannot "
851 "remount to a different snapshot.\n",
852 sb->s_id);
853 goto restore_opts;
854 }
855 } else {
856 if (nilfs_test_opt(sbi, SNAPSHOT)) {
857 printk(KERN_ERR "NILFS (device %s): cannot change "
858 "a regular mount to a snapshot.\n",
859 sb->s_id);
860 goto restore_opts;
861 }
862 }
863 1018
864 if (!nilfs_valid_fs(nilfs)) { 1019 if (!nilfs_valid_fs(nilfs)) {
865 printk(KERN_WARNING "NILFS (device %s): couldn't " 1020 printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -880,40 +1035,47 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
880 * the RDONLY flag and then mark the partition as valid again. 1035 * the RDONLY flag and then mark the partition as valid again.
881 */ 1036 */
882 down_write(&nilfs->ns_sem); 1037 down_write(&nilfs->ns_sem);
883 sbp = nilfs->ns_sbp[0]; 1038 nilfs_cleanup_super(sbi);
884 if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
885 (nilfs->ns_mount_state & NILFS_VALID_FS))
886 sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
887 sbp->s_mtime = cpu_to_le64(get_seconds());
888 nilfs_commit_super(sbi, 1);
889 up_write(&nilfs->ns_sem); 1039 up_write(&nilfs->ns_sem);
890 } else { 1040 } else {
1041 __u64 features;
1042 struct nilfs_root *root;
1043
891 /* 1044 /*
892 * Mounting a RDONLY partition read-write, so reread and 1045 * Mounting a RDONLY partition read-write, so reread and
893 * store the current valid flag. (It may have been changed 1046 * store the current valid flag. (It may have been changed
894 * by fsck since we originally mounted the partition.) 1047 * by fsck since we originally mounted the partition.)
895 */ 1048 */
1049 down_read(&nilfs->ns_sem);
1050 features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
1051 ~NILFS_FEATURE_COMPAT_RO_SUPP;
1052 up_read(&nilfs->ns_sem);
1053 if (features) {
1054 printk(KERN_WARNING "NILFS (device %s): couldn't "
1055 "remount RDWR because of unsupported optional "
1056 "features (%llx)\n",
1057 sb->s_id, (unsigned long long)features);
1058 err = -EROFS;
1059 goto restore_opts;
1060 }
1061
896 sb->s_flags &= ~MS_RDONLY; 1062 sb->s_flags &= ~MS_RDONLY;
897 1063
898 err = nilfs_attach_segment_constructor(sbi); 1064 root = NILFS_I(sb->s_root->d_inode)->i_root;
1065 err = nilfs_attach_segment_constructor(sbi, root);
899 if (err) 1066 if (err)
900 goto restore_opts; 1067 goto restore_opts;
901 1068
902 down_write(&nilfs->ns_sem); 1069 down_write(&nilfs->ns_sem);
903 nilfs_setup_super(sbi); 1070 nilfs_setup_super(sbi, true);
904 up_write(&nilfs->ns_sem); 1071 up_write(&nilfs->ns_sem);
905 } 1072 }
906 out: 1073 out:
907 up_write(&nilfs->ns_super_sem);
908 unlock_kernel();
909 return 0; 1074 return 0;
910 1075
911 restore_opts: 1076 restore_opts:
912 sb->s_flags = old_sb_flags; 1077 sb->s_flags = old_sb_flags;
913 sbi->s_mount_opt = old_opts.mount_opt; 1078 sbi->s_mount_opt = old_opts.mount_opt;
914 sbi->s_snapshot_cno = old_opts.snapshot_cno;
915 up_write(&nilfs->ns_super_sem);
916 unlock_kernel();
917 return err; 1079 return err;
918} 1080}
919 1081
@@ -933,7 +1095,7 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
933{ 1095{
934 char *p, *options = data; 1096 char *p, *options = data;
935 substring_t args[MAX_OPT_ARGS]; 1097 substring_t args[MAX_OPT_ARGS];
936 int option, token; 1098 int token;
937 int ret = 0; 1099 int ret = 0;
938 1100
939 do { 1101 do {
@@ -941,16 +1103,18 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
941 if (p != NULL && *p) { 1103 if (p != NULL && *p) {
942 token = match_token(p, tokens, args); 1104 token = match_token(p, tokens, args);
943 if (token == Opt_snapshot) { 1105 if (token == Opt_snapshot) {
944 if (!(sd->flags & MS_RDONLY)) 1106 if (!(sd->flags & MS_RDONLY)) {
945 ret++; 1107 ret++;
946 else { 1108 } else {
947 ret = match_int(&args[0], &option); 1109 sd->cno = simple_strtoull(args[0].from,
948 if (!ret) { 1110 NULL, 0);
949 if (option > 0) 1111 /*
950 sd->cno = option; 1112 * No need to see the end pointer;
951 else 1113 * match_token() has done syntax
952 ret++; 1114 * checking.
953 } 1115 */
1116 if (sd->cno == 0)
1117 ret++;
954 } 1118 }
955 } 1119 }
956 if (ret) 1120 if (ret)
@@ -967,43 +1131,33 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
967 1131
968static int nilfs_set_bdev_super(struct super_block *s, void *data) 1132static int nilfs_set_bdev_super(struct super_block *s, void *data)
969{ 1133{
970 struct nilfs_super_data *sd = data; 1134 s->s_bdev = data;
971
972 s->s_bdev = sd->bdev;
973 s->s_dev = s->s_bdev->bd_dev; 1135 s->s_dev = s->s_bdev->bd_dev;
974 return 0; 1136 return 0;
975} 1137}
976 1138
977static int nilfs_test_bdev_super(struct super_block *s, void *data) 1139static int nilfs_test_bdev_super(struct super_block *s, void *data)
978{ 1140{
979 struct nilfs_super_data *sd = data; 1141 return (void *)s->s_bdev == data;
980
981 return sd->sbi && s->s_fs_info == (void *)sd->sbi;
982} 1142}
983 1143
984static int 1144static struct dentry *
985nilfs_get_sb(struct file_system_type *fs_type, int flags, 1145nilfs_mount(struct file_system_type *fs_type, int flags,
986 const char *dev_name, void *data, struct vfsmount *mnt) 1146 const char *dev_name, void *data)
987{ 1147{
988 struct nilfs_super_data sd; 1148 struct nilfs_super_data sd;
989 struct super_block *s; 1149 struct super_block *s;
990 fmode_t mode = FMODE_READ; 1150 fmode_t mode = FMODE_READ;
991 struct the_nilfs *nilfs; 1151 struct dentry *root_dentry;
992 int err, need_to_close = 1; 1152 int err, s_new = false;
993 1153
994 if (!(flags & MS_RDONLY)) 1154 if (!(flags & MS_RDONLY))
995 mode |= FMODE_WRITE; 1155 mode |= FMODE_WRITE;
996 1156
997 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1157 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
998 if (IS_ERR(sd.bdev)) 1158 if (IS_ERR(sd.bdev))
999 return PTR_ERR(sd.bdev); 1159 return ERR_CAST(sd.bdev);
1000 1160
1001 /*
1002 * To get mount instance using sget() vfs-routine, NILFS needs
1003 * much more information than normal filesystems to identify mount
1004 * instance. For snapshot mounts, not only a mount type (ro-mount
1005 * or rw-mount) but also a checkpoint number is required.
1006 */
1007 sd.cno = 0; 1161 sd.cno = 0;
1008 sd.flags = flags; 1162 sd.flags = flags;
1009 if (nilfs_identify((char *)data, &sd)) { 1163 if (nilfs_identify((char *)data, &sd)) {
@@ -1011,101 +1165,91 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1011 goto failed; 1165 goto failed;
1012 } 1166 }
1013 1167
1014 nilfs = find_or_create_nilfs(sd.bdev);
1015 if (!nilfs) {
1016 err = -ENOMEM;
1017 goto failed;
1018 }
1019
1020 mutex_lock(&nilfs->ns_mount_mutex);
1021
1022 if (!sd.cno) {
1023 /*
1024 * Check if an exclusive mount exists or not.
1025 * Snapshot mounts coexist with a current mount
1026 * (i.e. rw-mount or ro-mount), whereas rw-mount and
1027 * ro-mount are mutually exclusive.
1028 */
1029 down_read(&nilfs->ns_super_sem);
1030 if (nilfs->ns_current &&
1031 ((nilfs->ns_current->s_super->s_flags ^ flags)
1032 & MS_RDONLY)) {
1033 up_read(&nilfs->ns_super_sem);
1034 err = -EBUSY;
1035 goto failed_unlock;
1036 }
1037 up_read(&nilfs->ns_super_sem);
1038 }
1039
1040 /* 1168 /*
1041 * Find existing nilfs_sb_info struct 1169 * once the super is inserted into the list by sget, s_umount
1170 * will protect the lockfs code from trying to start a snapshot
1171 * while we are mounting
1042 */ 1172 */
1043 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); 1173 mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
1044 1174 if (sd.bdev->bd_fsfreeze_count > 0) {
1045 /* 1175 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1046 * Get super block instance holding the nilfs_sb_info struct. 1176 err = -EBUSY;
1047 * A new instance is allocated if no existing mount is present or 1177 goto failed;
1048 * existing instance has been unmounted. 1178 }
1049 */ 1179 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
1050 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); 1180 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1051 if (sd.sbi)
1052 nilfs_put_sbinfo(sd.sbi);
1053
1054 if (IS_ERR(s)) { 1181 if (IS_ERR(s)) {
1055 err = PTR_ERR(s); 1182 err = PTR_ERR(s);
1056 goto failed_unlock; 1183 goto failed;
1057 } 1184 }
1058 1185
1059 if (!s->s_root) { 1186 if (!s->s_root) {
1060 char b[BDEVNAME_SIZE]; 1187 char b[BDEVNAME_SIZE];
1061 1188
1189 s_new = true;
1190
1062 /* New superblock instance created */ 1191 /* New superblock instance created */
1063 s->s_flags = flags; 1192 s->s_flags = flags;
1064 s->s_mode = mode; 1193 s->s_mode = mode;
1065 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1194 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1066 sb_set_blocksize(s, block_size(sd.bdev)); 1195 sb_set_blocksize(s, block_size(sd.bdev));
1067 1196
1068 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, 1197 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1069 nilfs);
1070 if (err) 1198 if (err)
1071 goto cancel_new; 1199 goto failed_super;
1072 1200
1073 s->s_flags |= MS_ACTIVE; 1201 s->s_flags |= MS_ACTIVE;
1074 need_to_close = 0; 1202 } else if (!sd.cno) {
1203 int busy = false;
1204
1205 if (nilfs_tree_was_touched(s->s_root)) {
1206 busy = nilfs_try_to_shrink_tree(s->s_root);
1207 if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
1208 printk(KERN_ERR "NILFS: the device already "
1209 "has a %s mount.\n",
1210 (s->s_flags & MS_RDONLY) ?
1211 "read-only" : "read/write");
1212 err = -EBUSY;
1213 goto failed_super;
1214 }
1215 }
1216 if (!busy) {
1217 /*
1218 * Try remount to setup mount states if the current
1219 * tree is not mounted and only snapshots use this sb.
1220 */
1221 err = nilfs_remount(s, &flags, data);
1222 if (err)
1223 goto failed_super;
1224 }
1075 } 1225 }
1076 1226
1077 mutex_unlock(&nilfs->ns_mount_mutex); 1227 if (sd.cno) {
1078 put_nilfs(nilfs); 1228 err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
1079 if (need_to_close) 1229 if (err)
1080 close_bdev_exclusive(sd.bdev, mode); 1230 goto failed_super;
1081 simple_set_mnt(mnt, s); 1231 } else {
1082 return 0; 1232 root_dentry = dget(s->s_root);
1233 }
1083 1234
1084 failed_unlock: 1235 if (!s_new)
1085 mutex_unlock(&nilfs->ns_mount_mutex); 1236 close_bdev_exclusive(sd.bdev, mode);
1086 put_nilfs(nilfs);
1087 failed:
1088 close_bdev_exclusive(sd.bdev, mode);
1089 1237
1090 return err; 1238 return root_dentry;
1091 1239
1092 cancel_new: 1240 failed_super:
1093 /* Abandoning the newly allocated superblock */
1094 mutex_unlock(&nilfs->ns_mount_mutex);
1095 put_nilfs(nilfs);
1096 deactivate_locked_super(s); 1241 deactivate_locked_super(s);
1097 /* 1242
1098 * deactivate_locked_super() invokes close_bdev_exclusive(). 1243 failed:
1099 * We must finish all post-cleaning before this call; 1244 if (!s_new)
1100 * put_nilfs() needs the block device. 1245 close_bdev_exclusive(sd.bdev, mode);
1101 */ 1246 return ERR_PTR(err);
1102 return err;
1103} 1247}
1104 1248
1105struct file_system_type nilfs_fs_type = { 1249struct file_system_type nilfs_fs_type = {
1106 .owner = THIS_MODULE, 1250 .owner = THIS_MODULE,
1107 .name = "nilfs2", 1251 .name = "nilfs2",
1108 .get_sb = nilfs_get_sb, 1252 .mount = nilfs_mount,
1109 .kill_sb = kill_block_super, 1253 .kill_sb = kill_block_super,
1110 .fs_flags = FS_REQUIRES_DEV, 1254 .fs_flags = FS_REQUIRES_DEV,
1111}; 1255};
@@ -1119,7 +1263,7 @@ static void nilfs_inode_init_once(void *obj)
1119 init_rwsem(&ii->xattr_sem); 1263 init_rwsem(&ii->xattr_sem);
1120#endif 1264#endif
1121 nilfs_btnode_cache_init_once(&ii->i_btnode_cache); 1265 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
1122 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; 1266 ii->i_bmap = &ii->i_bmap_data;
1123 inode_init_once(&ii->vfs_inode); 1267 inode_init_once(&ii->vfs_inode);
1124} 1268}
1125 1269
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8c1097327ab..0254be2d73c 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,8 +35,7 @@
35#include "segbuf.h" 35#include "segbuf.h"
36 36
37 37
38static LIST_HEAD(nilfs_objects); 38static int nilfs_valid_sb(struct nilfs_super_block *sbp);
39static DEFINE_SPINLOCK(nilfs_lock);
40 39
41void nilfs_set_last_segment(struct the_nilfs *nilfs, 40void nilfs_set_last_segment(struct the_nilfs *nilfs,
42 sector_t start_blocknr, u64 seq, __u64 cno) 41 sector_t start_blocknr, u64 seq, __u64 cno)
@@ -45,20 +44,27 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
45 nilfs->ns_last_pseg = start_blocknr; 44 nilfs->ns_last_pseg = start_blocknr;
46 nilfs->ns_last_seq = seq; 45 nilfs->ns_last_seq = seq;
47 nilfs->ns_last_cno = cno; 46 nilfs->ns_last_cno = cno;
47
48 if (!nilfs_sb_dirty(nilfs)) {
49 if (nilfs->ns_prev_seq == nilfs->ns_last_seq)
50 goto stay_cursor;
51
52 set_nilfs_sb_dirty(nilfs);
53 }
54 nilfs->ns_prev_seq = nilfs->ns_last_seq;
55
56 stay_cursor:
48 spin_unlock(&nilfs->ns_last_segment_lock); 57 spin_unlock(&nilfs->ns_last_segment_lock);
49} 58}
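
The ns_prev_seq bookkeeping added here keeps redundant superblock writes
out of the sync path: the dirty flag is raised only when the last
segment sequence actually advances while the superblock is clean.  A
condensed, behaviour-equivalent restatement (the helper name is
illustrative; the fields and flag accessors are from this patch):

	static void note_log_progress(struct the_nilfs *nilfs)
	{
		if (nilfs_sb_dirty(nilfs) ||
		    nilfs->ns_prev_seq != nilfs->ns_last_seq) {
			set_nilfs_sb_dirty(nilfs);	/* no-op if already set */
			nilfs->ns_prev_seq = nilfs->ns_last_seq;
		}
	}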
50 59
51/** 60/**
52 * alloc_nilfs - allocate the_nilfs structure 61 * alloc_nilfs - allocate a nilfs object
53 * @bdev: block device to which the_nilfs is related 62 * @bdev: block device to which the_nilfs is related
54 * 63 *
55 * alloc_nilfs() allocates memory for the_nilfs and
56 * initializes its reference count and locks.
57 *
58 * Return Value: On success, pointer to the_nilfs is returned. 64 * Return Value: On success, pointer to the_nilfs is returned.
59 * On error, NULL is returned. 65 * On error, NULL is returned.
60 */ 66 */
61static struct the_nilfs *alloc_nilfs(struct block_device *bdev) 67struct the_nilfs *alloc_nilfs(struct block_device *bdev)
62{ 68{
63 struct the_nilfs *nilfs; 69 struct the_nilfs *nilfs;
64 70
@@ -67,92 +73,25 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
67 return NULL; 73 return NULL;
68 74
69 nilfs->ns_bdev = bdev; 75 nilfs->ns_bdev = bdev;
70 atomic_set(&nilfs->ns_count, 1);
71 atomic_set(&nilfs->ns_ndirtyblks, 0); 76 atomic_set(&nilfs->ns_ndirtyblks, 0);
72 init_rwsem(&nilfs->ns_sem); 77 init_rwsem(&nilfs->ns_sem);
73 init_rwsem(&nilfs->ns_super_sem); 78 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
74 mutex_init(&nilfs->ns_mount_mutex);
75 init_rwsem(&nilfs->ns_writer_sem);
76 INIT_LIST_HEAD(&nilfs->ns_list);
77 INIT_LIST_HEAD(&nilfs->ns_supers);
78 spin_lock_init(&nilfs->ns_last_segment_lock); 79 spin_lock_init(&nilfs->ns_last_segment_lock);
79 nilfs->ns_gc_inodes_h = NULL; 80 nilfs->ns_cptree = RB_ROOT;
81 spin_lock_init(&nilfs->ns_cptree_lock);
80 init_rwsem(&nilfs->ns_segctor_sem); 82 init_rwsem(&nilfs->ns_segctor_sem);
81 83
82 return nilfs; 84 return nilfs;
83} 85}
84 86
85/** 87/**
86 * find_or_create_nilfs - find or create nilfs object 88 * destroy_nilfs - destroy nilfs object
87 * @bdev: block device to which the_nilfs is related 89 * @nilfs: nilfs object to be released
88 *
89 * find_nilfs() looks up an existent nilfs object created on the
90 * device and gets the reference count of the object. If no nilfs object
91 * is found on the device, a new nilfs object is allocated.
92 *
93 * Return Value: On success, pointer to the nilfs object is returned.
94 * On error, NULL is returned.
95 */ 90 */
96struct the_nilfs *find_or_create_nilfs(struct block_device *bdev) 91void destroy_nilfs(struct the_nilfs *nilfs)
97{ 92{
98 struct the_nilfs *nilfs, *new = NULL;
99
100 retry:
101 spin_lock(&nilfs_lock);
102 list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
103 if (nilfs->ns_bdev == bdev) {
104 get_nilfs(nilfs);
105 spin_unlock(&nilfs_lock);
106 if (new)
107 put_nilfs(new);
108 return nilfs; /* existing object */
109 }
110 }
111 if (new) {
112 list_add_tail(&new->ns_list, &nilfs_objects);
113 spin_unlock(&nilfs_lock);
114 return new; /* new object */
115 }
116 spin_unlock(&nilfs_lock);
117
118 new = alloc_nilfs(bdev);
119 if (new)
120 goto retry;
121 return NULL; /* insufficient memory */
122}
123
124/**
125 * put_nilfs - release a reference to the_nilfs
126 * @nilfs: the_nilfs structure to be released
127 *
128 * put_nilfs() decrements a reference counter of the_nilfs.
129 * If the reference count reaches zero, the_nilfs is freed.
130 */
131void put_nilfs(struct the_nilfs *nilfs)
132{
133 spin_lock(&nilfs_lock);
134 if (!atomic_dec_and_test(&nilfs->ns_count)) {
135 spin_unlock(&nilfs_lock);
136 return;
137 }
138 list_del_init(&nilfs->ns_list);
139 spin_unlock(&nilfs_lock);
140
141 /*
142 * Increment of ns_count never occurs below because the caller
143 * of get_nilfs() holds at least one reference to the_nilfs.
144 * Thus its exclusion control is not required here.
145 */
146
147 might_sleep(); 93 might_sleep();
148 if (nilfs_loaded(nilfs)) {
149 nilfs_mdt_destroy(nilfs->ns_sufile);
150 nilfs_mdt_destroy(nilfs->ns_cpfile);
151 nilfs_mdt_destroy(nilfs->ns_dat);
152 nilfs_mdt_destroy(nilfs->ns_gc_dat);
153 }
154 if (nilfs_init(nilfs)) { 94 if (nilfs_init(nilfs)) {
155 nilfs_destroy_gccache(nilfs);
156 brelse(nilfs->ns_sbh[0]); 95 brelse(nilfs->ns_sbh[0]);
157 brelse(nilfs->ns_sbh[1]); 96 brelse(nilfs->ns_sbh[1]);
158 } 97 }
@@ -160,16 +99,17 @@ void put_nilfs(struct the_nilfs *nilfs)
160} 99}
161 100
162static int nilfs_load_super_root(struct the_nilfs *nilfs, 101static int nilfs_load_super_root(struct the_nilfs *nilfs,
163 struct nilfs_sb_info *sbi, sector_t sr_block) 102 struct super_block *sb, sector_t sr_block)
164{ 103{
165 struct buffer_head *bh_sr; 104 struct buffer_head *bh_sr;
166 struct nilfs_super_root *raw_sr; 105 struct nilfs_super_root *raw_sr;
167 struct nilfs_super_block **sbp = nilfs->ns_sbp; 106 struct nilfs_super_block **sbp = nilfs->ns_sbp;
107 struct nilfs_inode *rawi;
168 unsigned dat_entry_size, segment_usage_size, checkpoint_size; 108 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
169 unsigned inode_size; 109 unsigned inode_size;
170 int err; 110 int err;
171 111
172 err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1); 112 err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
173 if (unlikely(err)) 113 if (unlikely(err))
174 return err; 114 return err;
175 115
@@ -181,40 +121,22 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
181 121
182 inode_size = nilfs->ns_inode_size; 122 inode_size = nilfs->ns_inode_size;
183 123
184 err = -ENOMEM; 124 rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size);
185 nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size); 125 err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat);
186 if (unlikely(!nilfs->ns_dat)) 126 if (err)
187 goto failed; 127 goto failed;
188 128
189 nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size); 129 rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size);
190 if (unlikely(!nilfs->ns_gc_dat)) 130 err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile);
131 if (err)
191 goto failed_dat; 132 goto failed_dat;
192 133
193 nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size); 134 rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size);
194 if (unlikely(!nilfs->ns_cpfile)) 135 err = nilfs_sufile_read(sb, segment_usage_size, rawi,
195 goto failed_gc_dat; 136 &nilfs->ns_sufile);
196 137 if (err)
197 nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
198 if (unlikely(!nilfs->ns_sufile))
199 goto failed_cpfile; 138 goto failed_cpfile;
200 139
201 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
202
203 err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
204 NILFS_SR_DAT_OFFSET(inode_size));
205 if (unlikely(err))
206 goto failed_sufile;
207
208 err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
209 NILFS_SR_CPFILE_OFFSET(inode_size));
210 if (unlikely(err))
211 goto failed_sufile;
212
213 err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
214 NILFS_SR_SUFILE_OFFSET(inode_size));
215 if (unlikely(err))
216 goto failed_sufile;
217
218 raw_sr = (struct nilfs_super_root *)bh_sr->b_data; 140 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
219 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); 141 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
220 142
@@ -222,17 +144,11 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
222 brelse(bh_sr); 144 brelse(bh_sr);
223 return err; 145 return err;
224 146
225 failed_sufile:
226 nilfs_mdt_destroy(nilfs->ns_sufile);
227
228 failed_cpfile: 147 failed_cpfile:
229 nilfs_mdt_destroy(nilfs->ns_cpfile); 148 iput(nilfs->ns_cpfile);
230
231 failed_gc_dat:
232 nilfs_mdt_destroy(nilfs->ns_gc_dat);
233 149
234 failed_dat: 150 failed_dat:
235 nilfs_mdt_destroy(nilfs->ns_dat); 151 iput(nilfs->ns_dat);
236 goto failed; 152 goto failed;
237} 153}
238 154
@@ -248,6 +164,37 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
248} 164}
249 165
250/** 166/**
167 * nilfs_store_log_cursor - load log cursor from a super block
168 * @nilfs: nilfs object
169 * @sbp: buffer storing super block to be read
170 *
171 * nilfs_store_log_cursor() reads the last position of the log
172 * containing a super root from a given super block, and initializes
 173 * relevant fields of the nilfs object in preparation for log
 174 * scanning and recovery.
175 */
176static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
177 struct nilfs_super_block *sbp)
178{
179 int ret = 0;
180
181 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
182 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
183 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
184
185 nilfs->ns_prev_seq = nilfs->ns_last_seq;
186 nilfs->ns_seg_seq = nilfs->ns_last_seq;
187 nilfs->ns_segnum =
188 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
189 nilfs->ns_cno = nilfs->ns_last_cno + 1;
190 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
 191 printk(KERN_ERR "NILFS: invalid last segment number.\n");
192 ret = -EINVAL;
193 }
194 return ret;
195}
196
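A note on the check above: the bound test at the end of nilfs_store_log_cursor() is what catches a corrupted super block whose log cursor points past the end of the volume. Below is a standalone sketch of the same validation, assuming the usual NILFS mapping from block number to segment number (plain division by the per-segment block count); all names here are illustrative, not kernel symbols.

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical userspace rewrite of the final sanity check in
	 * nilfs_store_log_cursor(): the last partial segment address must
	 * fall inside a valid segment. */
	static int check_log_cursor(uint64_t last_pseg,
				    uint64_t blocks_per_segment,
				    uint64_t nsegments)
	{
		uint64_t segnum = last_pseg / blocks_per_segment; /* assumed mapping */

		if (segnum >= nsegments) {
			fprintf(stderr, "invalid last segment number %llu\n",
				(unsigned long long)segnum);
			return -1;	/* the kernel returns -EINVAL here */
		}
		return 0;
	}

	int main(void)
	{
		/* e.g. 2048 blocks per segment, 100 segments: block 300000
		 * lands in segment 146 and must be rejected */
		return check_log_cursor(300000, 2048, 100) ? 1 : 0;
	}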
197/**
251 * load_nilfs - load and recover the nilfs 198 * load_nilfs - load and recover the nilfs
252 * @nilfs: the_nilfs structure to be released 199 * @nilfs: the_nilfs structure to be released
253 * @sbi: nilfs_sb_info used to recover past segment 200 * @sbi: nilfs_sb_info used to recover past segment
@@ -264,15 +211,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
264 int valid_fs = nilfs_valid_fs(nilfs); 211 int valid_fs = nilfs_valid_fs(nilfs);
265 int err; 212 int err;
266 213
267 if (nilfs_loaded(nilfs)) {
268 if (valid_fs ||
269 ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
270 return 0;
271 printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
272 "recovery state.\n");
273 return -EINVAL;
274 }
275
276 if (!valid_fs) { 214 if (!valid_fs) {
277 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); 215 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
278 if (s_flags & MS_RDONLY) { 216 if (s_flags & MS_RDONLY) {
@@ -285,13 +223,55 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
285 223
286 nilfs_init_recovery_info(&ri); 224 nilfs_init_recovery_info(&ri);
287 225
288 err = nilfs_search_super_root(nilfs, sbi, &ri); 226 err = nilfs_search_super_root(nilfs, &ri);
289 if (unlikely(err)) { 227 if (unlikely(err)) {
290 printk(KERN_ERR "NILFS: error searching super root.\n"); 228 struct nilfs_super_block **sbp = nilfs->ns_sbp;
291 goto failed; 229 int blocksize;
230
231 if (err != -EINVAL)
232 goto scan_error;
233
234 if (!nilfs_valid_sb(sbp[1])) {
235 printk(KERN_WARNING
236 "NILFS warning: unable to fall back to spare"
237 "super block\n");
238 goto scan_error;
239 }
240 printk(KERN_INFO
241 "NILFS: try rollback from an earlier position\n");
242
243 /*
244 * restore super block with its spare and reconfigure
245 * relevant states of the nilfs object.
246 */
247 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
248 nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed);
249 nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
250
251 /* verify consistency between two super blocks */
252 blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
253 if (blocksize != nilfs->ns_blocksize) {
254 printk(KERN_WARNING
255 "NILFS warning: blocksize differs between "
256 "two super blocks (%d != %d)\n",
257 blocksize, nilfs->ns_blocksize);
258 goto scan_error;
259 }
260
261 err = nilfs_store_log_cursor(nilfs, sbp[0]);
262 if (err)
263 goto scan_error;
264
265 /* drop clean flag to allow roll-forward and recovery */
266 nilfs->ns_mount_state &= ~NILFS_VALID_FS;
267 valid_fs = 0;
268
269 err = nilfs_search_super_root(nilfs, &ri);
270 if (err)
271 goto scan_error;
292 } 272 }
293 273
294 err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root); 274 err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root);
295 if (unlikely(err)) { 275 if (unlikely(err)) {
296 printk(KERN_ERR "NILFS: error loading super root.\n"); 276 printk(KERN_ERR "NILFS: error loading super root.\n");
297 goto failed; 277 goto failed;
@@ -301,11 +281,23 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
301 goto skip_recovery; 281 goto skip_recovery;
302 282
303 if (s_flags & MS_RDONLY) { 283 if (s_flags & MS_RDONLY) {
284 __u64 features;
285
304 if (nilfs_test_opt(sbi, NORECOVERY)) { 286 if (nilfs_test_opt(sbi, NORECOVERY)) {
305 printk(KERN_INFO "NILFS: norecovery option specified. " 287 printk(KERN_INFO "NILFS: norecovery option specified. "
306 "skipping roll-forward recovery\n"); 288 "skipping roll-forward recovery\n");
307 goto skip_recovery; 289 goto skip_recovery;
308 } 290 }
291 features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
292 ~NILFS_FEATURE_COMPAT_RO_SUPP;
293 if (features) {
294 printk(KERN_ERR "NILFS: couldn't proceed with "
295 "recovery because of unsupported optional "
296 "features (%llx)\n",
297 (unsigned long long)features);
298 err = -EROFS;
299 goto failed_unload;
300 }
309 if (really_read_only) { 301 if (really_read_only) {
310 printk(KERN_ERR "NILFS: write access " 302 printk(KERN_ERR "NILFS: write access "
311 "unavailable, cannot proceed.\n"); 303 "unavailable, cannot proceed.\n");
@@ -320,14 +312,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
320 goto failed_unload; 312 goto failed_unload;
321 } 313 }
322 314
323 err = nilfs_recover_logical_segments(nilfs, sbi, &ri); 315 err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri);
324 if (err) 316 if (err)
325 goto failed_unload; 317 goto failed_unload;
326 318
327 down_write(&nilfs->ns_sem); 319 down_write(&nilfs->ns_sem);
328 nilfs->ns_mount_state |= NILFS_VALID_FS; 320 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
329 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); 321 err = nilfs_cleanup_super(sbi);
330 err = nilfs_commit_super(sbi, 1);
331 up_write(&nilfs->ns_sem); 322 up_write(&nilfs->ns_sem);
332 323
333 if (err) { 324 if (err) {
@@ -343,10 +334,14 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
343 sbi->s_super->s_flags = s_flags; 334 sbi->s_super->s_flags = s_flags;
344 return 0; 335 return 0;
345 336
337 scan_error:
338 printk(KERN_ERR "NILFS: error searching super root.\n");
339 goto failed;
340
346 failed_unload: 341 failed_unload:
347 nilfs_mdt_destroy(nilfs->ns_cpfile); 342 iput(nilfs->ns_cpfile);
348 nilfs_mdt_destroy(nilfs->ns_sufile); 343 iput(nilfs->ns_sufile);
349 nilfs_mdt_destroy(nilfs->ns_dat); 344 iput(nilfs->ns_dat);
350 345
351 failed: 346 failed:
352 nilfs_clear_recovery_info(&ri); 347 nilfs_clear_recovery_info(&ri);
@@ -368,8 +363,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
368static int nilfs_store_disk_layout(struct the_nilfs *nilfs, 363static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
369 struct nilfs_super_block *sbp) 364 struct nilfs_super_block *sbp)
370{ 365{
371 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { 366 if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
372 printk(KERN_ERR "NILFS: revision mismatch " 367 printk(KERN_ERR "NILFS: unsupported revision "
373 "(superblock rev.=%d.%d, current rev.=%d.%d). " 368 "(superblock rev.=%d.%d, current rev.=%d.%d). "
374 "Please check the version of mkfs.nilfs.\n", 369 "Please check the version of mkfs.nilfs.\n",
375 le32_to_cpu(sbp->s_rev_level), 370 le32_to_cpu(sbp->s_rev_level),
@@ -509,14 +504,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
509 return -EINVAL; 504 return -EINVAL;
510 } 505 }
511 506
512 if (swp) { 507 if (!valid[!swp])
513 printk(KERN_WARNING "NILFS warning: broken superblock. " 508 printk(KERN_WARNING "NILFS warning: broken superblock. "
514 "using spare superblock.\n"); 509 "using spare superblock.\n");
510 if (swp)
515 nilfs_swap_super_block(nilfs); 511 nilfs_swap_super_block(nilfs);
516 }
517 512
518 nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime); 513 nilfs->ns_sbwcount = 0;
519 nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0; 514 nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
520 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); 515 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
521 *sbpp = sbp[0]; 516 *sbpp = sbp[0];
522 return 0; 517 return 0;
@@ -531,12 +526,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
531 * 526 *
532 * init_nilfs() performs common initialization per block device (e.g. 527 * init_nilfs() performs common initialization per block device (e.g.
533 * reading the super block, getting disk layout information, initializing 528 * reading the super block, getting disk layout information, initializing
534 * shared fields in the_nilfs). It takes on some portion of the jobs 529 * shared fields in the_nilfs).
535 * typically done by a fill_super() routine. This division arises from
536 * the nature that multiple NILFS instances may be simultaneously
537 * mounted on a device.
538 * For multiple mounts on the same device, only the first mount
539 * invokes these tasks.
540 * 530 *
541 * Return Value: On success, 0 is returned. On error, a negative error 531 * Return Value: On success, 0 is returned. On error, a negative error
542 * code is returned. 532 * code is returned.
@@ -545,30 +535,12 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
545{ 535{
546 struct super_block *sb = sbi->s_super; 536 struct super_block *sb = sbi->s_super;
547 struct nilfs_super_block *sbp; 537 struct nilfs_super_block *sbp;
548 struct backing_dev_info *bdi;
549 int blocksize; 538 int blocksize;
550 int err; 539 int err;
551 540
552 down_write(&nilfs->ns_sem); 541 down_write(&nilfs->ns_sem);
553 if (nilfs_init(nilfs)) {
554 /* Load values from existing the_nilfs */
555 sbp = nilfs->ns_sbp[0];
556 err = nilfs_store_magic_and_option(sb, sbp, data);
557 if (err)
558 goto out;
559
560 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
561 if (sb->s_blocksize != blocksize &&
562 !sb_set_blocksize(sb, blocksize)) {
563 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
564 blocksize);
565 err = -EINVAL;
566 }
567 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
568 goto out;
569 }
570 542
571 blocksize = sb_min_blocksize(sb, BLOCK_SIZE); 543 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
572 if (!blocksize) { 544 if (!blocksize) {
573 printk(KERN_ERR "NILFS: unable to set blocksize\n"); 545 printk(KERN_ERR "NILFS: unable to set blocksize\n");
574 err = -EINVAL; 546 err = -EINVAL;
@@ -582,7 +554,18 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
582 if (err) 554 if (err)
583 goto failed_sbh; 555 goto failed_sbh;
584 556
557 err = nilfs_check_feature_compatibility(sb, sbp);
558 if (err)
559 goto failed_sbh;
560
585 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); 561 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
562 if (blocksize < NILFS_MIN_BLOCK_SIZE ||
563 blocksize > NILFS_MAX_BLOCK_SIZE) {
564 printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
565 "filesystem blocksize %d\n", blocksize);
566 err = -EINVAL;
567 goto failed_sbh;
568 }
586 if (sb->s_blocksize != blocksize) { 569 if (sb->s_blocksize != blocksize) {
587 int hw_blocksize = bdev_logical_block_size(sb->s_bdev); 570 int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
588 571
@@ -604,6 +587,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
604 when reloading fails. */ 587 when reloading fails. */
605 } 588 }
606 nilfs->ns_blocksize_bits = sb->s_blocksize_bits; 589 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
590 nilfs->ns_blocksize = blocksize;
607 591
608 err = nilfs_store_disk_layout(nilfs, sbp); 592 err = nilfs_store_disk_layout(nilfs, sbp);
609 if (err) 593 if (err)
@@ -613,29 +597,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
613 597
614 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 598 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
615 599
616 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; 600 err = nilfs_store_log_cursor(nilfs, sbp);
617 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
618
619 /* Finding last segment */
620 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
621 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
622 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
623
624 nilfs->ns_seg_seq = nilfs->ns_last_seq;
625 nilfs->ns_segnum =
626 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
627 nilfs->ns_cno = nilfs->ns_last_cno + 1;
628 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
 629 printk(KERN_ERR "NILFS: invalid last segment number.\n");
630 err = -EINVAL;
631 goto failed_sbh;
632 }
633 /* Dummy values */
634 nilfs->ns_free_segments_count =
635 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
636
637 /* Initialize gcinode cache */
638 err = nilfs_init_gccache(nilfs);
639 if (err) 601 if (err)
640 goto failed_sbh; 602 goto failed_sbh;
641 603
@@ -673,8 +635,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
673 ret = blkdev_issue_discard(nilfs->ns_bdev, 635 ret = blkdev_issue_discard(nilfs->ns_bdev,
674 start * sects_per_block, 636 start * sects_per_block,
675 nblocks * sects_per_block, 637 nblocks * sects_per_block,
676 GFP_NOFS, 638 GFP_NOFS, 0);
677 BLKDEV_IFL_BARRIER);
678 if (ret < 0) 639 if (ret < 0)
679 return ret; 640 return ret;
680 nblocks = 0; 641 nblocks = 0;
@@ -684,7 +645,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
684 ret = blkdev_issue_discard(nilfs->ns_bdev, 645 ret = blkdev_issue_discard(nilfs->ns_bdev,
685 start * sects_per_block, 646 start * sects_per_block,
686 nblocks * sects_per_block, 647 nblocks * sects_per_block,
687 GFP_NOFS, BLKDEV_IFL_BARRIER); 648 GFP_NOFS, 0);
688 return ret; 649 return ret;
689} 650}
690 651
@@ -711,79 +672,92 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
711 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; 672 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
712} 673}
713 674
714/** 675struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
715 * nilfs_find_sbinfo - find existing nilfs_sb_info structure
716 * @nilfs: nilfs object
717 * @rw_mount: mount type (non-zero value for read/write mount)
718 * @cno: checkpoint number (zero for read-only mount)
719 *
 720 * nilfs_find_sbinfo() returns the nilfs_sb_info structure matching
 721 * @rw_mount and @cno (in case of snapshots). If no instance
 722 * was found, NULL is returned. Although the super block instance can
 723 * be unmounted after this function returns, the nilfs_sb_info struct
 724 * is kept in memory until nilfs_put_sbinfo() is called.
725 */
726struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
727 int rw_mount, __u64 cno)
728{ 676{
729 struct nilfs_sb_info *sbi; 677 struct rb_node *n;
730 678 struct nilfs_root *root;
731 down_read(&nilfs->ns_super_sem); 679
732 /* 680 spin_lock(&nilfs->ns_cptree_lock);
733 * The SNAPSHOT flag and sb->s_flags are supposed to be 681 n = nilfs->ns_cptree.rb_node;
734 * protected with nilfs->ns_super_sem. 682 while (n) {
735 */ 683 root = rb_entry(n, struct nilfs_root, rb_node);
736 sbi = nilfs->ns_current; 684
737 if (rw_mount) { 685 if (cno < root->cno) {
738 if (sbi && !(sbi->s_super->s_flags & MS_RDONLY)) 686 n = n->rb_left;
739 goto found; /* read/write mount */ 687 } else if (cno > root->cno) {
740 else 688 n = n->rb_right;
741 goto out; 689 } else {
742 } else if (cno == 0) { 690 atomic_inc(&root->count);
743 if (sbi && (sbi->s_super->s_flags & MS_RDONLY)) 691 spin_unlock(&nilfs->ns_cptree_lock);
744 goto found; /* read-only mount */ 692 return root;
745 else 693 }
746 goto out;
747 } 694 }
695 spin_unlock(&nilfs->ns_cptree_lock);
748 696
749 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
750 if (nilfs_test_opt(sbi, SNAPSHOT) &&
751 sbi->s_snapshot_cno == cno)
752 goto found; /* snapshot mount */
753 }
754 out:
755 up_read(&nilfs->ns_super_sem);
756 return NULL; 697 return NULL;
757
758 found:
759 atomic_inc(&sbi->s_count);
760 up_read(&nilfs->ns_super_sem);
761 return sbi;
762} 698}
763 699
764int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, 700struct nilfs_root *
765 int snapshot_mount) 701nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
766{ 702{
767 struct nilfs_sb_info *sbi; 703 struct rb_node **p, *parent;
768 int ret = 0; 704 struct nilfs_root *root, *new;
705
706 root = nilfs_lookup_root(nilfs, cno);
707 if (root)
708 return root;
709
710 new = kmalloc(sizeof(*root), GFP_KERNEL);
711 if (!new)
712 return NULL;
713
714 spin_lock(&nilfs->ns_cptree_lock);
769 715
770 down_read(&nilfs->ns_super_sem); 716 p = &nilfs->ns_cptree.rb_node;
771 if (cno == 0 || cno > nilfs->ns_cno) 717 parent = NULL;
772 goto out_unlock;
773 718
774 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { 719 while (*p) {
775 if (sbi->s_snapshot_cno == cno && 720 parent = *p;
776 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) { 721 root = rb_entry(parent, struct nilfs_root, rb_node);
777 /* exclude read-only mounts */ 722
778 ret++; 723 if (cno < root->cno) {
779 break; 724 p = &(*p)->rb_left;
725 } else if (cno > root->cno) {
726 p = &(*p)->rb_right;
727 } else {
728 atomic_inc(&root->count);
729 spin_unlock(&nilfs->ns_cptree_lock);
730 kfree(new);
731 return root;
780 } 732 }
781 } 733 }
782 /* for protecting recent checkpoints */
783 if (cno >= nilfs_last_cno(nilfs))
784 ret++;
785 734
786 out_unlock: 735 new->cno = cno;
787 up_read(&nilfs->ns_super_sem); 736 new->ifile = NULL;
788 return ret; 737 new->nilfs = nilfs;
738 atomic_set(&new->count, 1);
739 atomic_set(&new->inodes_count, 0);
740 atomic_set(&new->blocks_count, 0);
741
742 rb_link_node(&new->rb_node, parent, p);
743 rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
744
745 spin_unlock(&nilfs->ns_cptree_lock);
746
747 return new;
748}
749
750void nilfs_put_root(struct nilfs_root *root)
751{
752 if (atomic_dec_and_test(&root->count)) {
753 struct the_nilfs *nilfs = root->nilfs;
754
755 spin_lock(&nilfs->ns_cptree_lock);
756 rb_erase(&root->rb_node, &nilfs->ns_cptree);
757 spin_unlock(&nilfs->ns_cptree_lock);
758 if (root->ifile)
759 iput(root->ifile);
760
761 kfree(root);
762 }
789} 763}
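nilfs_lookup_root(), nilfs_find_or_create_root() and nilfs_put_root() together form a small reference-counted lookup API over the checkpoint rb-tree. A hedged sketch of the intended calling pattern follows; it only compiles in-tree against the declarations added to the_nilfs.h below, and the function name is invented for illustration.

	/* Illustrative only: how a mount path might pin a checkpoint's root. */
	static int example_pin_checkpoint(struct the_nilfs *nilfs, __u64 cno)
	{
		struct nilfs_root *root;

		root = nilfs_find_or_create_root(nilfs, cno);
		if (!root)
			return -ENOMEM;	/* allocation of a new root failed */

		/* ... load root->ifile, account inodes/blocks, etc. ... */

		nilfs_put_root(root);	/* drops the reference taken above; the
					 * last put erases the rb-tree node,
					 * iputs the ifile and frees the root */
		return 0;
	}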
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1ab97453369..69226e14b74 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -26,6 +26,7 @@
26 26
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/rbtree.h>
29#include <linux/fs.h> 30#include <linux/fs.h>
30#include <linux/blkdev.h> 31#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
@@ -45,21 +46,13 @@ enum {
45/** 46/**
46 * struct the_nilfs - struct to supervise multiple nilfs mount points 47 * struct the_nilfs - struct to supervise multiple nilfs mount points
47 * @ns_flags: flags 48 * @ns_flags: flags
48 * @ns_count: reference count
49 * @ns_list: list head for nilfs_list
50 * @ns_bdev: block device 49 * @ns_bdev: block device
51 * @ns_bdi: backing dev info
52 * @ns_writer: back pointer to writable nilfs_sb_info
53 * @ns_sem: semaphore for shared states 50 * @ns_sem: semaphore for shared states
54 * @ns_super_sem: semaphore for global operations across super block instances
55 * @ns_mount_mutex: mutex protecting mount process of nilfs
56 * @ns_writer_sem: semaphore protecting ns_writer attach/detach
57 * @ns_current: back pointer to current mount
58 * @ns_sbh: buffer heads of on-disk super blocks 51 * @ns_sbh: buffer heads of on-disk super blocks
59 * @ns_sbp: pointers to super block data 52 * @ns_sbp: pointers to super block data
60 * @ns_sbwtime: previous write time of super blocks 53 * @ns_sbwtime: previous write time of super block
54 * @ns_sbwcount: write count of super block
61 * @ns_sbsize: size of valid data in super block 55 * @ns_sbsize: size of valid data in super block
62 * @ns_supers: list of nilfs super block structs
63 * @ns_seg_seq: segment sequence counter 56 * @ns_seg_seq: segment sequence counter
64 * @ns_segnum: index number of the latest full segment. 57 * @ns_segnum: index number of the latest full segment.
65 * @ns_nextnum: index number of the full segment index to be used next 58 * @ns_nextnum: index number of the full segment index to be used next
@@ -73,15 +66,16 @@ enum {
73 * @ns_last_seq: sequence value of the latest segment 66 * @ns_last_seq: sequence value of the latest segment
74 * @ns_last_cno: checkpoint number of the latest segment 67 * @ns_last_cno: checkpoint number of the latest segment
75 * @ns_prot_seq: least sequence number of segments which must not be reclaimed 68 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
76 * @ns_free_segments_count: counter of free segments 69 * @ns_prev_seq: base sequence number used to decide if advance log cursor
77 * @ns_segctor_sem: segment constructor semaphore 70 * @ns_segctor_sem: segment constructor semaphore
78 * @ns_dat: DAT file inode 71 * @ns_dat: DAT file inode
79 * @ns_cpfile: checkpoint file inode 72 * @ns_cpfile: checkpoint file inode
80 * @ns_sufile: segusage file inode 73 * @ns_sufile: segusage file inode
81 * @ns_gc_dat: shadow inode of the DAT file inode for GC 74 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
75 * @ns_cptree_lock: lock protecting @ns_cptree
82 * @ns_gc_inodes: dummy inodes to keep live blocks 76 * @ns_gc_inodes: dummy inodes to keep live blocks
83 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
84 * @ns_blocksize_bits: bit length of block size 77 * @ns_blocksize_bits: bit length of block size
78 * @ns_blocksize: block size
85 * @ns_nsegments: number of segments in filesystem 79 * @ns_nsegments: number of segments in filesystem
86 * @ns_blocks_per_segment: number of blocks per segment 80 * @ns_blocks_per_segment: number of blocks per segment
87 * @ns_r_segments_percentage: reserved segments percentage 81 * @ns_r_segments_percentage: reserved segments percentage
@@ -93,22 +87,9 @@ enum {
93 */ 87 */
94struct the_nilfs { 88struct the_nilfs {
95 unsigned long ns_flags; 89 unsigned long ns_flags;
96 atomic_t ns_count;
97 struct list_head ns_list;
98 90
99 struct block_device *ns_bdev; 91 struct block_device *ns_bdev;
100 struct backing_dev_info *ns_bdi;
101 struct nilfs_sb_info *ns_writer;
102 struct rw_semaphore ns_sem; 92 struct rw_semaphore ns_sem;
103 struct rw_semaphore ns_super_sem;
104 struct mutex ns_mount_mutex;
105 struct rw_semaphore ns_writer_sem;
106
107 /*
108 * components protected by ns_super_sem
109 */
110 struct nilfs_sb_info *ns_current;
111 struct list_head ns_supers;
112 93
113 /* 94 /*
114 * used for 95 * used for
@@ -119,7 +100,8 @@ struct the_nilfs {
119 */ 100 */
120 struct buffer_head *ns_sbh[2]; 101 struct buffer_head *ns_sbh[2];
121 struct nilfs_super_block *ns_sbp[2]; 102 struct nilfs_super_block *ns_sbp[2];
122 time_t ns_sbwtime[2]; 103 time_t ns_sbwtime;
104 unsigned ns_sbwcount;
123 unsigned ns_sbsize; 105 unsigned ns_sbsize;
124 unsigned ns_mount_state; 106 unsigned ns_mount_state;
125 107
@@ -149,7 +131,7 @@ struct the_nilfs {
149 u64 ns_last_seq; 131 u64 ns_last_seq;
150 __u64 ns_last_cno; 132 __u64 ns_last_cno;
151 u64 ns_prot_seq; 133 u64 ns_prot_seq;
152 unsigned long ns_free_segments_count; 134 u64 ns_prev_seq;
153 135
154 struct rw_semaphore ns_segctor_sem; 136 struct rw_semaphore ns_segctor_sem;
155 137
@@ -160,14 +142,17 @@ struct the_nilfs {
160 struct inode *ns_dat; 142 struct inode *ns_dat;
161 struct inode *ns_cpfile; 143 struct inode *ns_cpfile;
162 struct inode *ns_sufile; 144 struct inode *ns_sufile;
163 struct inode *ns_gc_dat;
164 145
165 /* GC inode list and hash table head */ 146 /* Checkpoint tree */
147 struct rb_root ns_cptree;
148 spinlock_t ns_cptree_lock;
149
150 /* GC inode list */
166 struct list_head ns_gc_inodes; 151 struct list_head ns_gc_inodes;
167 struct hlist_head *ns_gc_inodes_h;
168 152
169 /* Disk layout information (static) */ 153 /* Disk layout information (static) */
170 unsigned int ns_blocksize_bits; 154 unsigned int ns_blocksize_bits;
155 unsigned int ns_blocksize;
171 unsigned long ns_nsegments; 156 unsigned long ns_nsegments;
172 unsigned long ns_blocks_per_segment; 157 unsigned long ns_blocks_per_segment;
173 unsigned long ns_r_segments_percentage; 158 unsigned long ns_r_segments_percentage;
@@ -178,9 +163,6 @@ struct the_nilfs {
178 u32 ns_crc_seed; 163 u32 ns_crc_seed;
179}; 164};
180 165
181#define NILFS_GCINODE_HASH_BITS 8
182#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
183
184#define THE_NILFS_FNS(bit, name) \ 166#define THE_NILFS_FNS(bit, name) \
185static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ 167static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
186{ \ 168{ \
@@ -201,65 +183,67 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
201THE_NILFS_FNS(GC_RUNNING, gc_running) 183THE_NILFS_FNS(GC_RUNNING, gc_running)
202THE_NILFS_FNS(SB_DIRTY, sb_dirty) 184THE_NILFS_FNS(SB_DIRTY, sb_dirty)
203 185
186/**
187 * struct nilfs_root - nilfs root object
188 * @cno: checkpoint number
189 * @rb_node: red-black tree node
190 * @count: refcount of this structure
191 * @nilfs: nilfs object
192 * @ifile: inode file
193 * @root: root inode
194 * @inodes_count: number of inodes
195 * @blocks_count: number of blocks (Reserved)
196 */
197struct nilfs_root {
198 __u64 cno;
199 struct rb_node rb_node;
200
201 atomic_t count;
202 struct the_nilfs *nilfs;
203 struct inode *ifile;
204
205 atomic_t inodes_count;
206 atomic_t blocks_count;
207};
208
209/* Special checkpoint number */
210#define NILFS_CPTREE_CURRENT_CNO 0
211
204/* Minimum interval of periodical update of superblocks (in seconds) */ 212/* Minimum interval of periodical update of superblocks (in seconds) */
205#define NILFS_SB_FREQ 10 213#define NILFS_SB_FREQ 10
206#define NILFS_ALTSB_FREQ 60 /* spare superblock */
207 214
208static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) 215static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
209{ 216{
210 u64 t = get_seconds(); 217 u64 t = get_seconds();
211 return t < nilfs->ns_sbwtime[0] || 218 return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ;
212 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
213} 219}
214 220
215static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) 221static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
216{ 222{
217 u64 t = get_seconds(); 223 int flip_bits = nilfs->ns_sbwcount & 0x0FL;
218 struct nilfs_super_block **sbp = nilfs->ns_sbp; 224 return (flip_bits != 0x08 && flip_bits != 0x0F);
219 return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
220} 225}
221 226
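The replacement predicate nilfs_sb_will_flip() is easiest to read by enumeration: ns_sbwcount is bumped on every super block write, and the predicate asks the writer to alternate targets except when the low nibble of the count is 0x08 or 0x0F. A small userspace demo of the same expression (purely illustrative; it shares only the arithmetic with the kernel):

	#include <stdio.h>

	int main(void)
	{
		unsigned int count;

		for (count = 0; count < 16; count++) {
			int flip_bits = count & 0x0F;
			int will_flip = (flip_bits != 0x08 && flip_bits != 0x0F);

			printf("sbwcount=%2u -> %s\n", count,
			       will_flip ? "flip to the other super block"
					 : "rewrite the same super block");
		}
		return 0;
	}

Running it shows 14 of every 16 writes flipping, so both on-disk copies stay reasonably fresh without every write hitting both locations.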
222void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 227void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
223struct the_nilfs *find_or_create_nilfs(struct block_device *); 228struct the_nilfs *alloc_nilfs(struct block_device *bdev);
224void put_nilfs(struct the_nilfs *); 229void destroy_nilfs(struct the_nilfs *nilfs);
225int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 230int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
226int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 231int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
227int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 232int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
228int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 233int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
234struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
235struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
236 __u64 cno);
237void nilfs_put_root(struct nilfs_root *root);
229struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 238struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
230int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
231int nilfs_near_disk_full(struct the_nilfs *); 239int nilfs_near_disk_full(struct the_nilfs *);
232void nilfs_fall_back_super_block(struct the_nilfs *); 240void nilfs_fall_back_super_block(struct the_nilfs *);
233void nilfs_swap_super_block(struct the_nilfs *); 241void nilfs_swap_super_block(struct the_nilfs *);
234 242
235 243
236static inline void get_nilfs(struct the_nilfs *nilfs) 244static inline void nilfs_get_root(struct nilfs_root *root)
237{
238 /* Caller must have at least one reference of the_nilfs. */
239 atomic_inc(&nilfs->ns_count);
240}
241
242static inline void
243nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
244{
245 down_write(&nilfs->ns_writer_sem);
246 nilfs->ns_writer = sbi;
247 up_write(&nilfs->ns_writer_sem);
248}
249
250static inline void
251nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
252{
253 down_write(&nilfs->ns_writer_sem);
254 if (sbi == nilfs->ns_writer)
255 nilfs->ns_writer = NULL;
256 up_write(&nilfs->ns_writer_sem);
257}
258
259static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
260{ 245{
261 if (atomic_dec_and_test(&sbi->s_count)) 246 atomic_inc(&root->count);
262 kfree(sbi);
263} 247}
264 248
265static inline int nilfs_valid_fs(struct the_nilfs *nilfs) 249static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
diff --git a/fs/no-block.c b/fs/no-block.c
index d269a93d346..6e40e42a43d 100644
--- a/fs/no-block.c
+++ b/fs/no-block.c
@@ -19,4 +19,5 @@ static int no_blkdev_open(struct inode * inode, struct file * filp)
19 19
20const struct file_operations def_blk_fops = { 20const struct file_operations def_blk_fops = {
21 .open = no_blkdev_open, 21 .open = no_blkdev_open,
22 .llseek = noop_llseek,
22}; 23};
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index dffbb0911d0..22c629eedd8 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,3 +3,4 @@ config FSNOTIFY
3 3
4source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
6source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 0922cc826c4..ae5f33a6d86 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,4 +1,6 @@
1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o 1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
2 mark.o vfsmount_mark.o
2 3
3obj-y += dnotify/ 4obj-y += dnotify/
4obj-y += inotify/ 5obj-y += inotify/
6obj-y += fanotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 7e54e52964d..3344bdd5506 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -29,17 +29,17 @@
29int dir_notify_enable __read_mostly = 1; 29int dir_notify_enable __read_mostly = 1;
30 30
31static struct kmem_cache *dnotify_struct_cache __read_mostly; 31static struct kmem_cache *dnotify_struct_cache __read_mostly;
32static struct kmem_cache *dnotify_mark_entry_cache __read_mostly; 32static struct kmem_cache *dnotify_mark_cache __read_mostly;
33static struct fsnotify_group *dnotify_group __read_mostly; 33static struct fsnotify_group *dnotify_group __read_mostly;
34static DEFINE_MUTEX(dnotify_mark_mutex); 34static DEFINE_MUTEX(dnotify_mark_mutex);
35 35
36/* 36/*
37 * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which 37 * dnotify will attach one of these to each inode (i_fsnotify_marks) which
38 * is being watched by dnotify. If multiple userspace applications are watching 38 * is being watched by dnotify. If multiple userspace applications are watching
39 * the same directory with dnotify their information is chained in dn 39 * the same directory with dnotify their information is chained in dn
40 */ 40 */
41struct dnotify_mark_entry { 41struct dnotify_mark {
42 struct fsnotify_mark_entry fsn_entry; 42 struct fsnotify_mark fsn_mark;
43 struct dnotify_struct *dn; 43 struct dnotify_struct *dn;
44}; 44};
45 45
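For context, struct dnotify_mark is the kernel side of the classic dnotify interface: a process opens a directory, registers interest with fcntl(F_NOTIFY), and is signalled (SIGIO by default) when a watched event occurs; each registration becomes one dnotify_struct chained onto the mark. A minimal userspace sketch of that interface (the watched path and the event mask are illustrative):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static volatile sig_atomic_t hit;

	static void on_sigio(int sig)
	{
		(void)sig;
		hit = 1;
	}

	int main(void)
	{
		int fd = open("/tmp", O_RDONLY);	/* directory to watch */

		if (fd < 0)
			return 1;
		signal(SIGIO, on_sigio);	/* install handler first */
		/* this fcntl lands in fcntl_dirnotify() below and chains
		 * a dnotify_struct onto the inode's dnotify mark */
		if (fcntl(fd, F_NOTIFY, DN_MODIFY | DN_CREATE | DN_MULTISHOT) < 0)
			return 1;
		while (!hit)
			pause();
		printf("directory changed\n");
		return 0;
	}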
@@ -51,27 +51,27 @@ struct dnotify_mark_entry {
51 * it calls the fsnotify function so it can update the set of all events relevant 51 * it calls the fsnotify function so it can update the set of all events relevant
52 * to this inode. 52 * to this inode.
53 */ 53 */
54static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry) 54static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
55{ 55{
56 __u32 new_mask, old_mask; 56 __u32 new_mask, old_mask;
57 struct dnotify_struct *dn; 57 struct dnotify_struct *dn;
58 struct dnotify_mark_entry *dnentry = container_of(entry, 58 struct dnotify_mark *dn_mark = container_of(fsn_mark,
59 struct dnotify_mark_entry, 59 struct dnotify_mark,
60 fsn_entry); 60 fsn_mark);
61 61
62 assert_spin_locked(&entry->lock); 62 assert_spin_locked(&fsn_mark->lock);
63 63
64 old_mask = entry->mask; 64 old_mask = fsn_mark->mask;
65 new_mask = 0; 65 new_mask = 0;
66 for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next) 66 for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
67 new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); 67 new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
68 entry->mask = new_mask; 68 fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
69 69
70 if (old_mask == new_mask) 70 if (old_mask == new_mask)
71 return; 71 return;
72 72
73 if (entry->inode) 73 if (fsn_mark->i.inode)
74 fsnotify_recalc_inode_mask(entry->inode); 74 fsnotify_recalc_inode_mask(fsn_mark->i.inode);
75} 75}
76 76
77/* 77/*
@@ -83,29 +83,25 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
83 * events. 83 * events.
84 */ 84 */
85static int dnotify_handle_event(struct fsnotify_group *group, 85static int dnotify_handle_event(struct fsnotify_group *group,
86 struct fsnotify_mark *inode_mark,
87 struct fsnotify_mark *vfsmount_mark,
86 struct fsnotify_event *event) 88 struct fsnotify_event *event)
87{ 89{
88 struct fsnotify_mark_entry *entry = NULL; 90 struct dnotify_mark *dn_mark;
89 struct dnotify_mark_entry *dnentry;
90 struct inode *to_tell; 91 struct inode *to_tell;
91 struct dnotify_struct *dn; 92 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 93 struct dnotify_struct **prev;
93 struct fown_struct *fown; 94 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; 95 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
95 96
96 to_tell = event->to_tell; 97 BUG_ON(vfsmount_mark);
97 98
98 spin_lock(&to_tell->i_lock); 99 to_tell = event->to_tell;
99 entry = fsnotify_find_mark_entry(group, to_tell);
100 spin_unlock(&to_tell->i_lock);
101 100
 102 /* unlikely since we already passed dnotify_should_send_event() */ 101 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
103 if (unlikely(!entry))
104 return 0;
105 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
106 102
107 spin_lock(&entry->lock); 103 spin_lock(&inode_mark->lock);
108 prev = &dnentry->dn; 104 prev = &dn_mark->dn;
109 while ((dn = *prev) != NULL) { 105 while ((dn = *prev) != NULL) {
110 if ((dn->dn_mask & test_mask) == 0) { 106 if ((dn->dn_mask & test_mask) == 0) {
111 prev = &dn->dn_next; 107 prev = &dn->dn_next;
@@ -118,12 +114,11 @@ static int dnotify_handle_event(struct fsnotify_group *group,
118 else { 114 else {
119 *prev = dn->dn_next; 115 *prev = dn->dn_next;
120 kmem_cache_free(dnotify_struct_cache, dn); 116 kmem_cache_free(dnotify_struct_cache, dn);
121 dnotify_recalc_inode_mask(entry); 117 dnotify_recalc_inode_mask(inode_mark);
122 } 118 }
123 } 119 }
124 120
125 spin_unlock(&entry->lock); 121 spin_unlock(&inode_mark->lock);
126 fsnotify_put_mark(entry);
127 122
128 return 0; 123 return 0;
129} 124}
@@ -133,44 +128,27 @@ static int dnotify_handle_event(struct fsnotify_group *group,
133 * userspace notification for that pair. 128 * userspace notification for that pair.
134 */ 129 */
135static bool dnotify_should_send_event(struct fsnotify_group *group, 130static bool dnotify_should_send_event(struct fsnotify_group *group,
136 struct inode *inode, __u32 mask) 131 struct inode *inode,
132 struct fsnotify_mark *inode_mark,
133 struct fsnotify_mark *vfsmount_mark,
134 __u32 mask, void *data, int data_type)
137{ 135{
138 struct fsnotify_mark_entry *entry;
139 bool send;
140
141 /* !dir_notify_enable should never get here, don't waste time checking
142 if (!dir_notify_enable)
143 return 0; */
144
145 /* not a dir, dnotify doesn't care */ 136 /* not a dir, dnotify doesn't care */
146 if (!S_ISDIR(inode->i_mode)) 137 if (!S_ISDIR(inode->i_mode))
147 return false; 138 return false;
148 139
149 spin_lock(&inode->i_lock); 140 return true;
150 entry = fsnotify_find_mark_entry(group, inode);
151 spin_unlock(&inode->i_lock);
152
153 /* no mark means no dnotify watch */
154 if (!entry)
155 return false;
156
157 mask = (mask & ~FS_EVENT_ON_CHILD);
158 send = (mask & entry->mask);
159
160 fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
161
162 return send;
163} 141}
164 142
165static void dnotify_free_mark(struct fsnotify_mark_entry *entry) 143static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
166{ 144{
167 struct dnotify_mark_entry *dnentry = container_of(entry, 145 struct dnotify_mark *dn_mark = container_of(fsn_mark,
168 struct dnotify_mark_entry, 146 struct dnotify_mark,
169 fsn_entry); 147 fsn_mark);
170 148
171 BUG_ON(dnentry->dn); 149 BUG_ON(dn_mark->dn);
172 150
173 kmem_cache_free(dnotify_mark_entry_cache, dnentry); 151 kmem_cache_free(dnotify_mark_cache, dn_mark);
174} 152}
175 153
176static struct fsnotify_ops dnotify_fsnotify_ops = { 154static struct fsnotify_ops dnotify_fsnotify_ops = {
@@ -183,15 +161,15 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
183 161
184/* 162/*
185 * Called every time a file is closed. Looks first for a dnotify mark on the 163 * Called every time a file is closed. Looks first for a dnotify mark on the
186 * inode. If one is found run all of the ->dn entries attached to that 164 * inode. If one is found run all of the ->dn structures attached to that
187 * mark for one relevant to this process closing the file and remove that 165 * mark for one relevant to this process closing the file and remove that
188 * dnotify_struct. If that was the last dnotify_struct also remove the 166 * dnotify_struct. If that was the last dnotify_struct also remove the
189 * fsnotify_mark_entry. 167 * fsnotify_mark.
190 */ 168 */
191void dnotify_flush(struct file *filp, fl_owner_t id) 169void dnotify_flush(struct file *filp, fl_owner_t id)
192{ 170{
193 struct fsnotify_mark_entry *entry; 171 struct fsnotify_mark *fsn_mark;
194 struct dnotify_mark_entry *dnentry; 172 struct dnotify_mark *dn_mark;
195 struct dnotify_struct *dn; 173 struct dnotify_struct *dn;
196 struct dnotify_struct **prev; 174 struct dnotify_struct **prev;
197 struct inode *inode; 175 struct inode *inode;
@@ -200,38 +178,34 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
200 if (!S_ISDIR(inode->i_mode)) 178 if (!S_ISDIR(inode->i_mode))
201 return; 179 return;
202 180
203 spin_lock(&inode->i_lock); 181 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
204 entry = fsnotify_find_mark_entry(dnotify_group, inode); 182 if (!fsn_mark)
205 spin_unlock(&inode->i_lock);
206 if (!entry)
207 return; 183 return;
208 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); 184 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
209 185
210 mutex_lock(&dnotify_mark_mutex); 186 mutex_lock(&dnotify_mark_mutex);
211 187
212 spin_lock(&entry->lock); 188 spin_lock(&fsn_mark->lock);
213 prev = &dnentry->dn; 189 prev = &dn_mark->dn;
214 while ((dn = *prev) != NULL) { 190 while ((dn = *prev) != NULL) {
215 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { 191 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
216 *prev = dn->dn_next; 192 *prev = dn->dn_next;
217 kmem_cache_free(dnotify_struct_cache, dn); 193 kmem_cache_free(dnotify_struct_cache, dn);
218 dnotify_recalc_inode_mask(entry); 194 dnotify_recalc_inode_mask(fsn_mark);
219 break; 195 break;
220 } 196 }
221 prev = &dn->dn_next; 197 prev = &dn->dn_next;
222 } 198 }
223 199
224 spin_unlock(&entry->lock); 200 spin_unlock(&fsn_mark->lock);
225 201
226 /* nothing else could have found us thanks to the dnotify_mark_mutex */ 202 /* nothing else could have found us thanks to the dnotify_mark_mutex */
227 if (dnentry->dn == NULL) 203 if (dn_mark->dn == NULL)
228 fsnotify_destroy_mark_by_entry(entry); 204 fsnotify_destroy_mark(fsn_mark);
229
230 fsnotify_recalc_group_mask(dnotify_group);
231 205
232 mutex_unlock(&dnotify_mark_mutex); 206 mutex_unlock(&dnotify_mark_mutex);
233 207
234 fsnotify_put_mark(entry); 208 fsnotify_put_mark(fsn_mark);
235} 209}
236 210
237/* this conversion is done only at watch creation */ 211/* this conversion is done only at watch creation */
@@ -259,16 +233,16 @@ static __u32 convert_arg(unsigned long arg)
259 233
260/* 234/*
261 * If multiple processes watch the same inode with dnotify there is only one 235 * If multiple processes watch the same inode with dnotify there is only one
262 * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct 236 * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
263 * onto that mark. This function either attaches the new dnotify_struct onto 237 * onto that mark. This function either attaches the new dnotify_struct onto
 264 * that list, or it |= the mask onto an existing dnotify_struct. 238 * that list, or it |= the mask onto an existing dnotify_struct.
265 */ 239 */
266static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry, 240static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
267 fl_owner_t id, int fd, struct file *filp, __u32 mask) 241 fl_owner_t id, int fd, struct file *filp, __u32 mask)
268{ 242{
269 struct dnotify_struct *odn; 243 struct dnotify_struct *odn;
270 244
271 odn = dnentry->dn; 245 odn = dn_mark->dn;
272 while (odn != NULL) { 246 while (odn != NULL) {
 273 /* adding more events to existing dnotify_struct? */ 247 /* adding more events to existing dnotify_struct? */
274 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { 248 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
@@ -283,8 +257,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
283 dn->dn_fd = fd; 257 dn->dn_fd = fd;
284 dn->dn_filp = filp; 258 dn->dn_filp = filp;
285 dn->dn_owner = id; 259 dn->dn_owner = id;
286 dn->dn_next = dnentry->dn; 260 dn->dn_next = dn_mark->dn;
287 dnentry->dn = dn; 261 dn_mark->dn = dn;
288 262
289 return 0; 263 return 0;
290} 264}
@@ -296,8 +270,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
296 */ 270 */
297int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) 271int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
298{ 272{
299 struct dnotify_mark_entry *new_dnentry, *dnentry; 273 struct dnotify_mark *new_dn_mark, *dn_mark;
300 struct fsnotify_mark_entry *new_entry, *entry; 274 struct fsnotify_mark *new_fsn_mark, *fsn_mark;
301 struct dnotify_struct *dn; 275 struct dnotify_struct *dn;
302 struct inode *inode; 276 struct inode *inode;
303 fl_owner_t id = current->files; 277 fl_owner_t id = current->files;
@@ -306,7 +280,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
306 __u32 mask; 280 __u32 mask;
307 281
308 /* we use these to tell if we need to kfree */ 282 /* we use these to tell if we need to kfree */
309 new_entry = NULL; 283 new_fsn_mark = NULL;
310 dn = NULL; 284 dn = NULL;
311 285
312 if (!dir_notify_enable) { 286 if (!dir_notify_enable) {
@@ -336,8 +310,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
336 } 310 }
337 311
338 /* new fsnotify mark, we expect most fcntl calls to add a new mark */ 312 /* new fsnotify mark, we expect most fcntl calls to add a new mark */
339 new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL); 313 new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
340 if (!new_dnentry) { 314 if (!new_dn_mark) {
341 error = -ENOMEM; 315 error = -ENOMEM;
342 goto out_err; 316 goto out_err;
343 } 317 }
@@ -345,29 +319,27 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
345 /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ 319 /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
346 mask = convert_arg(arg); 320 mask = convert_arg(arg);
347 321
348 /* set up the new_entry and new_dnentry */ 322 /* set up the new_fsn_mark and new_dn_mark */
349 new_entry = &new_dnentry->fsn_entry; 323 new_fsn_mark = &new_dn_mark->fsn_mark;
350 fsnotify_init_mark(new_entry, dnotify_free_mark); 324 fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
351 new_entry->mask = mask; 325 new_fsn_mark->mask = mask;
352 new_dnentry->dn = NULL; 326 new_dn_mark->dn = NULL;
353 327
354 /* this is needed to prevent the fcntl/close race described below */ 328 /* this is needed to prevent the fcntl/close race described below */
355 mutex_lock(&dnotify_mark_mutex); 329 mutex_lock(&dnotify_mark_mutex);
356 330
357 /* add the new_entry or find an old one. */ 331 /* add the new_fsn_mark or find an old one. */
358 spin_lock(&inode->i_lock); 332 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
359 entry = fsnotify_find_mark_entry(dnotify_group, inode); 333 if (fsn_mark) {
360 spin_unlock(&inode->i_lock); 334 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
361 if (entry) { 335 spin_lock(&fsn_mark->lock);
362 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
363 spin_lock(&entry->lock);
364 } else { 336 } else {
365 fsnotify_add_mark(new_entry, dnotify_group, inode); 337 fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
366 spin_lock(&new_entry->lock); 338 spin_lock(&new_fsn_mark->lock);
367 entry = new_entry; 339 fsn_mark = new_fsn_mark;
368 dnentry = new_dnentry; 340 dn_mark = new_dn_mark;
369 /* we used new_entry, so don't free it */ 341 /* we used new_fsn_mark, so don't free it */
370 new_entry = NULL; 342 new_fsn_mark = NULL;
371 } 343 }
372 344
373 rcu_read_lock(); 345 rcu_read_lock();
@@ -376,17 +348,17 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
376 348
377 /* if (f != filp) means that we lost a race and another task/thread 349 /* if (f != filp) means that we lost a race and another task/thread
378 * actually closed the fd we are still playing with before we grabbed 350 * actually closed the fd we are still playing with before we grabbed
379 * the dnotify_mark_mutex and entry->lock. Since closing the fd is the 351 * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the
380 * only time we clean up the mark entries we need to get our mark off 352 * only time we clean up the marks we need to get our mark off
381 * the list. */ 353 * the list. */
382 if (f != filp) { 354 if (f != filp) {
383 /* if we added ourselves, shoot ourselves, it's possible that 355 /* if we added ourselves, shoot ourselves, it's possible that
384 * the flush actually did shoot this entry. That's fine too 356 * the flush actually did shoot this fsn_mark. That's fine too
385 * since multiple calls to destroy_mark is perfectly safe, if 357 * since multiple calls to destroy_mark is perfectly safe, if
386 * we found a dnentry already attached to the inode, just sod 358 * we found a dn_mark already attached to the inode, just sod
387 * off silently as the flush at close time dealt with it. 359 * off silently as the flush at close time dealt with it.
388 */ 360 */
389 if (dnentry == new_dnentry) 361 if (dn_mark == new_dn_mark)
390 destroy = 1; 362 destroy = 1;
391 goto out; 363 goto out;
392 } 364 }
@@ -394,13 +366,13 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
394 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); 366 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
395 if (error) { 367 if (error) {
396 /* if we added, we must shoot */ 368 /* if we added, we must shoot */
397 if (dnentry == new_dnentry) 369 if (dn_mark == new_dn_mark)
398 destroy = 1; 370 destroy = 1;
399 goto out; 371 goto out;
400 } 372 }
401 373
402 error = attach_dn(dn, dnentry, id, fd, filp, mask); 374 error = attach_dn(dn, dn_mark, id, fd, filp, mask);
403 /* !error means that we attached the dn to the dnentry, so don't free it */ 375 /* !error means that we attached the dn to the dn_mark, so don't free it */
404 if (!error) 376 if (!error)
405 dn = NULL; 377 dn = NULL;
406 /* -EEXIST means that we didn't add this new dn and used an old one. 378 /* -EEXIST means that we didn't add this new dn and used an old one.
@@ -408,20 +380,18 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
408 else if (error == -EEXIST) 380 else if (error == -EEXIST)
409 error = 0; 381 error = 0;
410 382
411 dnotify_recalc_inode_mask(entry); 383 dnotify_recalc_inode_mask(fsn_mark);
412out: 384out:
413 spin_unlock(&entry->lock); 385 spin_unlock(&fsn_mark->lock);
414 386
415 if (destroy) 387 if (destroy)
416 fsnotify_destroy_mark_by_entry(entry); 388 fsnotify_destroy_mark(fsn_mark);
417
418 fsnotify_recalc_group_mask(dnotify_group);
419 389
420 mutex_unlock(&dnotify_mark_mutex); 390 mutex_unlock(&dnotify_mark_mutex);
421 fsnotify_put_mark(entry); 391 fsnotify_put_mark(fsn_mark);
422out_err: 392out_err:
423 if (new_entry) 393 if (new_fsn_mark)
424 fsnotify_put_mark(new_entry); 394 fsnotify_put_mark(new_fsn_mark);
425 if (dn) 395 if (dn)
426 kmem_cache_free(dnotify_struct_cache, dn); 396 kmem_cache_free(dnotify_struct_cache, dn);
427 return error; 397 return error;
@@ -430,10 +400,9 @@ out_err:
430static int __init dnotify_init(void) 400static int __init dnotify_init(void)
431{ 401{
432 dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); 402 dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
433 dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC); 403 dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
434 404
435 dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM, 405 dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
436 0, &dnotify_fsnotify_ops);
437 if (IS_ERR(dnotify_group)) 406 if (IS_ERR(dnotify_group))
438 panic("unable to allocate fsnotify group for dnotify\n"); 407 panic("unable to allocate fsnotify group for dnotify\n");
439 return 0; 408 return 0;
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
new file mode 100644
index 00000000000..3ac36b7bf6b
--- /dev/null
+++ b/fs/notify/fanotify/Kconfig
@@ -0,0 +1,26 @@
1config FANOTIFY
2 bool "Filesystem wide access notification"
3 select FSNOTIFY
4 select ANON_INODES
5 default n
6 ---help---
 7 Say Y here to enable fanotify support. fanotify is a file access
 8 notification system which differs from inotify in that it sends
 9 an open file descriptor to the userspace listener along with
10 the event.
11
12 If unsure, say Y.
13
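As a concrete illustration of the fd-passing design described in the help text, here is a sketch of a userspace listener. It is written against the fanotify API as it later shipped in glibc's <sys/fanotify.h>; flag names and the exact ABI at this stage of development may differ.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/fanotify.h>
	#include <unistd.h>

	int main(void)
	{
		struct fanotify_event_metadata buf[16];
		ssize_t len;
		int fd;

		fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);
		if (fd < 0)
			return 1;
		/* watch open events on everything below the mount of "/" */
		if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
				  FAN_OPEN, AT_FDCWD, "/") < 0)
			return 1;
		len = read(fd, buf, sizeof(buf));
		if (len >= (ssize_t)sizeof(buf[0])) {
			/* unlike inotify, the event carries an open fd
			 * on the object that was accessed */
			printf("mask=%llx fd=%d\n",
			       (unsigned long long)buf[0].mask, buf[0].fd);
			close(buf[0].fd);
		}
		close(fd);
		return 0;
	}

Note the listener needs CAP_SYS_ADMIN, and a production loop would iterate with FAN_EVENT_OK()/FAN_EVENT_NEXT() rather than reading a single event as done here.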
14config FANOTIFY_ACCESS_PERMISSIONS
15 bool "fanotify permissions checking"
16 depends on FANOTIFY
17 depends on SECURITY
18 default n
19 ---help---
 20 Say Y here if you want fanotify listeners to be able to make permission
21 decisions concerning filesystem events. This is used by some fanotify
22 listeners which need to scan files before allowing the system access to
23 use those files. This is used by some anti-malware vendors and by some
 24 hierarchical storage management systems.
25
26 If unsure, say N.
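A sketch of the permission flow this option enables: the listener receives a FAN_OPEN_PERM event and answers it by writing a struct fanotify_response back to the fanotify fd, which is what releases the access_waitq wait in fanotify.c below. This assumes a group created with content-class semantics and marked for FAN_OPEN_PERM elsewhere; as above, the names follow the API as it eventually stabilized.

	#include <fcntl.h>
	#include <sys/fanotify.h>
	#include <unistd.h>

	/* Answer one permission request; fanotify_fd comes from fanotify_init(). */
	static void answer_one(int fanotify_fd)
	{
		struct fanotify_event_metadata ev;
		struct fanotify_response resp;

		if (read(fanotify_fd, &ev, sizeof(ev)) < (ssize_t)sizeof(ev))
			return;
		if (ev.mask & FAN_OPEN_PERM) {
			resp.fd = ev.fd;
			resp.response = FAN_ALLOW;	/* or FAN_DENY */
			(void)write(fanotify_fd, &resp, sizeof(resp));
		}
		close(ev.fd);	/* the event's fd must always be closed */
	}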
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
new file mode 100644
index 00000000000..0999213e7e6
--- /dev/null
+++ b/fs/notify/fanotify/Makefile
@@ -0,0 +1 @@
1obj-$(CONFIG_FANOTIFY) += fanotify.o fanotify_user.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
new file mode 100644
index 00000000000..b04f88eed09
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.c
@@ -0,0 +1,224 @@
1#include <linux/fanotify.h>
2#include <linux/fdtable.h>
3#include <linux/fsnotify_backend.h>
4#include <linux/init.h>
5#include <linux/jiffies.h>
6#include <linux/kernel.h> /* UINT_MAX */
7#include <linux/mount.h>
8#include <linux/sched.h>
9#include <linux/types.h>
10#include <linux/wait.h>
11
12static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
13{
14 pr_debug("%s: old=%p new=%p\n", __func__, old, new);
15
16 if (old->to_tell == new->to_tell &&
17 old->data_type == new->data_type &&
18 old->tgid == new->tgid) {
19 switch (old->data_type) {
20 case (FSNOTIFY_EVENT_PATH):
21 if ((old->path.mnt == new->path.mnt) &&
22 (old->path.dentry == new->path.dentry))
23 return true;
24 case (FSNOTIFY_EVENT_NONE):
25 return true;
26 default:
27 BUG();
28 };
29 }
30 return false;
31}
32
33/* and the list better be locked by something too! */
34static struct fsnotify_event *fanotify_merge(struct list_head *list,
35 struct fsnotify_event *event)
36{
37 struct fsnotify_event_holder *test_holder;
38 struct fsnotify_event *test_event = NULL;
39 struct fsnotify_event *new_event;
40
41 pr_debug("%s: list=%p event=%p\n", __func__, list, event);
42
43
44 list_for_each_entry_reverse(test_holder, list, event_list) {
45 if (should_merge(test_holder->event, event)) {
46 test_event = test_holder->event;
47 break;
48 }
49 }
50
51 if (!test_event)
52 return NULL;
53
54 fsnotify_get_event(test_event);
55
56 /* if they are exactly the same we are done */
57 if (test_event->mask == event->mask)
58 return test_event;
59
60 /*
61 * if the refcnt == 2 this is the only queue
62 * for this event and so we can update the mask
63 * in place.
64 */
65 if (atomic_read(&test_event->refcnt) == 2) {
66 test_event->mask |= event->mask;
67 return test_event;
68 }
69
70 new_event = fsnotify_clone_event(test_event);
71
72 /* done with test_event */
73 fsnotify_put_event(test_event);
74
75 /* couldn't allocate memory, merge was not possible */
76 if (unlikely(!new_event))
77 return ERR_PTR(-ENOMEM);
78
79 /* build new event and replace it on the list */
80 new_event->mask = (test_event->mask | event->mask);
81 fsnotify_replace_event(test_holder, new_event);
82
83 /* we hold a reference on new_event from clone_event */
84 return new_event;
85}
86
87#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
88static int fanotify_get_response_from_access(struct fsnotify_group *group,
89 struct fsnotify_event *event)
90{
91 int ret;
92
93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
94
95 wait_event(group->fanotify_data.access_waitq, event->response);
96
97 /* userspace responded, convert to something usable */
98 spin_lock(&event->lock);
99 switch (event->response) {
100 case FAN_ALLOW:
101 ret = 0;
102 break;
103 case FAN_DENY:
104 default:
105 ret = -EPERM;
106 }
107 event->response = 0;
108 spin_unlock(&event->lock);
109
110 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
111 group, event, ret);
112
113 return ret;
114}
115#endif
116
117static int fanotify_handle_event(struct fsnotify_group *group,
118 struct fsnotify_mark *inode_mark,
119 struct fsnotify_mark *fanotify_mark,
120 struct fsnotify_event *event)
121{
122 int ret = 0;
123 struct fsnotify_event *notify_event = NULL;
124
125 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
126 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
127 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
128 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
129 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
130 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
131 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
132 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
133 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
134 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
135
136 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
137
138 notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
139 if (IS_ERR(notify_event))
140 return PTR_ERR(notify_event);
141
142#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
143 if (event->mask & FAN_ALL_PERM_EVENTS) {
144 /* if we merged we need to wait on the new event */
145 if (notify_event)
146 event = notify_event;
147 ret = fanotify_get_response_from_access(group, event);
148 }
149#endif
150
151 if (notify_event)
152 fsnotify_put_event(notify_event);
153
154 return ret;
155}
156
157static bool fanotify_should_send_event(struct fsnotify_group *group,
158 struct inode *to_tell,
159 struct fsnotify_mark *inode_mark,
160 struct fsnotify_mark *vfsmnt_mark,
161 __u32 event_mask, void *data, int data_type)
162{
163 __u32 marks_mask, marks_ignored_mask;
164 struct path *path = data;
165
166 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
167 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
168 inode_mark, vfsmnt_mark, event_mask, data, data_type);
169
170 /* if we don't have enough info to send an event to userspace say no */
171 if (data_type != FSNOTIFY_EVENT_PATH)
172 return false;
173
174 /* sorry, fanotify only gives a damn about files and dirs */
175 if (!S_ISREG(path->dentry->d_inode->i_mode) &&
176 !S_ISDIR(path->dentry->d_inode->i_mode))
177 return false;
178
179 if (inode_mark && vfsmnt_mark) {
180 marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
181 marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
182 } else if (inode_mark) {
183 /*
184 * if the event is for a child and this inode doesn't care about
185 * events on the child, don't send it!
186 */
187 if ((event_mask & FS_EVENT_ON_CHILD) &&
188 !(inode_mark->mask & FS_EVENT_ON_CHILD))
189 return false;
190 marks_mask = inode_mark->mask;
191 marks_ignored_mask = inode_mark->ignored_mask;
192 } else if (vfsmnt_mark) {
193 marks_mask = vfsmnt_mark->mask;
194 marks_ignored_mask = vfsmnt_mark->ignored_mask;
195 } else {
196 BUG();
197 }
198
199 if (S_ISDIR(path->dentry->d_inode->i_mode) &&
200 (marks_ignored_mask & FS_ISDIR))
201 return false;
202
203 if (event_mask & marks_mask & ~marks_ignored_mask)
204 return true;
205
206 return false;
207}
208
209static void fanotify_free_group_priv(struct fsnotify_group *group)
210{
211 struct user_struct *user;
212
213 user = group->fanotify_data.user;
214 atomic_dec(&user->fanotify_listeners);
215 free_uid(user);
216}
217
218const struct fsnotify_ops fanotify_fsnotify_ops = {
219 .handle_event = fanotify_handle_event,
220 .should_send_event = fanotify_should_send_event,
221 .free_group_priv = fanotify_free_group_priv,
222 .free_event_priv = NULL,
223 .freeing_mark = NULL,
224};
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
new file mode 100644
index 00000000000..063224812b7
--- /dev/null
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -0,0 +1,870 @@
1#include <linux/fanotify.h>
2#include <linux/fcntl.h>
3#include <linux/file.h>
4#include <linux/fs.h>
5#include <linux/anon_inodes.h>
6#include <linux/fsnotify_backend.h>
7#include <linux/init.h>
8#include <linux/mount.h>
9#include <linux/namei.h>
10#include <linux/poll.h>
11#include <linux/security.h>
12#include <linux/syscalls.h>
13#include <linux/slab.h>
14#include <linux/types.h>
15#include <linux/uaccess.h>
16
17#include <asm/ioctls.h>
18
19#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
20#define FANOTIFY_DEFAULT_MAX_MARKS 8192
21#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
22
23extern const struct fsnotify_ops fanotify_fsnotify_ops;
24
25static struct kmem_cache *fanotify_mark_cache __read_mostly;
26static struct kmem_cache *fanotify_response_event_cache __read_mostly;
27
28struct fanotify_response_event {
29 struct list_head list;
30 __s32 fd;
31 struct fsnotify_event *event;
32};
33
34/*
35 * Get an fsnotify notification event if one exists and is small
36 * enough to fit in "count". Return an error pointer if the count
37 * is not large enough.
38 *
39 * Called with the group->notification_mutex held.
40 */
41static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
42 size_t count)
43{
44 BUG_ON(!mutex_is_locked(&group->notification_mutex));
45
46 pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
47
48 if (fsnotify_notify_queue_is_empty(group))
49 return NULL;
50
51 if (FAN_EVENT_METADATA_LEN > count)
52 return ERR_PTR(-EINVAL);
53
54	/* we held the notification_mutex the whole time, so this is the
55	 * same event we peeked above */
56 return fsnotify_remove_notify_event(group);
57}
58
59static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
60{
61 int client_fd;
62 struct dentry *dentry;
63 struct vfsmount *mnt;
64 struct file *new_file;
65
66 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
67
68 client_fd = get_unused_fd();
69 if (client_fd < 0)
70 return client_fd;
71
72 if (event->data_type != FSNOTIFY_EVENT_PATH) {
73 WARN_ON(1);
74 put_unused_fd(client_fd);
75 return -EINVAL;
76 }
77
78 /*
79 * we need a new file handle for the userspace program so it can read even if it was
80 * originally opened O_WRONLY.
81 */
82 dentry = dget(event->path.dentry);
83 mnt = mntget(event->path.mnt);
84	/* it's possible this event was an overflow event; in that case dentry and mnt
85	 * are NULL. That's fine, just don't call dentry_open() */
86 if (dentry && mnt)
87 new_file = dentry_open(dentry, mnt,
88 group->fanotify_data.f_flags | FMODE_NONOTIFY,
89 current_cred());
90 else
91 new_file = ERR_PTR(-EOVERFLOW);
92 if (IS_ERR(new_file)) {
93 /*
94 * we still send an event even if we can't open the file. this
95 * can happen when say tasks are gone and we try to open their
96 * /proc files or we try to open a WRONLY file like in sysfs
97 * we just send the errno to userspace since there isn't much
98 * else we can do.
99 */
100 put_unused_fd(client_fd);
101 client_fd = PTR_ERR(new_file);
102 } else {
103 fd_install(client_fd, new_file);
104 }
105
106 return client_fd;
107}
108
109static ssize_t fill_event_metadata(struct fsnotify_group *group,
110 struct fanotify_event_metadata *metadata,
111 struct fsnotify_event *event)
112{
113 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
114 group, metadata, event);
115
116 metadata->event_len = FAN_EVENT_METADATA_LEN;
117 metadata->vers = FANOTIFY_METADATA_VERSION;
118 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
119 metadata->pid = pid_vnr(event->tgid);
120 metadata->fd = create_fd(group, event);
121
122 return metadata->fd;
123}
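
Because fill_event_metadata() mints an open fd for each event, a consumer can recover the object's path through /proc/self/fd. A hedged sketch (print_event_path() is a hypothetical helper; the readlink() idiom is standard):

/* hypothetical userspace sketch -- not part of this patch */
#include <limits.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

static void print_event_path(const struct fanotify_event_metadata *ev)
{
	char link[64], path[PATH_MAX];
	ssize_t len;

	if (ev->fd < 0)		/* open failed; fd carries a negative errno */
		return;
	snprintf(link, sizeof(link), "/proc/self/fd/%d", ev->fd);
	len = readlink(link, path, sizeof(path) - 1);
	if (len > 0) {
		path[len] = '\0';
		printf("pid %d touched %s\n", (int)ev->pid, path);
	}
}
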
124
125#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
126static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group,
127 __s32 fd)
128{
129 struct fanotify_response_event *re, *return_re = NULL;
130
131 mutex_lock(&group->fanotify_data.access_mutex);
132 list_for_each_entry(re, &group->fanotify_data.access_list, list) {
133 if (re->fd != fd)
134 continue;
135
136 list_del_init(&re->list);
137 return_re = re;
138 break;
139 }
140 mutex_unlock(&group->fanotify_data.access_mutex);
141
142 pr_debug("%s: found return_re=%p\n", __func__, return_re);
143
144 return return_re;
145}
146
147static int process_access_response(struct fsnotify_group *group,
148 struct fanotify_response *response_struct)
149{
150 struct fanotify_response_event *re;
151 __s32 fd = response_struct->fd;
152 __u32 response = response_struct->response;
153
154 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
155 fd, response);
156 /*
157	 * make sure the response is valid; if invalid we do nothing and either
158	 * userspace can send a valid response or we will clean it up after the
159	 * timeout
160 */
161 switch (response) {
162 case FAN_ALLOW:
163 case FAN_DENY:
164 break;
165 default:
166 return -EINVAL;
167 }
168
169 if (fd < 0)
170 return -EINVAL;
171
172 re = dequeue_re(group, fd);
173 if (!re)
174 return -ENOENT;
175
176 re->event->response = response;
177
178 wake_up(&group->fanotify_data.access_waitq);
179
180 kmem_cache_free(fanotify_response_event_cache, re);
181
182 return 0;
183}
184
185static int prepare_for_access_response(struct fsnotify_group *group,
186 struct fsnotify_event *event,
187 __s32 fd)
188{
189 struct fanotify_response_event *re;
190
191 if (!(event->mask & FAN_ALL_PERM_EVENTS))
192 return 0;
193
194 re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
195 if (!re)
196 return -ENOMEM;
197
198 re->event = event;
199 re->fd = fd;
200
201 mutex_lock(&group->fanotify_data.access_mutex);
202
203 if (group->fanotify_data.bypass_perm) {
204 mutex_unlock(&group->fanotify_data.access_mutex);
205 kmem_cache_free(fanotify_response_event_cache, re);
206 event->response = FAN_ALLOW;
207 return 0;
208 }
209
210 list_add_tail(&re->list, &group->fanotify_data.access_list);
211 mutex_unlock(&group->fanotify_data.access_mutex);
212
213 return 0;
214}
215
216static void remove_access_response(struct fsnotify_group *group,
217 struct fsnotify_event *event,
218 __s32 fd)
219{
220 struct fanotify_response_event *re;
221
222 if (!(event->mask & FAN_ALL_PERM_EVENTS))
223 return;
224
225 re = dequeue_re(group, fd);
226 if (!re)
227 return;
228
229 BUG_ON(re->event != event);
230
231 kmem_cache_free(fanotify_response_event_cache, re);
232
233 return;
234}
235#else
236static int prepare_for_access_response(struct fsnotify_group *group,
237 struct fsnotify_event *event,
238 __s32 fd)
239{
240 return 0;
241}
242
243static void remove_access_response(struct fsnotify_group *group,
244 struct fsnotify_event *event,
245 __s32 fd)
246{
247 return;
248}
249#endif
250
251static ssize_t copy_event_to_user(struct fsnotify_group *group,
252 struct fsnotify_event *event,
253 char __user *buf)
254{
255 struct fanotify_event_metadata fanotify_event_metadata;
256 int fd, ret;
257
258 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
259
260 fd = fill_event_metadata(group, &fanotify_event_metadata, event);
261 if (fd < 0)
262 return fd;
263
264 ret = prepare_for_access_response(group, event, fd);
265 if (ret)
266 goto out_close_fd;
267
268 ret = -EFAULT;
269 if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
270 goto out_kill_access_response;
271
272 return FAN_EVENT_METADATA_LEN;
273
274out_kill_access_response:
275 remove_access_response(group, event, fd);
276out_close_fd:
277 sys_close(fd);
278 return ret;
279}
280
281/* fanotify userspace file descriptor functions */
282static unsigned int fanotify_poll(struct file *file, poll_table *wait)
283{
284 struct fsnotify_group *group = file->private_data;
285 int ret = 0;
286
287 poll_wait(file, &group->notification_waitq, wait);
288 mutex_lock(&group->notification_mutex);
289 if (!fsnotify_notify_queue_is_empty(group))
290 ret = POLLIN | POLLRDNORM;
291 mutex_unlock(&group->notification_mutex);
292
293 return ret;
294}
295
296static ssize_t fanotify_read(struct file *file, char __user *buf,
297 size_t count, loff_t *pos)
298{
299 struct fsnotify_group *group;
300 struct fsnotify_event *kevent;
301 char __user *start;
302 int ret;
303 DEFINE_WAIT(wait);
304
305 start = buf;
306 group = file->private_data;
307
308 pr_debug("%s: group=%p\n", __func__, group);
309
310 while (1) {
311 prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
312
313 mutex_lock(&group->notification_mutex);
314 kevent = get_one_event(group, count);
315 mutex_unlock(&group->notification_mutex);
316
317 if (kevent) {
318 ret = PTR_ERR(kevent);
319 if (IS_ERR(kevent))
320 break;
321 ret = copy_event_to_user(group, kevent, buf);
322 fsnotify_put_event(kevent);
323 if (ret < 0)
324 break;
325 buf += ret;
326 count -= ret;
327 continue;
328 }
329
330 ret = -EAGAIN;
331 if (file->f_flags & O_NONBLOCK)
332 break;
333 ret = -ERESTARTSYS;
334 if (signal_pending(current))
335 break;
336
337 if (start != buf)
338 break;
339
340 schedule();
341 }
342
343 finish_wait(&group->notification_waitq, &wait);
344 if (start != buf && ret != -EFAULT)
345 ret = buf - start;
346 return ret;
347}
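
fanotify_read() packs as many fixed-size events as fit in the caller's buffer, so a consumer typically reads into a large buffer and walks it by event_len. A hypothetical walker (field names as in this patch; the alignment attribute is a userspace nicety):

/* hypothetical userspace sketch -- not part of this patch */
#include <sys/fanotify.h>
#include <unistd.h>

static void drain(int fanotify_fd)
{
	char buf[4096] __attribute__((aligned(8)));
	ssize_t len = read(fanotify_fd, buf, sizeof(buf));
	char *p = buf;

	while (len >= (ssize_t)FAN_EVENT_METADATA_LEN) {
		struct fanotify_event_metadata *ev = (void *)p;

		/* event_len is fixed at FAN_EVENT_METADATA_LEN in this version */
		if (ev->event_len < FAN_EVENT_METADATA_LEN ||
		    (ssize_t)ev->event_len > len)
			break;	/* malformed or truncated event */
		if (ev->fd >= 0)
			close(ev->fd);
		len -= ev->event_len;
		p += ev->event_len;
	}
}
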
348
349static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
350{
351#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
352 struct fanotify_response response = { .fd = -1, .response = -1 };
353 struct fsnotify_group *group;
354 int ret;
355
356 group = file->private_data;
357
358 if (count > sizeof(response))
359 count = sizeof(response);
360
361 pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
362
363 if (copy_from_user(&response, buf, count))
364 return -EFAULT;
365
366 ret = process_access_response(group, &response);
367 if (ret < 0)
368 count = ret;
369
370 return count;
371#else
372 return -EINVAL;
373#endif
374}
375
376static int fanotify_release(struct inode *ignored, struct file *file)
377{
378 struct fsnotify_group *group = file->private_data;
379
380#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
381 struct fanotify_response_event *re, *lre;
382
383 mutex_lock(&group->fanotify_data.access_mutex);
384
385 group->fanotify_data.bypass_perm = true;
386
387 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
388 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
389 re, re->event);
390
391 list_del_init(&re->list);
392 re->event->response = FAN_ALLOW;
393
394 kmem_cache_free(fanotify_response_event_cache, re);
395 }
396 mutex_unlock(&group->fanotify_data.access_mutex);
397
398 wake_up(&group->fanotify_data.access_waitq);
399#endif
400 /* matches the fanotify_init->fsnotify_alloc_group */
401 fsnotify_put_group(group);
402
403 return 0;
404}
405
406static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
407{
408 struct fsnotify_group *group;
409 struct fsnotify_event_holder *holder;
410 void __user *p;
411 int ret = -ENOTTY;
412 size_t send_len = 0;
413
414 group = file->private_data;
415
416 p = (void __user *) arg;
417
418 switch (cmd) {
419 case FIONREAD:
420 mutex_lock(&group->notification_mutex);
421 list_for_each_entry(holder, &group->notification_list, event_list)
422 send_len += FAN_EVENT_METADATA_LEN;
423 mutex_unlock(&group->notification_mutex);
424 ret = put_user(send_len, (int __user *) p);
425 break;
426 }
427
428 return ret;
429}
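
FIONREAD is the only ioctl handled: it reports the queue depth in bytes, one FAN_EVENT_METADATA_LEN per pending event. A caller might size its read buffer with it (sketch; queued_bytes() is hypothetical):

/* hypothetical userspace sketch -- not part of this patch */
#include <sys/ioctl.h>

static int queued_bytes(int fanotify_fd)
{
	int n = 0;

	/* counts FAN_EVENT_METADATA_LEN per queued event, as above */
	if (ioctl(fanotify_fd, FIONREAD, &n) < 0)
		return -1;
	return n;
}
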
430
431static const struct file_operations fanotify_fops = {
432 .poll = fanotify_poll,
433 .read = fanotify_read,
434 .write = fanotify_write,
435 .fasync = NULL,
436 .release = fanotify_release,
437 .unlocked_ioctl = fanotify_ioctl,
438 .compat_ioctl = fanotify_ioctl,
439 .llseek = noop_llseek,
440};
441
442static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
443{
444 kmem_cache_free(fanotify_mark_cache, fsn_mark);
445}
446
447static int fanotify_find_path(int dfd, const char __user *filename,
448 struct path *path, unsigned int flags)
449{
450 int ret;
451
452 pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
453 dfd, filename, flags);
454
455 if (filename == NULL) {
456 struct file *file;
457 int fput_needed;
458
459 ret = -EBADF;
460 file = fget_light(dfd, &fput_needed);
461 if (!file)
462 goto out;
463
464 ret = -ENOTDIR;
465 if ((flags & FAN_MARK_ONLYDIR) &&
466 !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) {
467 fput_light(file, fput_needed);
468 goto out;
469 }
470
471 *path = file->f_path;
472 path_get(path);
473 fput_light(file, fput_needed);
474 } else {
475 unsigned int lookup_flags = 0;
476
477 if (!(flags & FAN_MARK_DONT_FOLLOW))
478 lookup_flags |= LOOKUP_FOLLOW;
479 if (flags & FAN_MARK_ONLYDIR)
480 lookup_flags |= LOOKUP_DIRECTORY;
481
482 ret = user_path_at(dfd, filename, lookup_flags, path);
483 if (ret)
484 goto out;
485 }
486
487 /* you can only watch an inode if you have read permissions on it */
488 ret = inode_permission(path->dentry->d_inode, MAY_READ);
489 if (ret)
490 path_put(path);
491out:
492 return ret;
493}
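
fanotify_find_path() gives fanotify_mark() openat()-style addressing: a NULL pathname marks the object dfd itself refers to, otherwise dfd anchors the lookup. Hedged illustrations of the three styles (assuming the later glibc fanotify_mark() wrapper):

/* hypothetical userspace sketch -- not part of this patch */
#include <fcntl.h>
#include <stddef.h>
#include <sys/fanotify.h>

void mark_three_ways(int fan_fd, int dir_fd)
{
	/* absolute path: dfd is ignored */
	fanotify_mark(fan_fd, FAN_MARK_ADD, FAN_OPEN, AT_FDCWD, "/etc/passwd");
	/* relative path: resolved against dir_fd, like openat() */
	fanotify_mark(fan_fd, FAN_MARK_ADD, FAN_OPEN, dir_fd, "notes.txt");
	/* NULL pathname: mark whatever dir_fd itself refers to */
	fanotify_mark(fan_fd, FAN_MARK_ADD, FAN_OPEN, dir_fd, NULL);
}
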
494
495static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
496 __u32 mask,
497 unsigned int flags)
498{
499 __u32 oldmask;
500
501 spin_lock(&fsn_mark->lock);
502 if (!(flags & FAN_MARK_IGNORED_MASK)) {
503 oldmask = fsn_mark->mask;
504 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
505 } else {
506 oldmask = fsn_mark->ignored_mask;
507 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
508 }
509 spin_unlock(&fsn_mark->lock);
510
511 if (!(oldmask & ~mask))
512 fsnotify_destroy_mark(fsn_mark);
513
514 return mask & oldmask;
515}
516
517static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
518 struct vfsmount *mnt, __u32 mask,
519 unsigned int flags)
520{
521 struct fsnotify_mark *fsn_mark = NULL;
522 __u32 removed;
523
524 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
525 if (!fsn_mark)
526 return -ENOENT;
527
528 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
529 fsnotify_put_mark(fsn_mark);
530 if (removed & mnt->mnt_fsnotify_mask)
531 fsnotify_recalc_vfsmount_mask(mnt);
532
533 return 0;
534}
535
536static int fanotify_remove_inode_mark(struct fsnotify_group *group,
537 struct inode *inode, __u32 mask,
538 unsigned int flags)
539{
540 struct fsnotify_mark *fsn_mark = NULL;
541 __u32 removed;
542
543 fsn_mark = fsnotify_find_inode_mark(group, inode);
544 if (!fsn_mark)
545 return -ENOENT;
546
547 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
548 /* matches the fsnotify_find_inode_mark() */
549 fsnotify_put_mark(fsn_mark);
550 if (removed & inode->i_fsnotify_mask)
551 fsnotify_recalc_inode_mask(inode);
552
553 return 0;
554}
555
556static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
557 __u32 mask,
558 unsigned int flags)
559{
560 __u32 oldmask = -1;
561
562 spin_lock(&fsn_mark->lock);
563 if (!(flags & FAN_MARK_IGNORED_MASK)) {
564 oldmask = fsn_mark->mask;
565 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
566 } else {
567 __u32 tmask = fsn_mark->ignored_mask | mask;
568 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
569 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
570 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
571 }
572
573 if (!(flags & FAN_MARK_ONDIR)) {
574 __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
575 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
576 }
577
578 spin_unlock(&fsn_mark->lock);
579
580 return mask & ~oldmask;
581}
582
583static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
584 struct vfsmount *mnt, __u32 mask,
585 unsigned int flags)
586{
587 struct fsnotify_mark *fsn_mark;
588 __u32 added;
589
590 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
591 if (!fsn_mark) {
592 int ret;
593
594 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
595 return -ENOSPC;
596
597 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
598 if (!fsn_mark)
599 return -ENOMEM;
600
601 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
602 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
603 if (ret) {
604 fanotify_free_mark(fsn_mark);
605 return ret;
606 }
607 }
608 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
609 fsnotify_put_mark(fsn_mark);
610 if (added & ~mnt->mnt_fsnotify_mask)
611 fsnotify_recalc_vfsmount_mask(mnt);
612
613 return 0;
614}
615
616static int fanotify_add_inode_mark(struct fsnotify_group *group,
617 struct inode *inode, __u32 mask,
618 unsigned int flags)
619{
620 struct fsnotify_mark *fsn_mark;
621 __u32 added;
622
623 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
624
625 /*
626 * If some other task has this inode open for write we should not add
627 * an ignored mark, unless that ignored mark is supposed to survive
628 * modification changes anyway.
629 */
630 if ((flags & FAN_MARK_IGNORED_MASK) &&
631 !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
632 (atomic_read(&inode->i_writecount) > 0))
633 return 0;
634
635 fsn_mark = fsnotify_find_inode_mark(group, inode);
636 if (!fsn_mark) {
637 int ret;
638
639 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
640 return -ENOSPC;
641
642 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
643 if (!fsn_mark)
644 return -ENOMEM;
645
646 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
647 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
648 if (ret) {
649 fanotify_free_mark(fsn_mark);
650 return ret;
651 }
652 }
653 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
654 fsnotify_put_mark(fsn_mark);
655 if (added & ~inode->i_fsnotify_mask)
656 fsnotify_recalc_inode_mask(inode);
657 return 0;
658}
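
The i_writecount check above exists for the classic self-exclusion case: a listener that writes its own log file adds an ignored mark so its own traffic is filtered out. A hedged sketch (path and helper are hypothetical):

/* hypothetical userspace sketch -- not part of this patch */
#include <fcntl.h>
#include <sys/fanotify.h>

int ignore_own_log(int fan_fd)
{
	/* survive-modify keeps the ignore even while we write the file */
	return fanotify_mark(fan_fd,
			     FAN_MARK_ADD | FAN_MARK_IGNORED_MASK |
			     FAN_MARK_IGNORED_SURV_MODIFY,
			     FAN_MODIFY | FAN_OPEN, AT_FDCWD,
			     "/var/log/mylistener.log");
}
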
659
660/* fanotify syscalls */
661SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
662{
663 struct fsnotify_group *group;
664 int f_flags, fd;
665 struct user_struct *user;
666
667 pr_debug("%s: flags=%d event_f_flags=%d\n",
668 __func__, flags, event_f_flags);
669
670 if (!capable(CAP_SYS_ADMIN))
671 return -EPERM;
672
673 if (flags & ~FAN_ALL_INIT_FLAGS)
674 return -EINVAL;
675
676 user = get_current_user();
677 if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
678 free_uid(user);
679 return -EMFILE;
680 }
681
682 f_flags = O_RDWR | FMODE_NONOTIFY;
683 if (flags & FAN_CLOEXEC)
684 f_flags |= O_CLOEXEC;
685 if (flags & FAN_NONBLOCK)
686 f_flags |= O_NONBLOCK;
687
688 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
689 group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
690 if (IS_ERR(group))
691 return PTR_ERR(group);
692
693 group->fanotify_data.user = user;
694 atomic_inc(&user->fanotify_listeners);
695
696 group->fanotify_data.f_flags = event_f_flags;
697#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
698 mutex_init(&group->fanotify_data.access_mutex);
699 init_waitqueue_head(&group->fanotify_data.access_waitq);
700 INIT_LIST_HEAD(&group->fanotify_data.access_list);
701#endif
702 switch (flags & FAN_ALL_CLASS_BITS) {
703 case FAN_CLASS_NOTIF:
704 group->priority = FS_PRIO_0;
705 break;
706 case FAN_CLASS_CONTENT:
707 group->priority = FS_PRIO_1;
708 break;
709 case FAN_CLASS_PRE_CONTENT:
710 group->priority = FS_PRIO_2;
711 break;
712 default:
713 fd = -EINVAL;
714 goto out_put_group;
715 }
716
717 if (flags & FAN_UNLIMITED_QUEUE) {
718 fd = -EPERM;
719 if (!capable(CAP_SYS_ADMIN))
720 goto out_put_group;
721 group->max_events = UINT_MAX;
722 } else {
723 group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
724 }
725
726 if (flags & FAN_UNLIMITED_MARKS) {
727 fd = -EPERM;
728 if (!capable(CAP_SYS_ADMIN))
729 goto out_put_group;
730 group->fanotify_data.max_marks = UINT_MAX;
731 } else {
732 group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
733 }
734
735 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
736 if (fd < 0)
737 goto out_put_group;
738
739 return fd;
740
741out_put_group:
742 fsnotify_put_group(group);
743 return fd;
744}
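
Creating a group from userspace then looks roughly like this (hypothetical helper; CAP_SYS_ADMIN is required, and FAN_CLASS_CONTENT selects FS_PRIO_1 so permission events are permitted):

/* hypothetical userspace sketch -- not part of this patch */
#include <fcntl.h>
#include <sys/fanotify.h>

int make_group(void)
{
	/* event_f_flags become the open flags of each event's fd */
	return fanotify_init(FAN_CLOEXEC | FAN_CLASS_CONTENT, O_RDONLY);
}
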
745
746SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
747 __u64 mask, int dfd,
748 const char __user * pathname)
749{
750 struct inode *inode = NULL;
751 struct vfsmount *mnt = NULL;
752 struct fsnotify_group *group;
753 struct file *filp;
754 struct path path;
755 int ret, fput_needed;
756
757 pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
758 __func__, fanotify_fd, flags, dfd, pathname, mask);
759
760 /* we only use the lower 32 bits as of right now. */
761 if (mask & ((__u64)0xffffffff << 32))
762 return -EINVAL;
763
764 if (flags & ~FAN_ALL_MARK_FLAGS)
765 return -EINVAL;
766 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
767 case FAN_MARK_ADD:
768 case FAN_MARK_REMOVE:
769 case FAN_MARK_FLUSH:
770 break;
771 default:
772 return -EINVAL;
773 }
774
775 if (mask & FAN_ONDIR) {
776 flags |= FAN_MARK_ONDIR;
777 mask &= ~FAN_ONDIR;
778 }
779
780#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
781 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
782#else
783 if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD))
784#endif
785 return -EINVAL;
786
787 filp = fget_light(fanotify_fd, &fput_needed);
788 if (unlikely(!filp))
789 return -EBADF;
790
791 /* verify that this is indeed an fanotify instance */
792 ret = -EINVAL;
793 if (unlikely(filp->f_op != &fanotify_fops))
794 goto fput_and_out;
795 group = filp->private_data;
796
797 /*
798 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
799	 * allowed to set permission events.
800 */
801 ret = -EINVAL;
802 if (mask & FAN_ALL_PERM_EVENTS &&
803 group->priority == FS_PRIO_0)
804 goto fput_and_out;
805
806 ret = fanotify_find_path(dfd, pathname, &path, flags);
807 if (ret)
808 goto fput_and_out;
809
810 /* inode held in place by reference to path; group by fget on fd */
811 if (!(flags & FAN_MARK_MOUNT))
812 inode = path.dentry->d_inode;
813 else
814 mnt = path.mnt;
815
816 /* create/update an inode mark */
817 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
818 case FAN_MARK_ADD:
819 if (flags & FAN_MARK_MOUNT)
820 ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
821 else
822 ret = fanotify_add_inode_mark(group, inode, mask, flags);
823 break;
824 case FAN_MARK_REMOVE:
825 if (flags & FAN_MARK_MOUNT)
826 ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
827 else
828 ret = fanotify_remove_inode_mark(group, inode, mask, flags);
829 break;
830 case FAN_MARK_FLUSH:
831 if (flags & FAN_MARK_MOUNT)
832 fsnotify_clear_vfsmount_marks_by_group(group);
833 else
834 fsnotify_clear_inode_marks_by_group(group);
835 break;
836 default:
837 ret = -EINVAL;
838 }
839
840 path_put(&path);
841fput_and_out:
842 fput_light(filp, fput_needed);
843 return ret;
844}
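
With FAN_MARK_MOUNT the mark lands on path.mnt rather than the inode, which is how whole-filesystem scanners subscribe. A hedged sketch (mount point hypothetical):

/* hypothetical userspace sketch -- not part of this patch */
#include <fcntl.h>
#include <sys/fanotify.h>

int watch_home_mount(int fan_fd)
{
	/* FAN_MARK_MOUNT targets path.mnt instead of path.dentry->d_inode */
	return fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
			     FAN_OPEN | FAN_CLOSE_WRITE, AT_FDCWD, "/home");
}
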
845
846#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
847asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask,
848 long dfd, long pathname)
849{
850 return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags,
851 mask, (int) dfd,
852 (const char __user *) pathname);
853}
854SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
855#endif
856
857/*
858 * fanotify_user_setup - Our initialization function. Note that we cannot return
859 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
860 * must result in panic().
861 */
862static int __init fanotify_user_setup(void)
863{
864 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
865 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
866 SLAB_PANIC);
867
868 return 0;
869}
870device_initcall(fanotify_user_setup);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fcc2f064af8..20dc218707c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -21,6 +21,7 @@
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/mount.h>
24#include <linux/srcu.h> 25#include <linux/srcu.h>
25 26
26#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
@@ -35,6 +36,11 @@ void __fsnotify_inode_delete(struct inode *inode)
35} 36}
36EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); 37EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
37 38
39void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
40{
41 fsnotify_clear_marks_by_mount(mnt);
42}
43
38/* 44/*
39 * Given an inode, first check if we care what happens to our children. Inotify 45 * Given an inode, first check if we care what happens to our children. Inotify
40 * and dnotify both tell their parents about events. If we care about any event 46 * and dnotify both tell their parents about events. If we care about any event
@@ -78,112 +84,225 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
78} 84}
79 85
80/* Notify this dentry's parent about a child's events. */ 86/* Notify this dentry's parent about a child's events. */
81void __fsnotify_parent(struct dentry *dentry, __u32 mask) 87int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
82{ 88{
83 struct dentry *parent; 89 struct dentry *parent;
84 struct inode *p_inode; 90 struct inode *p_inode;
85 bool send = false; 91 int ret = 0;
86 bool should_update_children = false; 92
93 if (!dentry)
94 dentry = path->dentry;
87 95
88 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) 96 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
89 return; 97 return 0;
90 98
91 spin_lock(&dentry->d_lock); 99 parent = dget_parent(dentry);
92 parent = dentry->d_parent;
93 p_inode = parent->d_inode; 100 p_inode = parent->d_inode;
94 101
95 if (fsnotify_inode_watches_children(p_inode)) { 102 if (unlikely(!fsnotify_inode_watches_children(p_inode)))
96 if (p_inode->i_fsnotify_mask & mask) { 103 __fsnotify_update_child_dentry_flags(p_inode);
97 dget(parent); 104 else if (p_inode->i_fsnotify_mask & mask) {
98 send = true;
99 }
100 } else {
101 /*
102 * The parent doesn't care about events on it's children but
103 * at least one child thought it did. We need to run all the
104 * children and update their d_flags to let them know p_inode
105 * doesn't care about them any more.
106 */
107 dget(parent);
108 should_update_children = true;
109 }
110
111 spin_unlock(&dentry->d_lock);
112
113 if (send) {
114 /* we are notifying a parent so come up with the new mask which 105 /* we are notifying a parent so come up with the new mask which
115 * specifies these are events which came from a child. */ 106 * specifies these are events which came from a child. */
116 mask |= FS_EVENT_ON_CHILD; 107 mask |= FS_EVENT_ON_CHILD;
117 108
118 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, 109 if (path)
119 dentry->d_name.name, 0); 110 ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
120 dput(parent); 111 dentry->d_name.name, 0);
112 else
113 ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
114 dentry->d_name.name, 0);
121 } 115 }
122 116
123 if (unlikely(should_update_children)) { 117 dput(parent);
124 __fsnotify_update_child_dentry_flags(p_inode); 118
125 dput(parent); 119 return ret;
126 }
127} 120}
128EXPORT_SYMBOL_GPL(__fsnotify_parent); 121EXPORT_SYMBOL_GPL(__fsnotify_parent);
129 122
123static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
124 struct fsnotify_mark *inode_mark,
125 struct fsnotify_mark *vfsmount_mark,
126 __u32 mask, void *data,
127 int data_is, u32 cookie,
128 const unsigned char *file_name,
129 struct fsnotify_event **event)
130{
131 struct fsnotify_group *group = NULL;
132 __u32 inode_test_mask = 0;
133 __u32 vfsmount_test_mask = 0;
134
135 if (unlikely(!inode_mark && !vfsmount_mark)) {
136 BUG();
137 return 0;
138 }
139
140 /* clear ignored on inode modification */
141 if (mask & FS_MODIFY) {
142 if (inode_mark &&
143 !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
144 inode_mark->ignored_mask = 0;
145 if (vfsmount_mark &&
146 !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
147 vfsmount_mark->ignored_mask = 0;
148 }
149
150 /* does the inode mark tell us to do something? */
151 if (inode_mark) {
152 group = inode_mark->group;
153 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
154 inode_test_mask &= inode_mark->mask;
155 inode_test_mask &= ~inode_mark->ignored_mask;
156 }
157
158 /* does the vfsmount_mark tell us to do something? */
159 if (vfsmount_mark) {
160 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
161 group = vfsmount_mark->group;
162 vfsmount_test_mask &= vfsmount_mark->mask;
163 vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
164 if (inode_mark)
165 vfsmount_test_mask &= ~inode_mark->ignored_mask;
166 }
167
168 pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
169 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
170 " data=%p data_is=%d cookie=%d event=%p\n",
171 __func__, group, to_tell, mnt, mask, inode_mark,
172 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
173 data_is, cookie, *event);
174
175 if (!inode_test_mask && !vfsmount_test_mask)
176 return 0;
177
178 if (group->ops->should_send_event(group, to_tell, inode_mark,
179 vfsmount_mark, mask, data,
180 data_is) == false)
181 return 0;
182
183 if (!*event) {
184 *event = fsnotify_create_event(to_tell, mask, data,
185 data_is, file_name,
186 cookie, GFP_KERNEL);
187 if (!*event)
188 return -ENOMEM;
189 }
190 return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
191}
192
130/* 193/*
131 * This is the main call to fsnotify. The VFS calls into hook specific functions 194 * This is the main call to fsnotify. The VFS calls into hook specific functions
132 * in linux/fsnotify.h. Those functions then in turn call here. Here will call 195 * in linux/fsnotify.h. Those functions then in turn call here. Here will call
133 * out to all of the registered fsnotify_group. Those groups can then use the 196 * out to all of the registered fsnotify_group. Those groups can then use the
134 * notification event in whatever means they feel necessary. 197 * notification event in whatever means they feel necessary.
135 */ 198 */
136void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) 199int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
200 const unsigned char *file_name, u32 cookie)
137{ 201{
138 struct fsnotify_group *group; 202 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
203 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
204 struct fsnotify_group *inode_group, *vfsmount_group;
139 struct fsnotify_event *event = NULL; 205 struct fsnotify_event *event = NULL;
140 int idx; 206 struct vfsmount *mnt;
207 int idx, ret = 0;
141 /* global tests shouldn't care about events on child only the specific event */ 208 /* global tests shouldn't care about events on child only the specific event */
142 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); 209 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
143 210
144 if (list_empty(&fsnotify_groups)) 211 if (data_is == FSNOTIFY_EVENT_PATH)
145 return; 212 mnt = ((struct path *)data)->mnt;
146 213 else
147 if (!(test_mask & fsnotify_mask)) 214 mnt = NULL;
148 return;
149 215
150 if (!(test_mask & to_tell->i_fsnotify_mask))
151 return;
152 /* 216 /*
153 * SRCU!! the groups list is very very much read only and the path is 217 * if this is a modify event we may need to clear the ignored masks
154 * very hot. The VAST majority of events are not going to need to do 218 * otherwise return if neither the inode nor the vfsmount care about
155 * anything other than walk the list so it's crazy to pre-allocate. 219 * this type of event.
156 */ 220 */
157 idx = srcu_read_lock(&fsnotify_grp_srcu); 221 if (!(mask & FS_MODIFY) &&
158 list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { 222 !(test_mask & to_tell->i_fsnotify_mask) &&
159 if (test_mask & group->mask) { 223 !(mnt && test_mask & mnt->mnt_fsnotify_mask))
160 if (!group->ops->should_send_event(group, to_tell, mask)) 224 return 0;
161 continue; 225
162 if (!event) { 226 idx = srcu_read_lock(&fsnotify_mark_srcu);
163 event = fsnotify_create_event(to_tell, mask, data, 227
164 data_is, file_name, cookie, 228 if ((mask & FS_MODIFY) ||
165 GFP_KERNEL); 229 (test_mask & to_tell->i_fsnotify_mask))
166 /* shit, we OOM'd and now we can't tell, maybe 230 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
167 * someday someone else will want to do something 231 &fsnotify_mark_srcu);
168 * here */ 232
169 if (!event) 233 if (mnt && ((mask & FS_MODIFY) ||
170 break; 234 (test_mask & mnt->mnt_fsnotify_mask))) {
171 } 235 vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
172 group->ops->handle_event(group, event); 236 &fsnotify_mark_srcu);
237 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
238 &fsnotify_mark_srcu);
239 }
240
241 while (inode_node || vfsmount_node) {
242 inode_group = vfsmount_group = NULL;
243
244 if (inode_node) {
245 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
246 struct fsnotify_mark, i.i_list);
247 inode_group = inode_mark->group;
248 }
249
250 if (vfsmount_node) {
251 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
252 struct fsnotify_mark, m.m_list);
253 vfsmount_group = vfsmount_mark->group;
254 }
255
256 if (inode_group > vfsmount_group) {
257 /* handle inode */
258 ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
259 data_is, cookie, file_name, &event);
260 /* we didn't use the vfsmount_mark */
261 vfsmount_group = NULL;
262 } else if (vfsmount_group > inode_group) {
263 ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
264 data_is, cookie, file_name, &event);
265 inode_group = NULL;
266 } else {
267 ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
268 mask, data, data_is, cookie, file_name,
269 &event);
173 } 270 }
271
272 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
273 goto out;
274
275 if (inode_group)
276 inode_node = srcu_dereference(inode_node->next,
277 &fsnotify_mark_srcu);
278 if (vfsmount_group)
279 vfsmount_node = srcu_dereference(vfsmount_node->next,
280 &fsnotify_mark_srcu);
174 } 281 }
175 srcu_read_unlock(&fsnotify_grp_srcu, idx); 282 ret = 0;
283out:
284 srcu_read_unlock(&fsnotify_mark_srcu, idx);
176 /* 285 /*
177 * fsnotify_create_event() took a reference so the event can't be cleaned 286 * fsnotify_create_event() took a reference so the event can't be cleaned
178 * up while we are still trying to add it to lists, drop that one. 287 * up while we are still trying to add it to lists, drop that one.
179 */ 288 */
180 if (event) 289 if (event)
181 fsnotify_put_event(event); 290 fsnotify_put_event(event);
291
292 return ret;
182} 293}
183EXPORT_SYMBOL_GPL(fsnotify); 294EXPORT_SYMBOL_GPL(fsnotify);
184 295
185static __init int fsnotify_init(void) 296static __init int fsnotify_init(void)
186{ 297{
187 return init_srcu_struct(&fsnotify_grp_srcu); 298 int ret;
299
300 BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
301
302 ret = init_srcu_struct(&fsnotify_mark_srcu);
303 if (ret)
304 panic("initializing fsnotify_mark_srcu");
305
306 return 0;
188} 307}
189subsys_initcall(fsnotify_init); 308core_initcall(fsnotify_init);
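
The while loop above is in effect a merge-join: both mark lists are kept sorted by group, so comparing inode_group against vfsmount_group pairs up the (at most) two marks one group holds and yields a single delivery per group. A standalone sketch of the same walk, with integers standing in for group pointers (hypothetical, not kernel code):

/* standalone sketch -- hypothetical, not kernel code */
#include <stdio.h>

/* both arrays sorted descending, with 0 playing the part of NULL */
static void merge_join(const int *igrp, int ni, const int *mgrp, int nm)
{
	int i = 0, m = 0;

	while (i < ni || m < nm) {
		int ig = (i < ni) ? igrp[i] : 0;
		int mg = (m < nm) ? mgrp[m] : 0;

		if (ig > mg)
			printf("group %d: inode mark only\n", ig);
		else if (mg > ig)
			printf("group %d: vfsmount mark only\n", mg);
		else
			printf("group %d: both marks, one delivery\n", ig);

		if (ig >= mg)	/* consumed the inode-side entry */
			i++;
		if (mg >= ig)	/* consumed the vfsmount-side entry */
			m++;
	}
}
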
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 4dc240824b2..85e7d2b431d 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,21 +6,34 @@
6#include <linux/srcu.h> 6#include <linux/srcu.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9/* protects reads of fsnotify_groups */
10extern struct srcu_struct fsnotify_grp_srcu;
11/* all groups which receive fsnotify events */
12extern struct list_head fsnotify_groups;
13/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
14extern __u32 fsnotify_mask;
15
16/* destroy all events sitting in this groups notification queue */ 9/* destroy all events sitting in this groups notification queue */
17extern void fsnotify_flush_notify(struct fsnotify_group *group); 10extern void fsnotify_flush_notify(struct fsnotify_group *group);
18 11
12/* protects reads of inode and vfsmount marks list */
13extern struct srcu_struct fsnotify_mark_srcu;
14
15extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
16 __u32 mask);
17/* add a mark to an inode */
18extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
19 struct fsnotify_group *group, struct inode *inode,
20 int allow_dups);
21/* add a mark to a vfsmount */
22extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
23 struct fsnotify_group *group, struct vfsmount *mnt,
24 int allow_dups);
25
19/* final kfree of a group */ 26/* final kfree of a group */
20extern void fsnotify_final_destroy_group(struct fsnotify_group *group); 27extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
21 28
29/* vfsmount specific destruction of a mark */
30extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
31/* inode specific destruction of a mark */
32extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
22/* run the list of all marks associated with inode and flag them to be freed */ 33/* run the list of all marks associated with inode and flag them to be freed */
23extern void fsnotify_clear_marks_by_inode(struct inode *inode); 34extern void fsnotify_clear_marks_by_inode(struct inode *inode);
35/* run the list of all marks associated with vfsmount and flag them to be freed */
36extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
24/* 37/*
25 * update the dentry->d_flags of all of inode's children to indicate if inode cares 38 * update the dentry->d_flags of all of inode's children to indicate if inode cares
26 * about events that happen to its children. 39 * about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 0e1677144bc..d309f38449c 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -28,64 +28,6 @@
28 28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30 30
31/* protects writes to fsnotify_groups and fsnotify_mask */
32static DEFINE_MUTEX(fsnotify_grp_mutex);
33/* protects reads while running the fsnotify_groups list */
34struct srcu_struct fsnotify_grp_srcu;
35/* all groups registered to receive filesystem notifications */
36LIST_HEAD(fsnotify_groups);
37/* bitwise OR of all events (FS_*) interesting to some group on this system */
38__u32 fsnotify_mask;
39
40/*
41 * When a new group registers or changes it's set of interesting events
42 * this function updates the fsnotify_mask to contain all interesting events
43 */
44void fsnotify_recalc_global_mask(void)
45{
46 struct fsnotify_group *group;
47 __u32 mask = 0;
48 int idx;
49
50 idx = srcu_read_lock(&fsnotify_grp_srcu);
51 list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
52 mask |= group->mask;
53 srcu_read_unlock(&fsnotify_grp_srcu, idx);
54 fsnotify_mask = mask;
55}
56
57/*
58 * Update the group->mask by running all of the marks associated with this
59 * group and finding the bitwise | of all of the mark->mask. If we change
60 * the group->mask we need to update the global mask of events interesting
61 * to the system.
62 */
63void fsnotify_recalc_group_mask(struct fsnotify_group *group)
64{
65 __u32 mask = 0;
66 __u32 old_mask = group->mask;
67 struct fsnotify_mark_entry *entry;
68
69 spin_lock(&group->mark_lock);
70 list_for_each_entry(entry, &group->mark_entries, g_list)
71 mask |= entry->mask;
72 spin_unlock(&group->mark_lock);
73
74 group->mask = mask;
75
76 if (old_mask != mask)
77 fsnotify_recalc_global_mask();
78}
79
80/*
81 * Take a reference to a group so things found under the fsnotify_grp_mutex
82 * can't get freed under us
83 */
84static void fsnotify_get_group(struct fsnotify_group *group)
85{
86 atomic_inc(&group->refcnt);
87}
88
89/* 31/*
90 * Final freeing of a group 32 * Final freeing of a group
91 */ 33 */
@@ -110,145 +52,53 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)
110 */ 52 */
111static void fsnotify_destroy_group(struct fsnotify_group *group) 53static void fsnotify_destroy_group(struct fsnotify_group *group)
112{ 54{
113 /* clear all inode mark entries for this group */ 55 /* clear all inode marks for this group */
114 fsnotify_clear_marks_by_group(group); 56 fsnotify_clear_marks_by_group(group);
115 57
58 synchronize_srcu(&fsnotify_mark_srcu);
59
116 /* past the point of no return, matches the initial value of 1 */ 60 /* past the point of no return, matches the initial value of 1 */
117 if (atomic_dec_and_test(&group->num_marks)) 61 if (atomic_dec_and_test(&group->num_marks))
118 fsnotify_final_destroy_group(group); 62 fsnotify_final_destroy_group(group);
119} 63}
120 64
121/* 65/*
122 * Remove this group from the global list of groups that will get events
123 * this can be done even if there are still references and things still using
124 * this group. This just stops the group from getting new events.
125 */
126static void __fsnotify_evict_group(struct fsnotify_group *group)
127{
128 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
129
130 if (group->on_group_list)
131 list_del_rcu(&group->group_list);
132 group->on_group_list = 0;
133}
134
135/*
136 * Called when a group is no longer interested in getting events. This can be
137 * used if a group is misbehaving or if for some reason a group should no longer
138 * get any filesystem events.
139 */
140void fsnotify_evict_group(struct fsnotify_group *group)
141{
142 mutex_lock(&fsnotify_grp_mutex);
143 __fsnotify_evict_group(group);
144 mutex_unlock(&fsnotify_grp_mutex);
145}
146
147/*
148 * Drop a reference to a group. Free it if it's through. 66 * Drop a reference to a group. Free it if it's through.
149 */ 67 */
150void fsnotify_put_group(struct fsnotify_group *group) 68void fsnotify_put_group(struct fsnotify_group *group)
151{ 69{
152 if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) 70 if (atomic_dec_and_test(&group->refcnt))
153 return; 71 fsnotify_destroy_group(group);
154
155 /*
156 * OK, now we know that there's no other users *and* we hold mutex,
157 * so no new references will appear
158 */
159 __fsnotify_evict_group(group);
160
161 /*
162 * now it's off the list, so the only thing we might care about is
163 * srcu access....
164 */
165 mutex_unlock(&fsnotify_grp_mutex);
166 synchronize_srcu(&fsnotify_grp_srcu);
167
168 /* and now it is really dead. _Nothing_ could be seeing it */
169 fsnotify_recalc_global_mask();
170 fsnotify_destroy_group(group);
171}
172
173/*
174 * Simply run the fsnotify_groups list and find a group which matches
175 * the given parameters. If a group is found we take a reference to that
176 * group.
177 */
178static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
179 const struct fsnotify_ops *ops)
180{
181 struct fsnotify_group *group_iter;
182 struct fsnotify_group *group = NULL;
183
184 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
185
186 list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
187 if (group_iter->group_num == group_num) {
188 if ((group_iter->mask == mask) &&
189 (group_iter->ops == ops)) {
190 fsnotify_get_group(group_iter);
191 group = group_iter;
192 } else
193 group = ERR_PTR(-EEXIST);
194 }
195 }
196 return group;
197} 72}
198 73
199/* 74/*
200 * Either finds an existing group which matches the group_num, mask, and ops or 75 * Create a new fsnotify_group and hold a reference for the group returned.
201 * creates a new group and adds it to the global group list. In either case we
202 * take a reference for the group returned.
203 */ 76 */
204struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, 77struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
205 const struct fsnotify_ops *ops)
206{ 78{
207 struct fsnotify_group *group, *tgroup; 79 struct fsnotify_group *group;
208 80
209 /* very low use, simpler locking if we just always alloc */ 81 group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
210 group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
211 if (!group) 82 if (!group)
212 return ERR_PTR(-ENOMEM); 83 return ERR_PTR(-ENOMEM);
213 84
 85	/* set to 0 when there are no external references to this group */
214 atomic_set(&group->refcnt, 1); 86 atomic_set(&group->refcnt, 1);
215 87 /*
216 group->on_group_list = 0; 88 * hits 0 when there are no external references AND no marks for
217 group->group_num = group_num; 89 * this group
218 group->mask = mask; 90 */
91 atomic_set(&group->num_marks, 1);
219 92
220 mutex_init(&group->notification_mutex); 93 mutex_init(&group->notification_mutex);
221 INIT_LIST_HEAD(&group->notification_list); 94 INIT_LIST_HEAD(&group->notification_list);
222 init_waitqueue_head(&group->notification_waitq); 95 init_waitqueue_head(&group->notification_waitq);
223 group->q_len = 0;
224 group->max_events = UINT_MAX; 96 group->max_events = UINT_MAX;
225 97
226 spin_lock_init(&group->mark_lock); 98 spin_lock_init(&group->mark_lock);
227 atomic_set(&group->num_marks, 0); 99 INIT_LIST_HEAD(&group->marks_list);
228 INIT_LIST_HEAD(&group->mark_entries);
229 100
230 group->ops = ops; 101 group->ops = ops;
231 102
232 mutex_lock(&fsnotify_grp_mutex);
233 tgroup = fsnotify_find_group(group_num, mask, ops);
234 if (tgroup) {
235 /* group already exists */
236 mutex_unlock(&fsnotify_grp_mutex);
237 /* destroy the new one we made */
238 fsnotify_put_group(group);
239 return tgroup;
240 }
241
242 /* group not found, add a new one */
243 list_add_rcu(&group->group_list, &fsnotify_groups);
244 group->on_group_list = 1;
245 /* being on the fsnotify_groups list holds one num_marks */
246 atomic_inc(&group->num_marks);
247
248 mutex_unlock(&fsnotify_grp_mutex);
249
250 if (mask)
251 fsnotify_recalc_global_mask();
252
253 return group; 103 return group;
254} 104}
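
With the global group list gone, a backend now allocates its group directly and owns the only reference; fanotify_init() earlier in this patch is the in-tree pattern. A minimal hypothetical sketch:

/* hypothetical in-kernel sketch -- not part of this patch */
#include <linux/err.h>
#include <linux/fsnotify_backend.h>

static struct fsnotify_group *example_group;

static int example_setup(const struct fsnotify_ops *ops)
{
	struct fsnotify_group *group;

	/* the returned reference is dropped later via fsnotify_put_group() */
	group = fsnotify_alloc_group(ops);
	if (IS_ERR(group))
		return PTR_ERR(group);

	group->max_events = 128;	/* cap the notification queue */
	example_group = group;
	return 0;
}
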
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 0399bcbe09c..4c29fcf557d 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -16,72 +16,6 @@
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */ 17 */
18 18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object holding the appropriete locks, can take a reference
27 * and the object itself is guarenteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * entry->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * entry->lock protects 2 things, entry->group and entry->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the mark_entries list anchored inside a given group
42 * and each entry is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
47 * given inode and each entry is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark form the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
72 * we grab the mark-> and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop our reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h> 19#include <linux/fs.h>
86#include <linux/init.h> 20#include <linux/init.h>
87#include <linux/kernel.h> 21#include <linux/kernel.h>
@@ -95,30 +29,19 @@
95#include <linux/fsnotify_backend.h> 29#include <linux/fsnotify_backend.h>
96#include "fsnotify.h" 30#include "fsnotify.h"
97 31
98void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
99{
100 atomic_inc(&entry->refcnt);
101}
102
103void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
104{
105 if (atomic_dec_and_test(&entry->refcnt))
106 entry->free_mark(entry);
107}
108
109/* 32/*
110 * Recalculate the mask of events relevant to a given inode locked. 33 * Recalculate the mask of events relevant to a given inode locked.
111 */ 34 */
112static void fsnotify_recalc_inode_mask_locked(struct inode *inode) 35static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
113{ 36{
114 struct fsnotify_mark_entry *entry; 37 struct fsnotify_mark *mark;
115 struct hlist_node *pos; 38 struct hlist_node *pos;
116 __u32 new_mask = 0; 39 __u32 new_mask = 0;
117 40
118 assert_spin_locked(&inode->i_lock); 41 assert_spin_locked(&inode->i_lock);
119 42
120 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) 43 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
121 new_mask |= entry->mask; 44 new_mask |= mark->mask;
122 inode->i_fsnotify_mask = new_mask; 45 inode->i_fsnotify_mask = new_mask;
123} 46}
124 47
@@ -135,107 +58,26 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
135 __fsnotify_update_child_dentry_flags(inode); 58 __fsnotify_update_child_dentry_flags(inode);
136} 59}
137 60
138/* 61void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
139 * Any time a mark is getting freed we end up here.
140 * The caller had better be holding a reference to this mark so we don't actually
141 * do the final put under the entry->lock
142 */
143void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
144{ 62{
145 struct fsnotify_group *group; 63 struct inode *inode = mark->i.inode;
146 struct inode *inode;
147 64
148 spin_lock(&entry->lock); 65 assert_spin_locked(&mark->lock);
66 assert_spin_locked(&mark->group->mark_lock);
149 67
150 group = entry->group;
151 inode = entry->inode;
152
153 BUG_ON(group && !inode);
154 BUG_ON(!group && inode);
155
156 /* if !group something else already marked this to die */
157 if (!group) {
158 spin_unlock(&entry->lock);
159 return;
160 }
161
162 /* 1 from caller and 1 for being on i_list/g_list */
163 BUG_ON(atomic_read(&entry->refcnt) < 2);
164
165 spin_lock(&group->mark_lock);
166 spin_lock(&inode->i_lock); 68 spin_lock(&inode->i_lock);
167 69
168 hlist_del_init(&entry->i_list); 70 hlist_del_init_rcu(&mark->i.i_list);
169 entry->inode = NULL; 71 mark->i.inode = NULL;
170
171 list_del_init(&entry->g_list);
172 entry->group = NULL;
173
174 fsnotify_put_mark(entry); /* for i_list and g_list */
175 72
176 /* 73 /*
177 * this mark is now off the inode->i_fsnotify_mark_entries list and we 74 * this mark is now off the inode->i_fsnotify_marks list and we
178 * hold the inode->i_lock, so this is the perfect time to update the 75 * hold the inode->i_lock, so this is the perfect time to update the
179 * inode->i_fsnotify_mask 76 * inode->i_fsnotify_mask
180 */ 77 */
181 fsnotify_recalc_inode_mask_locked(inode); 78 fsnotify_recalc_inode_mask_locked(inode);
182 79
183 spin_unlock(&inode->i_lock); 80 spin_unlock(&inode->i_lock);
184 spin_unlock(&group->mark_lock);
185 spin_unlock(&entry->lock);
186
187 /*
188 * Some groups like to know that marks are being freed. This is a
189 * callback to the group function to let it know that this entry
190 * is being freed.
191 */
192 if (group->ops->freeing_mark)
193 group->ops->freeing_mark(entry, group);
194
195 /*
196 * __fsnotify_update_child_dentry_flags(inode);
197 *
198 * I really want to call that, but we can't, we have no idea if the inode
199 * still exists the second we drop the entry->lock.
200 *
201 * The next time an event arrive to this inode from one of it's children
202 * __fsnotify_parent will see that the inode doesn't care about it's
203 * children and will update all of these flags then. So really this
204 * is just a lazy update (and could be a perf win...)
205 */
206
207
208 iput(inode);
209
210 /*
211 * it's possible that this group tried to destroy itself, but this
212 * this mark was simultaneously being freed by inode. If that's the
213 * case, we finish freeing the group here.
214 */
215 if (unlikely(atomic_dec_and_test(&group->num_marks)))
216 fsnotify_final_destroy_group(group);
217}
218
219/*
220 * Given a group, destroy all of the marks associated with that group.
221 */
222void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
223{
224 struct fsnotify_mark_entry *lentry, *entry;
225 LIST_HEAD(free_list);
226
227 spin_lock(&group->mark_lock);
228 list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
229 list_add(&entry->free_g_list, &free_list);
230 list_del_init(&entry->g_list);
231 fsnotify_get_mark(entry);
232 }
233 spin_unlock(&group->mark_lock);
234
235 list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
236 fsnotify_destroy_mark_by_entry(entry);
237 fsnotify_put_mark(entry);
238 }
239} 81}
240 82
241/* 83/*
@@ -243,113 +85,151 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
243 */ 85 */
244void fsnotify_clear_marks_by_inode(struct inode *inode) 86void fsnotify_clear_marks_by_inode(struct inode *inode)
245{ 87{
246 struct fsnotify_mark_entry *entry, *lentry; 88 struct fsnotify_mark *mark, *lmark;
247 struct hlist_node *pos, *n; 89 struct hlist_node *pos, *n;
248 LIST_HEAD(free_list); 90 LIST_HEAD(free_list);
249 91
250 spin_lock(&inode->i_lock); 92 spin_lock(&inode->i_lock);
251 hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { 93 hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
252 list_add(&entry->free_i_list, &free_list); 94 list_add(&mark->i.free_i_list, &free_list);
253 hlist_del_init(&entry->i_list); 95 hlist_del_init_rcu(&mark->i.i_list);
254 fsnotify_get_mark(entry); 96 fsnotify_get_mark(mark);
255 } 97 }
256 spin_unlock(&inode->i_lock); 98 spin_unlock(&inode->i_lock);
257 99
258 list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { 100 list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
259 fsnotify_destroy_mark_by_entry(entry); 101 fsnotify_destroy_mark(mark);
260 fsnotify_put_mark(entry); 102 fsnotify_put_mark(mark);
261 } 103 }
262} 104}
263 105
264/* 106/*
 107 * Given a group, clear all of the inode marks associated with that group.
108 */
109void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
110{
111 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
112}
113
114/*
265 * given a group and inode, find the mark associated with that combination. 115 * given a group and inode, find the mark associated with that combination.
266 * if found take a reference to that mark and return it, else return NULL 116 * if found take a reference to that mark and return it, else return NULL
267 */ 117 */
268struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, 118struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group,
269 struct inode *inode) 119 struct inode *inode)
270{ 120{
271 struct fsnotify_mark_entry *entry; 121 struct fsnotify_mark *mark;
272 struct hlist_node *pos; 122 struct hlist_node *pos;
273 123
274 assert_spin_locked(&inode->i_lock); 124 assert_spin_locked(&inode->i_lock);
275 125
276 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { 126 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
277 if (entry->group == group) { 127 if (mark->group == group) {
278 fsnotify_get_mark(entry); 128 fsnotify_get_mark(mark);
279 return entry; 129 return mark;
280 } 130 }
281 } 131 }
282 return NULL; 132 return NULL;
283} 133}
284 134
285/* 135/*
286 * Nothing fancy, just initialize lists and locks and counters. 136 * given a group and inode, find the mark associated with that combination.
137 * if found take a reference to that mark and return it, else return NULL
287 */ 138 */
288void fsnotify_init_mark(struct fsnotify_mark_entry *entry, 139struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
289 void (*free_mark)(struct fsnotify_mark_entry *entry)) 140 struct inode *inode)
141{
142 struct fsnotify_mark *mark;
143
144 spin_lock(&inode->i_lock);
145 mark = fsnotify_find_inode_mark_locked(group, inode);
146 spin_unlock(&inode->i_lock);
147
148 return mark;
149}
290 150
151/*
152 * If we are setting a mark mask on an inode mark we should pin the inode
153 * in memory.
154 */
155void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
156 __u32 mask)
291{ 157{
292 spin_lock_init(&entry->lock); 158 struct inode *inode;
293 atomic_set(&entry->refcnt, 1); 159
294 INIT_HLIST_NODE(&entry->i_list); 160 assert_spin_locked(&mark->lock);
295 entry->group = NULL; 161
296 entry->mask = 0; 162 if (mask &&
297 entry->inode = NULL; 163 mark->i.inode &&
298 entry->free_mark = free_mark; 164 !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
165 mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
166 inode = igrab(mark->i.inode);
167 /*
 168 * we shouldn't be able to get here if the inode wasn't
 169 * already safely held in memory, but BUG in case that
 170 * assumption is ever wrong.
171 */
172 BUG_ON(!inode);
173 }
299} 174}
300 175
301/* 176/*
302 * Attach an initialized mark entry to a given group and inode. 177 * Attach an initialized mark to a given inode.
 303 * These marks may be used by the fsnotify backend to determine which 178 * These marks may be used by the fsnotify backend to determine which
304 * event types should be delivered to which group and for which inodes. 179 * event types should be delivered to which group and for which inodes. These
180 * marks are ordered according to priority, highest number first, and then by
181 * the group's location in memory.
305 */ 182 */
306int fsnotify_add_mark(struct fsnotify_mark_entry *entry, 183int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
307 struct fsnotify_group *group, struct inode *inode) 184 struct fsnotify_group *group, struct inode *inode,
185 int allow_dups)
308{ 186{
309 struct fsnotify_mark_entry *lentry; 187 struct fsnotify_mark *lmark;
188 struct hlist_node *node, *last = NULL;
310 int ret = 0; 189 int ret = 0;
311 190
312 inode = igrab(inode); 191 mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
313 if (unlikely(!inode)) 192
314 return -EINVAL; 193 assert_spin_locked(&mark->lock);
194 assert_spin_locked(&group->mark_lock);
315 195
316 /*
317 * LOCKING ORDER!!!!
318 * entry->lock
319 * group->mark_lock
320 * inode->i_lock
321 */
322 spin_lock(&entry->lock);
323 spin_lock(&group->mark_lock);
324 spin_lock(&inode->i_lock); 196 spin_lock(&inode->i_lock);
325 197
326 lentry = fsnotify_find_mark_entry(group, inode); 198 mark->i.inode = inode;
327 if (!lentry) {
328 entry->group = group;
329 entry->inode = inode;
330 199
331 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); 200 /* is mark the first mark? */
332 list_add(&entry->g_list, &group->mark_entries); 201 if (hlist_empty(&inode->i_fsnotify_marks)) {
202 hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks);
203 goto out;
204 }
333 205
334 fsnotify_get_mark(entry); /* for i_list and g_list */ 206 /* should mark be in the middle of the current list? */
207 hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) {
208 last = node;
335 209
336 atomic_inc(&group->num_marks); 210 if ((lmark->group == group) && !allow_dups) {
211 ret = -EEXIST;
212 goto out;
213 }
337 214
338 fsnotify_recalc_inode_mask_locked(inode); 215 if (mark->group->priority < lmark->group->priority)
339 } 216 continue;
340 217
341 spin_unlock(&inode->i_lock); 218 if ((mark->group->priority == lmark->group->priority) &&
342 spin_unlock(&group->mark_lock); 219 (mark->group < lmark->group))
343 spin_unlock(&entry->lock); 220 continue;
344 221
345 if (lentry) { 222 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
346 ret = -EEXIST; 223 goto out;
347 iput(inode);
348 fsnotify_put_mark(lentry);
349 } else {
350 __fsnotify_update_child_dentry_flags(inode);
351 } 224 }
352 225
226 BUG_ON(last == NULL);
227 /* mark should be the last entry. last is the current last entry */
228 hlist_add_after_rcu(last, &mark->i.i_list);
229out:
230 fsnotify_recalc_inode_mask_locked(inode);
231 spin_unlock(&inode->i_lock);
232
353 return ret; 233 return ret;
354} 234}
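The loop above keeps each inode's mark list sorted: highest group priority first, ties broken by the group's address in memory. A minimal userspace sketch of the same ordering rule, assuming hypothetical group/mark types in place of the fsnotify structures:

#include <stddef.h>

/* hypothetical stand-ins for fsnotify_group and fsnotify_mark */
struct group { int priority; };
struct mark  { struct group *group; struct mark *prev, *next; };

/*
 * Insert 'new' into the circular list with sentinel 'head', ordered by
 * descending priority and then by descending group address, mirroring
 * the walk in fsnotify_add_inode_mark() above.
 */
static void insert_mark_ordered(struct mark *head, struct mark *new)
{
	struct mark *m = head->next;

	for (; m != head; m = m->next) {
		if (new->group->priority < m->group->priority)
			continue;		/* m outranks new: keep walking */
		if (new->group->priority == m->group->priority &&
		    new->group < m->group)
			continue;		/* tie broken on group address */
		break;				/* insert before m */
	}
	new->prev = m->prev;
	new->next = m;
	m->prev->next = new;
	m->prev = new;
}

An empty list degenerates to inserting directly after the sentinel, which is the hlist_empty() fast path in the kernel version.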
355 235
@@ -365,15 +245,16 @@ void fsnotify_unmount_inodes(struct list_head *list)
365{ 245{
366 struct inode *inode, *next_i, *need_iput = NULL; 246 struct inode *inode, *next_i, *need_iput = NULL;
367 247
248 spin_lock(&inode_lock);
368 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
369 struct inode *need_iput_tmp; 250 struct inode *need_iput_tmp;
370 251
371 /* 252 /*
372 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, 253 * We cannot __iget() an inode in state I_FREEING,
373 * I_WILL_FREE, or I_NEW which is fine because by that point 254 * I_WILL_FREE, or I_NEW which is fine because by that point
374 * the inode cannot have any associated watches. 255 * the inode cannot have any associated watches.
375 */ 256 */
376 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) 257 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
377 continue; 258 continue;
378 259
379 /* 260 /*
@@ -397,7 +278,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
397 /* In case the dropping of a reference would nuke next_i. */ 278 /* In case the dropping of a reference would nuke next_i. */
398 if ((&next_i->i_sb_list != list) && 279 if ((&next_i->i_sb_list != list) &&
399 atomic_read(&next_i->i_count) && 280 atomic_read(&next_i->i_count) &&
400 !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { 281 !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
401 __iget(next_i); 282 __iget(next_i);
402 need_iput = next_i; 283 need_iput = next_i;
403 } 284 }
@@ -422,4 +303,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
422 303
423 spin_lock(&inode_lock); 304 spin_lock(&inode_lock);
424 } 305 }
306 spin_unlock(&inode_lock);
425} 307}
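The need_iput juggling above exists because the loop must drop inode_lock to call code that can block, while an iput() on the current inode could otherwise free the very list element the iterator stands on. A sketch of the underlying pattern, with a hypothetical refcounted node list guarded by a pthread mutex standing in for inode_lock:

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; int refcnt; /* both guarded by list_lock */ };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

extern void visit(struct node *n);	/* may block; called without the lock */

static void put_node(struct node *n)
{
	int dead;

	pthread_mutex_lock(&list_lock);
	dead = (--n->refcnt == 0);
	pthread_mutex_unlock(&list_lock);
	if (dead)
		free(n);	/* assumes the list unlinked n when refcnt hit 0 */
}

static void walk_pinned(struct node *head)
{
	struct node *n, *next;

	pthread_mutex_lock(&list_lock);
	if (head)
		head->refcnt++;		/* pin the starting element */
	n = head;
	while (n) {
		next = n->next;
		if (next)
			next->refcnt++;	/* keep the successor alive too */
		pthread_mutex_unlock(&list_lock);

		visit(n);		/* safe to block: n and next are pinned */
		put_node(n);		/* may free n; next is still pinned */

		pthread_mutex_lock(&list_lock);
		n = next;		/* we still hold a reference on it */
	}
	pthread_mutex_unlock(&list_lock);
}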
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b3a159b21cf..b981fc0c837 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,18 +1,3 @@
1config INOTIFY
2 bool "Inotify file change notification support"
3 default n
4 ---help---
 5 Say Y here to enable legacy in-kernel inotify support. Inotify is a
 6 file change notification system. It is a replacement for dnotify.
 7 This option only provides the legacy in-kernel inotify API. There
 8 are no in-tree kernel users of this interface since it is deprecated.
 9 You only need this if you are loading an out-of-tree kernel module
 10 that uses inotify.
11
12 For more information, see <file:Documentation/filesystems/inotify.txt>
13
14 If unsure, say N.
15
16config INOTIFY_USER 1config INOTIFY_USER
17 bool "Inotify support for userspace" 2 bool "Inotify support for userspace"
18 select ANON_INODES 3 select ANON_INODES
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index 94382817136..a380dabe09d 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1 @@
1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
deleted file mode 100644
index 27b75ebc746..00000000000
--- a/fs/notify/inotify/inotify.c
+++ /dev/null
@@ -1,873 +0,0 @@
1/*
2 * fs/inotify.c - inode-based file event notifications
3 *
4 * Authors:
5 * John McCutchan <ttb@tentacle.dhs.org>
6 * Robert Love <rml@novell.com>
7 *
8 * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
9 *
10 * Copyright (C) 2005 John McCutchan
11 * Copyright 2006 Hewlett-Packard Development Company, L.P.
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms of the GNU General Public License as published by the
15 * Free Software Foundation; either version 2, or (at your option) any
16 * later version.
17 *
18 * This program is distributed in the hope that it will be useful, but
19 * WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/spinlock.h>
27#include <linux/idr.h>
28#include <linux/slab.h>
29#include <linux/fs.h>
30#include <linux/sched.h>
31#include <linux/init.h>
32#include <linux/list.h>
33#include <linux/writeback.h>
34#include <linux/inotify.h>
35#include <linux/fsnotify_backend.h>
36
37static atomic_t inotify_cookie;
38
39/*
40 * Lock ordering:
41 *
42 * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
43 * iprune_mutex (synchronize shrink_icache_memory())
44 * inode_lock (protects the super_block->s_inodes list)
45 * inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
46 * inotify_handle->mutex (protects inotify_handle and watches->h_list)
47 *
 48 * The inode->inotify_mutex and inotify_handle->mutex are held during execution
49 * of a caller's event handler. Thus, the caller must not hold any locks
50 * taken in their event handler while calling any of the published inotify
51 * interfaces.
52 */
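The comment above is a classic lock-ordering contract: deadlock is impossible only if every code path that needs several of these locks acquires them in the one documented order. An illustrative userspace analogue (hypothetical locks, not the kernel's):

#include <pthread.h>

static pthread_mutex_t inode_mutex  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t handle_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Correct: both paths take inode_mutex before handle_mutex. */
static void add_watch_path(void)
{
	pthread_mutex_lock(&inode_mutex);
	pthread_mutex_lock(&handle_mutex);
	/* ... attach watch to both lists ... */
	pthread_mutex_unlock(&handle_mutex);
	pthread_mutex_unlock(&inode_mutex);
}

static void remove_watch_path(void)
{
	pthread_mutex_lock(&inode_mutex);	/* same order, never reversed */
	pthread_mutex_lock(&handle_mutex);
	/* ... detach watch from both lists ... */
	pthread_mutex_unlock(&handle_mutex);
	pthread_mutex_unlock(&inode_mutex);
}

If one path ever took handle_mutex first, two threads could each hold one lock and wait forever on the other; a fixed global order rules that interleaving out.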
53
54/*
55 * Lifetimes of the three main data structures--inotify_handle, inode, and
56 * inotify_watch--are managed by reference count.
57 *
58 * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
59 * Additional references can bump the count via get_inotify_handle() and drop
60 * the count via put_inotify_handle().
61 *
62 * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
63 * to remove_watch_no_event(). Additional references can bump the count via
64 * get_inotify_watch() and drop the count via put_inotify_watch(). The caller
 65 * is responsible for the final put after receiving IN_IGNORED, or when using
66 * IN_ONESHOT after receiving the first event. Inotify does the final put if
67 * inotify_destroy() is called.
68 *
69 * inode: Pinned so long as the inode is associated with a watch, from
70 * inotify_add_watch() to the final put_inotify_watch().
71 */
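All three lifetimes follow the same free-on-last-put discipline; a compact sketch with C11 atomics, using a hypothetical object in place of the inotify structures:

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int count;
	/* ... payload ... */
};

static struct obj *obj_new(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (o)
		atomic_store(&o->count, 1);	/* creator holds the first reference */
	return o;
}

static void obj_get(struct obj *o)
{
	atomic_fetch_add(&o->count, 1);
}

static void obj_put(struct obj *o)
{
	/* the last put frees, mirroring put_inotify_watch()/put_inotify_handle() */
	if (atomic_fetch_sub(&o->count, 1) == 1)
		free(o);
}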
72
73/*
74 * struct inotify_handle - represents an inotify instance
75 *
76 * This structure is protected by the mutex 'mutex'.
77 */
78struct inotify_handle {
79 struct idr idr; /* idr mapping wd -> watch */
80 struct mutex mutex; /* protects this bad boy */
81 struct list_head watches; /* list of watches */
82 atomic_t count; /* reference count */
83 u32 last_wd; /* the last wd allocated */
84 const struct inotify_operations *in_ops; /* inotify caller operations */
85};
86
87static inline void get_inotify_handle(struct inotify_handle *ih)
88{
89 atomic_inc(&ih->count);
90}
91
92static inline void put_inotify_handle(struct inotify_handle *ih)
93{
94 if (atomic_dec_and_test(&ih->count)) {
95 idr_destroy(&ih->idr);
96 kfree(ih);
97 }
98}
99
100/**
101 * get_inotify_watch - grab a reference to an inotify_watch
102 * @watch: watch to grab
103 */
104void get_inotify_watch(struct inotify_watch *watch)
105{
106 atomic_inc(&watch->count);
107}
108EXPORT_SYMBOL_GPL(get_inotify_watch);
109
110int pin_inotify_watch(struct inotify_watch *watch)
111{
112 struct super_block *sb = watch->inode->i_sb;
113 if (atomic_inc_not_zero(&sb->s_active)) {
114 atomic_inc(&watch->count);
115 return 1;
116 }
117 return 0;
118}
119
120/**
121 * put_inotify_watch - decrements the ref count on a given watch. cleans up
122 * watch references if the count reaches zero. inotify_watch is freed by
123 * inotify callers via the destroy_watch() op.
124 * @watch: watch to release
125 */
126void put_inotify_watch(struct inotify_watch *watch)
127{
128 if (atomic_dec_and_test(&watch->count)) {
129 struct inotify_handle *ih = watch->ih;
130
131 iput(watch->inode);
132 ih->in_ops->destroy_watch(watch);
133 put_inotify_handle(ih);
134 }
135}
136EXPORT_SYMBOL_GPL(put_inotify_watch);
137
138void unpin_inotify_watch(struct inotify_watch *watch)
139{
140 struct super_block *sb = watch->inode->i_sb;
141 put_inotify_watch(watch);
142 deactivate_super(sb);
143}
144
145/*
146 * inotify_handle_get_wd - returns the next WD for use by the given handle
147 *
148 * Callers must hold ih->mutex. This function can sleep.
149 */
150static int inotify_handle_get_wd(struct inotify_handle *ih,
151 struct inotify_watch *watch)
152{
153 int ret;
154
155 do {
156 if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
157 return -ENOSPC;
158 ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
159 } while (ret == -EAGAIN);
160
161 if (likely(!ret))
162 ih->last_wd = watch->wd;
163
164 return ret;
165}
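The idr two-step above is worth spelling out: idr_pre_get() preallocates while sleeping is still allowed, idr_get_new_above() then inserts atomically and returns -EAGAIN if a racing caller consumed the preallocation, so the pair is retried; last_wd makes descriptors count upward rather than recycle immediately. A simplified sketch of the monotonic-wd part, assuming a small fixed-size table instead of an idr:

#include <errno.h>
#include <pthread.h>

#define WD_MAX 1024

struct wd_map {
	pthread_mutex_t lock;
	void *slot[WD_MAX];
	int last_wd;
};

/* hand out the lowest free descriptor above last_wd, like the idr loop */
static int wd_alloc(struct wd_map *m, void *item)
{
	int wd, ret = -ENOSPC;

	pthread_mutex_lock(&m->lock);
	for (wd = m->last_wd + 1; wd < WD_MAX; wd++) {
		if (!m->slot[wd]) {
			m->slot[wd] = item;
			m->last_wd = wd;	/* next caller starts above us */
			ret = wd;
			break;
		}
	}
	pthread_mutex_unlock(&m->lock);
	return ret;
}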
166
167/*
168 * inotify_inode_watched - returns nonzero if there are watches on this inode
 169 * and zero otherwise. We call this lockless; we do not care if we race.
170 */
171static inline int inotify_inode_watched(struct inode *inode)
172{
173 return !list_empty(&inode->inotify_watches);
174}
175
176/*
 177 * Get the child dentry flag into sync with the parent inode.
 178 * The flag should always be clear for negative dentries.
179 */
180static void set_dentry_child_flags(struct inode *inode, int watched)
181{
182 struct dentry *alias;
183
184 spin_lock(&dcache_lock);
185 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
186 struct dentry *child;
187
188 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
189 if (!child->d_inode)
190 continue;
191
192 spin_lock(&child->d_lock);
193 if (watched)
194 child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
195 else
196 child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
197 spin_unlock(&child->d_lock);
198 }
199 }
200 spin_unlock(&dcache_lock);
201}
202
203/*
204 * inotify_find_handle - find the watch associated with the given inode and
205 * handle
206 *
207 * Callers must hold inode->inotify_mutex.
208 */
209static struct inotify_watch *inode_find_handle(struct inode *inode,
210 struct inotify_handle *ih)
211{
212 struct inotify_watch *watch;
213
214 list_for_each_entry(watch, &inode->inotify_watches, i_list) {
215 if (watch->ih == ih)
216 return watch;
217 }
218
219 return NULL;
220}
221
222/*
223 * remove_watch_no_event - remove watch without the IN_IGNORED event.
224 *
225 * Callers must hold both inode->inotify_mutex and ih->mutex.
226 */
227static void remove_watch_no_event(struct inotify_watch *watch,
228 struct inotify_handle *ih)
229{
230 list_del(&watch->i_list);
231 list_del(&watch->h_list);
232
233 if (!inotify_inode_watched(watch->inode))
234 set_dentry_child_flags(watch->inode, 0);
235
236 idr_remove(&ih->idr, watch->wd);
237}
238
239/**
240 * inotify_remove_watch_locked - Remove a watch from both the handle and the
241 * inode. Sends the IN_IGNORED event signifying that the inode is no longer
242 * watched. May be invoked from a caller's event handler.
243 * @ih: inotify handle associated with watch
244 * @watch: watch to remove
245 *
246 * Callers must hold both inode->inotify_mutex and ih->mutex.
247 */
248void inotify_remove_watch_locked(struct inotify_handle *ih,
249 struct inotify_watch *watch)
250{
251 remove_watch_no_event(watch, ih);
252 ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
253}
254EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
255
256/* Kernel API for producing events */
257
258/*
259 * inotify_d_instantiate - instantiate dcache entry for inode
260 */
261void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
262{
263 struct dentry *parent;
264
265 if (!inode)
266 return;
267
268 spin_lock(&entry->d_lock);
269 parent = entry->d_parent;
270 if (parent->d_inode && inotify_inode_watched(parent->d_inode))
271 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
272 spin_unlock(&entry->d_lock);
273}
274
275/*
276 * inotify_d_move - dcache entry has been moved
277 */
278void inotify_d_move(struct dentry *entry)
279{
280 struct dentry *parent;
281
282 parent = entry->d_parent;
283 if (inotify_inode_watched(parent->d_inode))
284 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
285 else
286 entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
287}
288
289/**
290 * inotify_inode_queue_event - queue an event to all watches on this inode
291 * @inode: inode event is originating from
292 * @mask: event mask describing this event
293 * @cookie: cookie for synchronization, or zero
294 * @name: filename, if any
295 * @n_inode: inode associated with name
296 */
297void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
298 const char *name, struct inode *n_inode)
299{
300 struct inotify_watch *watch, *next;
301
302 if (!inotify_inode_watched(inode))
303 return;
304
305 mutex_lock(&inode->inotify_mutex);
306 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
307 u32 watch_mask = watch->mask;
308 if (watch_mask & mask) {
 309 struct inotify_handle *ih = watch->ih;
310 mutex_lock(&ih->mutex);
311 if (watch_mask & IN_ONESHOT)
312 remove_watch_no_event(watch, ih);
313 ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
314 name, n_inode);
315 mutex_unlock(&ih->mutex);
316 }
317 }
318 mutex_unlock(&inode->inotify_mutex);
319}
320EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
321
322/**
323 * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
324 * @dentry: the dentry in question, we queue against this dentry's parent
325 * @mask: event mask describing this event
326 * @cookie: cookie for synchronization, or zero
327 * @name: filename, if any
328 */
329void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
330 u32 cookie, const char *name)
331{
332 struct dentry *parent;
333 struct inode *inode;
334
335 if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
336 return;
337
338 spin_lock(&dentry->d_lock);
339 parent = dentry->d_parent;
340 inode = parent->d_inode;
341
342 if (inotify_inode_watched(inode)) {
343 dget(parent);
344 spin_unlock(&dentry->d_lock);
345 inotify_inode_queue_event(inode, mask, cookie, name,
346 dentry->d_inode);
347 dput(parent);
348 } else
349 spin_unlock(&dentry->d_lock);
350}
351EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
352
353/**
354 * inotify_get_cookie - return a unique cookie for use in synchronizing events.
355 */
356u32 inotify_get_cookie(void)
357{
358 return atomic_inc_return(&inotify_cookie);
359}
360EXPORT_SYMBOL_GPL(inotify_get_cookie);
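The cookie's visible purpose is pairing the two halves of a rename: userspace receives IN_MOVED_FROM and IN_MOVED_TO events carrying the same cookie value. A small runnable reader that matches them (watching the current directory, error handling kept minimal):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096] __attribute__((aligned(8)));
	uint32_t pending = 0;
	int fd = inotify_init();

	if (fd < 0 || inotify_add_watch(fd, ".", IN_MOVE) < 0)
		return 1;

	for (;;) {
		ssize_t len = read(fd, buf, sizeof(buf));
		char *p = buf;

		if (len <= 0)
			break;
		while (p < buf + len) {
			struct inotify_event *ev = (struct inotify_event *)p;

			if (ev->mask & IN_MOVED_FROM)
				pending = ev->cookie;	/* first half of a rename */
			if ((ev->mask & IN_MOVED_TO) && ev->cookie == pending)
				printf("rename landed as: %s\n",
				       ev->len ? ev->name : "(watched dir)");
			p += sizeof(*ev) + ev->len;	/* events are variable length */
		}
	}
	return 0;
}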
361
362/**
363 * inotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
364 * @list: list of inodes being unmounted (sb->s_inodes)
365 *
366 * Called with inode_lock held, protecting the unmounting super block's list
367 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
368 * We temporarily drop inode_lock, however, and CAN block.
369 */
370void inotify_unmount_inodes(struct list_head *list)
371{
372 struct inode *inode, *next_i, *need_iput = NULL;
373
374 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
375 struct inotify_watch *watch, *next_w;
376 struct inode *need_iput_tmp;
377 struct list_head *watches;
378
379 /*
380 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
381 * I_WILL_FREE, or I_NEW which is fine because by that point
382 * the inode cannot have any associated watches.
383 */
384 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
385 continue;
386
387 /*
388 * If i_count is zero, the inode cannot have any watches and
389 * doing an __iget/iput with MS_ACTIVE clear would actually
390 * evict all inodes with zero i_count from icache which is
391 * unnecessarily violent and may in fact be illegal to do.
392 */
393 if (!atomic_read(&inode->i_count))
394 continue;
395
396 need_iput_tmp = need_iput;
397 need_iput = NULL;
398 /* In case inotify_remove_watch_locked() drops a reference. */
399 if (inode != need_iput_tmp)
400 __iget(inode);
401 else
402 need_iput_tmp = NULL;
403 /* In case the dropping of a reference would nuke next_i. */
404 if ((&next_i->i_sb_list != list) &&
405 atomic_read(&next_i->i_count) &&
406 !(next_i->i_state & (I_CLEAR | I_FREEING |
407 I_WILL_FREE))) {
408 __iget(next_i);
409 need_iput = next_i;
410 }
411
412 /*
413 * We can safely drop inode_lock here because we hold
414 * references on both inode and next_i. Also no new inodes
415 * will be added since the umount has begun. Finally,
416 * iprune_mutex keeps shrink_icache_memory() away.
417 */
418 spin_unlock(&inode_lock);
419
420 if (need_iput_tmp)
421 iput(need_iput_tmp);
422
423 /* for each watch, send IN_UNMOUNT and then remove it */
424 mutex_lock(&inode->inotify_mutex);
425 watches = &inode->inotify_watches;
426 list_for_each_entry_safe(watch, next_w, watches, i_list) {
 427 struct inotify_handle *ih = watch->ih;
428 get_inotify_watch(watch);
429 mutex_lock(&ih->mutex);
430 ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
431 NULL, NULL);
432 inotify_remove_watch_locked(ih, watch);
433 mutex_unlock(&ih->mutex);
434 put_inotify_watch(watch);
435 }
436 mutex_unlock(&inode->inotify_mutex);
437 iput(inode);
438
439 spin_lock(&inode_lock);
440 }
441}
442EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
443
444/**
445 * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
446 * @inode: inode that is about to be removed
447 */
448void inotify_inode_is_dead(struct inode *inode)
449{
450 struct inotify_watch *watch, *next;
451
452 mutex_lock(&inode->inotify_mutex);
453 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
454 struct inotify_handle *ih = watch->ih;
455 mutex_lock(&ih->mutex);
456 inotify_remove_watch_locked(ih, watch);
457 mutex_unlock(&ih->mutex);
458 }
459 mutex_unlock(&inode->inotify_mutex);
460}
461EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
462
463/* Kernel Consumer API */
464
465/**
466 * inotify_init - allocate and initialize an inotify instance
467 * @ops: caller's inotify operations
468 */
469struct inotify_handle *inotify_init(const struct inotify_operations *ops)
470{
471 struct inotify_handle *ih;
472
473 ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
474 if (unlikely(!ih))
475 return ERR_PTR(-ENOMEM);
476
477 idr_init(&ih->idr);
478 INIT_LIST_HEAD(&ih->watches);
479 mutex_init(&ih->mutex);
480 ih->last_wd = 0;
481 ih->in_ops = ops;
482 atomic_set(&ih->count, 0);
483 get_inotify_handle(ih);
484
485 return ih;
486}
487EXPORT_SYMBOL_GPL(inotify_init);
488
489/**
490 * inotify_init_watch - initialize an inotify watch
491 * @watch: watch to initialize
492 */
493void inotify_init_watch(struct inotify_watch *watch)
494{
495 INIT_LIST_HEAD(&watch->h_list);
496 INIT_LIST_HEAD(&watch->i_list);
497 atomic_set(&watch->count, 0);
498 get_inotify_watch(watch); /* initial get */
499}
500EXPORT_SYMBOL_GPL(inotify_init_watch);
501
502/*
503 * Watch removals suck violently. To kick the watch out we need (in this
504 * order) inode->inotify_mutex and ih->mutex. That's fine if we have
505 * a hold on inode; however, for all other cases we need to make damn sure
506 * we don't race with umount. We can *NOT* just grab a reference to a
 507 * watch - inotify_unmount_inodes() will happily sail past it and we'll end
 508 * up with a reference to an inode potentially outliving its superblock. Ideally
 509 * we just want to grab an active reference to the superblock if we can; that
 510 * will make sure we won't go into inotify_unmount_inodes() until we are
511 * done. Cleanup is just deactivate_super(). However, that leaves a messy
512 * case - what if we *are* racing with umount() and active references to
513 * superblock can't be acquired anymore? We can bump ->s_count, grab
514 * ->s_umount, which will wait until the superblock is shut down and the
515 * watch in question is pining for fjords.
516 *
517 * And yes, this is far beyond mere "not very pretty"; so's the entire
518 * concept of inotify to start with.
519 */
520
521/**
522 * pin_to_kill - pin the watch down for removal
523 * @ih: inotify handle
524 * @watch: watch to kill
525 *
526 * Called with ih->mutex held, drops it. Possible return values:
527 * 0 - nothing to do, it has died
528 * 1 - remove it, drop the reference and deactivate_super()
529 */
530static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
531{
532 struct super_block *sb = watch->inode->i_sb;
533
534 if (atomic_inc_not_zero(&sb->s_active)) {
535 get_inotify_watch(watch);
536 mutex_unlock(&ih->mutex);
537 return 1; /* the best outcome */
538 }
539 spin_lock(&sb_lock);
540 sb->s_count++;
541 spin_unlock(&sb_lock);
542 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
543 down_read(&sb->s_umount);
544 /* fs is already shut down; the watch is dead */
545 drop_super(sb);
546 return 0;
547}
548
549static void unpin_and_kill(struct inotify_watch *watch)
550{
551 struct super_block *sb = watch->inode->i_sb;
552 put_inotify_watch(watch);
553 deactivate_super(sb);
554}
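pin_to_kill() hinges on atomic_inc_not_zero(): take a reference only if the count has not already hit zero, so a dying superblock is never resurrected. The equivalent compare-and-swap loop in C11 atomics, as a sketch:

#include <stdatomic.h>
#include <stdbool.h>

/*
 * Take a reference only if the object is still alive (count > 0);
 * mirrors the atomic_inc_not_zero() use on sb->s_active above.
 */
static bool get_unless_zero(atomic_int *count)
{
	int old = atomic_load(count);

	while (old != 0) {
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return true;	/* we now hold a reference */
		/* old was reloaded by the failed CAS; retry */
	}
	return false;			/* already dead; caller takes the slow path */
}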
555
556/**
557 * inotify_destroy - clean up and destroy an inotify instance
558 * @ih: inotify handle
559 */
560void inotify_destroy(struct inotify_handle *ih)
561{
562 /*
563 * Destroy all of the watches for this handle. Unfortunately, not very
564 * pretty. We cannot do a simple iteration over the list, because we
565 * do not know the inode until we iterate to the watch. But we need to
566 * hold inode->inotify_mutex before ih->mutex. The following works.
567 *
568 * AV: it had to become even uglier to start working ;-/
569 */
570 while (1) {
571 struct inotify_watch *watch;
572 struct list_head *watches;
573 struct super_block *sb;
574 struct inode *inode;
575
576 mutex_lock(&ih->mutex);
577 watches = &ih->watches;
578 if (list_empty(watches)) {
579 mutex_unlock(&ih->mutex);
580 break;
581 }
582 watch = list_first_entry(watches, struct inotify_watch, h_list);
583 sb = watch->inode->i_sb;
584 if (!pin_to_kill(ih, watch))
585 continue;
586
587 inode = watch->inode;
588 mutex_lock(&inode->inotify_mutex);
589 mutex_lock(&ih->mutex);
590
591 /* make sure we didn't race with another list removal */
592 if (likely(idr_find(&ih->idr, watch->wd))) {
593 remove_watch_no_event(watch, ih);
594 put_inotify_watch(watch);
595 }
596
597 mutex_unlock(&ih->mutex);
598 mutex_unlock(&inode->inotify_mutex);
599 unpin_and_kill(watch);
600 }
601
602 /* free this handle: the put matching the get in inotify_init() */
603 put_inotify_handle(ih);
604}
605EXPORT_SYMBOL_GPL(inotify_destroy);
606
607/**
608 * inotify_find_watch - find an existing watch for an (ih,inode) pair
609 * @ih: inotify handle
610 * @inode: inode to watch
611 * @watchp: pointer to existing inotify_watch
612 *
613 * Caller must pin given inode (via nameidata).
614 */
615s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
616 struct inotify_watch **watchp)
617{
618 struct inotify_watch *old;
619 int ret = -ENOENT;
620
621 mutex_lock(&inode->inotify_mutex);
622 mutex_lock(&ih->mutex);
623
624 old = inode_find_handle(inode, ih);
625 if (unlikely(old)) {
626 get_inotify_watch(old); /* caller must put watch */
627 *watchp = old;
628 ret = old->wd;
629 }
630
631 mutex_unlock(&ih->mutex);
632 mutex_unlock(&inode->inotify_mutex);
633
634 return ret;
635}
636EXPORT_SYMBOL_GPL(inotify_find_watch);
637
638/**
639 * inotify_find_update_watch - find and update the mask of an existing watch
640 * @ih: inotify handle
641 * @inode: inode's watch to update
642 * @mask: mask of events to watch
643 *
644 * Caller must pin given inode (via nameidata).
645 */
646s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
647 u32 mask)
648{
649 struct inotify_watch *old;
650 int mask_add = 0;
651 int ret;
652
653 if (mask & IN_MASK_ADD)
654 mask_add = 1;
655
656 /* don't allow invalid bits: we don't want flags set */
657 mask &= IN_ALL_EVENTS | IN_ONESHOT;
658 if (unlikely(!mask))
659 return -EINVAL;
660
661 mutex_lock(&inode->inotify_mutex);
662 mutex_lock(&ih->mutex);
663
664 /*
665 * Handle the case of re-adding a watch on an (inode,ih) pair that we
666 * are already watching. We just update the mask and return its wd.
667 */
668 old = inode_find_handle(inode, ih);
669 if (unlikely(!old)) {
670 ret = -ENOENT;
671 goto out;
672 }
673
674 if (mask_add)
675 old->mask |= mask;
676 else
677 old->mask = mask;
678 ret = old->wd;
679out:
680 mutex_unlock(&ih->mutex);
681 mutex_unlock(&inode->inotify_mutex);
682 return ret;
683}
684EXPORT_SYMBOL_GPL(inotify_find_update_watch);
685
686/**
687 * inotify_add_watch - add a watch to an inotify instance
688 * @ih: inotify handle
689 * @watch: caller allocated watch structure
690 * @inode: inode to watch
691 * @mask: mask of events to watch
692 *
693 * Caller must pin given inode (via nameidata).
694 * Caller must ensure it only calls inotify_add_watch() once per watch.
695 * Calls inotify_handle_get_wd() so may sleep.
696 */
697s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
698 struct inode *inode, u32 mask)
699{
700 int ret = 0;
701 int newly_watched;
702
703 /* don't allow invalid bits: we don't want flags set */
704 mask &= IN_ALL_EVENTS | IN_ONESHOT;
705 if (unlikely(!mask))
706 return -EINVAL;
707 watch->mask = mask;
708
709 mutex_lock(&inode->inotify_mutex);
710 mutex_lock(&ih->mutex);
711
712 /* Initialize a new watch */
713 ret = inotify_handle_get_wd(ih, watch);
714 if (unlikely(ret))
715 goto out;
716 ret = watch->wd;
717
718 /* save a reference to handle and bump the count to make it official */
719 get_inotify_handle(ih);
720 watch->ih = ih;
721
722 /*
723 * Save a reference to the inode and bump the ref count to make it
724 * official. We hold a reference to nameidata, which makes this safe.
725 */
726 watch->inode = igrab(inode);
727
728 /* Add the watch to the handle's and the inode's list */
729 newly_watched = !inotify_inode_watched(inode);
730 list_add(&watch->h_list, &ih->watches);
731 list_add(&watch->i_list, &inode->inotify_watches);
732 /*
733 * Set child flags _after_ adding the watch, so there is no race
 734 * window where newly instantiated children could miss their parent's
735 * watched flag.
736 */
737 if (newly_watched)
738 set_dentry_child_flags(inode, 1);
739
740out:
741 mutex_unlock(&ih->mutex);
742 mutex_unlock(&inode->inotify_mutex);
743 return ret;
744}
745EXPORT_SYMBOL_GPL(inotify_add_watch);
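Putting the consumer API together, a hedged sketch of a minimal in-kernel user of this (since removed) interface; the inotify_operations callback signatures are inferred from the handle_event()/destroy_watch() call sites above, and error unwinding is elided:

#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/inotify.h>
#include <linux/slab.h>

static void my_handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
			    u32 cookie, const char *name,
			    struct inode *n_inode)
{
	pr_info("wd=%u mask=0x%x name=%s\n", wd, mask, name ? name : "");
}

static void my_destroy_watch(struct inotify_watch *watch)
{
	kfree(watch);		/* we allocated it below, so we free it */
}

static const struct inotify_operations my_ops = {
	.handle_event  = my_handle_event,
	.destroy_watch = my_destroy_watch,
};

static int watch_this_inode(struct inode *inode)
{
	struct inotify_handle *ih;
	struct inotify_watch *watch;
	s32 wd;

	ih = inotify_init(&my_ops);
	if (IS_ERR(ih))
		return PTR_ERR(ih);

	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
	if (!watch)
		return -ENOMEM;	/* real code would also drop ih here */
	inotify_init_watch(watch);

	wd = inotify_add_watch(ih, watch, inode, IN_MODIFY | IN_ATTRIB);
	return wd < 0 ? wd : 0;
}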
746
747/**
748 * inotify_clone_watch - put the watch next to existing one
749 * @old: already installed watch
750 * @new: new watch
751 *
 752 * Caller must hold the inotify_mutex of the inode we are dealing with;
753 * it is expected to remove the old watch before unlocking the inode.
754 */
755s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
756{
757 struct inotify_handle *ih = old->ih;
758 int ret = 0;
759
760 new->mask = old->mask;
761 new->ih = ih;
762
763 mutex_lock(&ih->mutex);
764
765 /* Initialize a new watch */
766 ret = inotify_handle_get_wd(ih, new);
767 if (unlikely(ret))
768 goto out;
769 ret = new->wd;
770
771 get_inotify_handle(ih);
772
773 new->inode = igrab(old->inode);
774
775 list_add(&new->h_list, &ih->watches);
776 list_add(&new->i_list, &old->inode->inotify_watches);
777out:
778 mutex_unlock(&ih->mutex);
779 return ret;
780}
781
782void inotify_evict_watch(struct inotify_watch *watch)
783{
784 get_inotify_watch(watch);
785 mutex_lock(&watch->ih->mutex);
786 inotify_remove_watch_locked(watch->ih, watch);
787 mutex_unlock(&watch->ih->mutex);
788}
789
790/**
791 * inotify_rm_wd - remove a watch from an inotify instance
792 * @ih: inotify handle
793 * @wd: watch descriptor to remove
794 *
795 * Can sleep.
796 */
797int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
798{
799 struct inotify_watch *watch;
800 struct super_block *sb;
801 struct inode *inode;
802
803 mutex_lock(&ih->mutex);
804 watch = idr_find(&ih->idr, wd);
805 if (unlikely(!watch)) {
806 mutex_unlock(&ih->mutex);
807 return -EINVAL;
808 }
809 sb = watch->inode->i_sb;
810 if (!pin_to_kill(ih, watch))
811 return 0;
812
813 inode = watch->inode;
814
815 mutex_lock(&inode->inotify_mutex);
816 mutex_lock(&ih->mutex);
817
818 /* make sure that we did not race */
819 if (likely(idr_find(&ih->idr, wd) == watch))
820 inotify_remove_watch_locked(ih, watch);
821
822 mutex_unlock(&ih->mutex);
823 mutex_unlock(&inode->inotify_mutex);
824 unpin_and_kill(watch);
825
826 return 0;
827}
828EXPORT_SYMBOL_GPL(inotify_rm_wd);
829
830/**
831 * inotify_rm_watch - remove a watch from an inotify instance
832 * @ih: inotify handle
833 * @watch: watch to remove
834 *
835 * Can sleep.
836 */
837int inotify_rm_watch(struct inotify_handle *ih,
838 struct inotify_watch *watch)
839{
840 return inotify_rm_wd(ih, watch->wd);
841}
842EXPORT_SYMBOL_GPL(inotify_rm_watch);
843
844/*
845 * inotify_setup - core initialization function
846 */
847static int __init inotify_setup(void)
848{
849 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
850 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
851 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
852 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
853 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
854 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
855 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
856 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
857 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
858 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
859 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
860 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
861 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
862
863 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
864 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
865 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
866 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
867
868 atomic_set(&inotify_cookie, 0);
869
870 return 0;
871}
872
873module_init(inotify_setup);
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index f234f3a4c8c..b6642e4de4b 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -9,13 +9,12 @@ struct inotify_event_private_data {
9 int wd; 9 int wd;
10}; 10};
11 11
12struct inotify_inode_mark_entry { 12struct inotify_inode_mark {
13 /* fsnotify_mark_entry MUST be the first thing */ 13 struct fsnotify_mark fsn_mark;
14 struct fsnotify_mark_entry fsn_entry;
15 int wd; 14 int wd;
16}; 15};
17 16
18extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 17extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
19 struct fsnotify_group *group); 18 struct fsnotify_group *group);
20extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); 19extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
21 20
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e27960cd76a..a91b69a6a29 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -22,6 +22,7 @@
22 * General Public License for more details. 22 * General Public License for more details.
23 */ 23 */
24 24
25#include <linux/dcache.h> /* d_unlinked */
25#include <linux/fs.h> /* struct inode */ 26#include <linux/fs.h> /* struct inode */
26#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
27#include <linux/inotify.h> 28#include <linux/inotify.h>
@@ -32,26 +33,84 @@
32 33
33#include "inotify.h" 34#include "inotify.h"
34 35
35static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) 36/*
37 * Check if 2 events contain the same information. We do not compare private data
 38 * but at this moment that isn't a problem for any known fsnotify listeners.
39 */
40static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
41{
42 if ((old->mask == new->mask) &&
43 (old->to_tell == new->to_tell) &&
44 (old->data_type == new->data_type) &&
45 (old->name_len == new->name_len)) {
46 switch (old->data_type) {
47 case (FSNOTIFY_EVENT_INODE):
48 /* remember, after old was put on the wait_q we aren't
 49 * allowed to look at the inode any more; the only thing
 50 * left to check is whether the file_name is the same */
51 if (!old->name_len ||
52 !strcmp(old->file_name, new->file_name))
53 return true;
54 break;
55 case (FSNOTIFY_EVENT_PATH):
56 if ((old->path.mnt == new->path.mnt) &&
57 (old->path.dentry == new->path.dentry))
58 return true;
59 break;
60 case (FSNOTIFY_EVENT_NONE):
61 if (old->mask & FS_Q_OVERFLOW)
62 return true;
63 else if (old->mask & FS_IN_IGNORED)
64 return false;
65 return true;
66 };
67 }
68 return false;
69}
70
71static struct fsnotify_event *inotify_merge(struct list_head *list,
72 struct fsnotify_event *event)
36{ 73{
37 struct fsnotify_mark_entry *entry; 74 struct fsnotify_event_holder *last_holder;
38 struct inotify_inode_mark_entry *ientry; 75 struct fsnotify_event *last_event;
76
77 /* and the list better be locked by something too */
78 spin_lock(&event->lock);
79
80 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
81 last_event = last_holder->event;
82 if (event_compare(last_event, event))
83 fsnotify_get_event(last_event);
84 else
85 last_event = NULL;
86
87 spin_unlock(&event->lock);
88
89 return last_event;
90}
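inotify coalesces only against the queue tail: if the newest queued event carries the same information, the incoming one is dropped rather than queued twice. A generic sketch of the same tail-merge shape, using a hypothetical event type:

#include <stdbool.h>
#include <string.h>

struct event {
	unsigned mask;
	int wd;
	char name[64];
	struct event *next;
};

struct queue { struct event *head, *tail; };

static bool same_event(const struct event *a, const struct event *b)
{
	return a->mask == b->mask && a->wd == b->wd &&
	       !strcmp(a->name, b->name);
}

/* returns true if the event was merged into (i.e. dropped against) the tail */
static bool enqueue_or_merge(struct queue *q, struct event *ev)
{
	if (q->tail && same_event(q->tail, ev))
		return true;	/* identical to the newest queued event: drop it */

	ev->next = NULL;
	if (q->tail)
		q->tail->next = ev;
	else
		q->head = ev;
	q->tail = ev;
	return false;
}

Comparing only against the tail keeps the check O(1); bursts of identical events (the common case for, say, repeated writes) collapse to one queue entry.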
91
92static int inotify_handle_event(struct fsnotify_group *group,
93 struct fsnotify_mark *inode_mark,
94 struct fsnotify_mark *vfsmount_mark,
95 struct fsnotify_event *event)
96{
97 struct inotify_inode_mark *i_mark;
39 struct inode *to_tell; 98 struct inode *to_tell;
40 struct inotify_event_private_data *event_priv; 99 struct inotify_event_private_data *event_priv;
41 struct fsnotify_event_private_data *fsn_event_priv; 100 struct fsnotify_event_private_data *fsn_event_priv;
42 int wd, ret; 101 struct fsnotify_event *added_event;
102 int wd, ret = 0;
103
104 BUG_ON(vfsmount_mark);
105
106 pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
107 event, event->to_tell, event->mask);
43 108
44 to_tell = event->to_tell; 109 to_tell = event->to_tell;
45 110
46 spin_lock(&to_tell->i_lock); 111 i_mark = container_of(inode_mark, struct inotify_inode_mark,
47 entry = fsnotify_find_mark_entry(group, to_tell); 112 fsn_mark);
48 spin_unlock(&to_tell->i_lock); 113 wd = i_mark->wd;
 49 /* race with watch removal? We already passed should_send */
50 if (unlikely(!entry))
51 return 0;
52 ientry = container_of(entry, struct inotify_inode_mark_entry,
53 fsn_entry);
54 wd = ientry->wd;
55 114
56 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 115 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
57 if (unlikely(!event_priv)) 116 if (unlikely(!event_priv))
@@ -62,48 +121,40 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
62 fsn_event_priv->group = group; 121 fsn_event_priv->group = group;
63 event_priv->wd = wd; 122 event_priv->wd = wd;
64 123
65 ret = fsnotify_add_notify_event(group, event, fsn_event_priv); 124 added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
66 if (ret) { 125 if (added_event) {
67 inotify_free_event_priv(fsn_event_priv); 126 inotify_free_event_priv(fsn_event_priv);
68 /* EEXIST says we tail matched, EOVERFLOW isn't something 127 if (!IS_ERR(added_event))
69 * to report up the stack. */ 128 fsnotify_put_event(added_event);
70 if ((ret == -EEXIST) || 129 else
71 (ret == -EOVERFLOW)) 130 ret = PTR_ERR(added_event);
72 ret = 0;
73 } 131 }
74 132
75 /* 133 if (inode_mark->mask & IN_ONESHOT)
76 * If we hold the entry until after the event is on the queue 134 fsnotify_destroy_mark(inode_mark);
 77 * IN_IGNORED won't be able to overtake this event in the queue
78 */
79 fsnotify_put_mark(entry);
80 135
81 return ret; 136 return ret;
82} 137}
83 138
84static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) 139static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
85{ 140{
86 inotify_ignored_and_remove_idr(entry, group); 141 inotify_ignored_and_remove_idr(fsn_mark, group);
87} 142}
88 143
89static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) 144static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
145 struct fsnotify_mark *inode_mark,
146 struct fsnotify_mark *vfsmount_mark,
147 __u32 mask, void *data, int data_type)
90{ 148{
91 struct fsnotify_mark_entry *entry; 149 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
92 bool send; 150 (data_type == FSNOTIFY_EVENT_PATH)) {
93 151 struct path *path = data;
94 spin_lock(&inode->i_lock);
95 entry = fsnotify_find_mark_entry(group, inode);
96 spin_unlock(&inode->i_lock);
97 if (!entry)
98 return false;
99 152
100 mask = (mask & ~FS_EVENT_ON_CHILD); 153 if (d_unlinked(path->dentry))
101 send = (entry->mask & mask); 154 return false;
102 155 }
103 /* find took a reference */
104 fsnotify_put_mark(entry);
105 156
106 return send; 157 return true;
107} 158}
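FS_EXCL_UNLINK surfaces to userspace as IN_EXCL_UNLINK: events for children that have already been unlinked from the watched directory are suppressed. A typical (assumed) use, silencing noise from deleted-but-still-open temp files; the flag needs a kernel with this support, otherwise inotify_add_watch() fails with EINVAL:

#include <stdio.h>
#include <sys/inotify.h>

int main(void)
{
	int fd = inotify_init();
	int wd;

	if (fd < 0)
		return 1;

	/* watch /tmp, but skip events on files already unlinked from it */
	wd = inotify_add_watch(fd, "/tmp",
			       IN_OPEN | IN_CLOSE | IN_EXCL_UNLINK);
	if (wd < 0) {
		perror("inotify_add_watch");	/* EINVAL on older kernels */
		return 1;
	}
	printf("watching /tmp with wd=%d\n", wd);
	return 0;
}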
108 159
109/* 160/*
@@ -115,18 +166,18 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
115 */ 166 */
116static int idr_callback(int id, void *p, void *data) 167static int idr_callback(int id, void *p, void *data)
117{ 168{
118 struct fsnotify_mark_entry *entry; 169 struct fsnotify_mark *fsn_mark;
119 struct inotify_inode_mark_entry *ientry; 170 struct inotify_inode_mark *i_mark;
120 static bool warned = false; 171 static bool warned = false;
121 172
122 if (warned) 173 if (warned)
123 return 0; 174 return 0;
124 175
125 warned = true; 176 warned = true;
126 entry = p; 177 fsn_mark = p;
127 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 178 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
128 179
129 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " 180 WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in "
130 "idr. Probably leaking memory\n", id, p, data); 181 "idr. Probably leaking memory\n", id, p, data);
131 182
132 /* 183 /*
@@ -135,9 +186,9 @@ static int idr_callback(int id, void *p, void *data)
135 * out why we got here and the panic is no worse than the original 186 * out why we got here and the panic is no worse than the original
136 * BUG() that was here. 187 * BUG() that was here.
137 */ 188 */
138 if (entry) 189 if (fsn_mark)
139 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", 190 printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
140 entry->group, entry->inode, ientry->wd); 191 fsn_mark->group, fsn_mark->i.inode, i_mark->wd);
141 return 0; 192 return 0;
142} 193}
143 194
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e46ca685b9b..444c305a468 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -46,17 +46,11 @@
46/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
47static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
48static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
49int inotify_max_user_watches __read_mostly; 49static int inotify_max_user_watches __read_mostly;
50 50
51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
52struct kmem_cache *event_priv_cachep __read_mostly; 52struct kmem_cache *event_priv_cachep __read_mostly;
53 53
54/*
55 * When inotify registers a new group it increments this and uses that
56 * value as an offset to set the fsnotify group "name" and priority.
57 */
58static atomic_t inotify_grp_num;
59
60#ifdef CONFIG_SYSCTL 54#ifdef CONFIG_SYSCTL
61 55
62#include <linux/sysctl.h> 56#include <linux/sysctl.h>
@@ -96,11 +90,14 @@ static inline __u32 inotify_arg_to_mask(u32 arg)
96{ 90{
97 __u32 mask; 91 __u32 mask;
98 92
 99 /* everything should accept its own ignored and care about children */ 93 /*
 100 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); 94 * everything should accept its own ignored, care about children,
 95 * and should receive events when the inode is unmounted
96 */
97 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT);
101 98
102 /* mask off the flags used to open the fd */ 99 /* mask off the flags used to open the fd */
103 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); 100 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
104 101
105 return mask; 102 return mask;
106} 103}
@@ -144,6 +141,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
144 141
145 event = fsnotify_peek_notify_event(group); 142 event = fsnotify_peek_notify_event(group);
146 143
144 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
145
147 if (event->name_len) 146 if (event->name_len)
148 event_size += roundup(event->name_len + 1, event_size); 147 event_size += roundup(event->name_len + 1, event_size);
149 148
@@ -173,6 +172,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
173 size_t event_size = sizeof(struct inotify_event); 172 size_t event_size = sizeof(struct inotify_event);
174 size_t name_len = 0; 173 size_t name_len = 0;
175 174
175 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
176
176 /* we get the inotify watch descriptor from the event private data */ 177 /* we get the inotify watch descriptor from the event private data */
177 spin_lock(&event->lock); 178 spin_lock(&event->lock);
178 fsn_priv = fsnotify_remove_priv_from_event(group, event); 179 fsn_priv = fsnotify_remove_priv_from_event(group, event);
@@ -245,6 +246,8 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
245 kevent = get_one_event(group, count); 246 kevent = get_one_event(group, count);
246 mutex_unlock(&group->notification_mutex); 247 mutex_unlock(&group->notification_mutex);
247 248
249 pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
250
248 if (kevent) { 251 if (kevent) {
249 ret = PTR_ERR(kevent); 252 ret = PTR_ERR(kevent);
250 if (IS_ERR(kevent)) 253 if (IS_ERR(kevent))
@@ -289,6 +292,8 @@ static int inotify_release(struct inode *ignored, struct file *file)
289 struct fsnotify_group *group = file->private_data; 292 struct fsnotify_group *group = file->private_data;
290 struct user_struct *user = group->inotify_data.user; 293 struct user_struct *user = group->inotify_data.user;
291 294
295 pr_debug("%s: group=%p\n", __func__, group);
296
292 fsnotify_clear_marks_by_group(group); 297 fsnotify_clear_marks_by_group(group);
293 298
294 /* free this group, matching get was inotify_init->fsnotify_obtain_group */ 299 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
@@ -312,6 +317,8 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
312 group = file->private_data; 317 group = file->private_data;
313 p = (void __user *) arg; 318 p = (void __user *) arg;
314 319
320 pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
321
315 switch (cmd) { 322 switch (cmd) {
316 case FIONREAD: 323 case FIONREAD:
317 mutex_lock(&group->notification_mutex); 324 mutex_lock(&group->notification_mutex);
@@ -337,6 +344,7 @@ static const struct file_operations inotify_fops = {
337 .release = inotify_release, 344 .release = inotify_release,
338 .unlocked_ioctl = inotify_ioctl, 345 .unlocked_ioctl = inotify_ioctl,
339 .compat_ioctl = inotify_ioctl, 346 .compat_ioctl = inotify_ioctl,
347 .llseek = noop_llseek,
340}; 348};
341 349
342 350
@@ -357,59 +365,159 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
357 return error; 365 return error;
358} 366}
359 367
368static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
369 int *last_wd,
370 struct inotify_inode_mark *i_mark)
371{
372 int ret;
373
374 do {
375 if (unlikely(!idr_pre_get(idr, GFP_KERNEL)))
376 return -ENOMEM;
377
378 spin_lock(idr_lock);
379 ret = idr_get_new_above(idr, i_mark, *last_wd + 1,
380 &i_mark->wd);
381 /* we added the mark to the idr, take a reference */
382 if (!ret) {
383 *last_wd = i_mark->wd;
384 fsnotify_get_mark(&i_mark->fsn_mark);
385 }
386 spin_unlock(idr_lock);
387 } while (ret == -EAGAIN);
388
389 return ret;
390}
391
392static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group,
393 int wd)
394{
395 struct idr *idr = &group->inotify_data.idr;
396 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
397 struct inotify_inode_mark *i_mark;
398
399 assert_spin_locked(idr_lock);
400
401 i_mark = idr_find(idr, wd);
402 if (i_mark) {
403 struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark;
404
405 fsnotify_get_mark(fsn_mark);
406 /* One ref for being in the idr, one ref we just took */
407 BUG_ON(atomic_read(&fsn_mark->refcnt) < 2);
408 }
409
410 return i_mark;
411}
412
413static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
414 int wd)
415{
416 struct inotify_inode_mark *i_mark;
417 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
418
419 spin_lock(idr_lock);
420 i_mark = inotify_idr_find_locked(group, wd);
421 spin_unlock(idr_lock);
422
423 return i_mark;
424}
425
426static void do_inotify_remove_from_idr(struct fsnotify_group *group,
427 struct inotify_inode_mark *i_mark)
428{
429 struct idr *idr = &group->inotify_data.idr;
430 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
431 int wd = i_mark->wd;
432
433 assert_spin_locked(idr_lock);
434
435 idr_remove(idr, wd);
436
437 /* removed from the idr, drop that ref */
438 fsnotify_put_mark(&i_mark->fsn_mark);
439}
440
360/* 441/*
361 * Remove the mark from the idr (if present) and drop the reference 442 * Remove the mark from the idr (if present) and drop the reference
362 * on the mark because it was in the idr. 443 * on the mark because it was in the idr.
363 */ 444 */
364static void inotify_remove_from_idr(struct fsnotify_group *group, 445static void inotify_remove_from_idr(struct fsnotify_group *group,
365 struct inotify_inode_mark_entry *ientry) 446 struct inotify_inode_mark *i_mark)
366{ 447{
367 struct idr *idr; 448 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
368 struct fsnotify_mark_entry *entry; 449 struct inotify_inode_mark *found_i_mark = NULL;
369 struct inotify_inode_mark_entry *found_ientry;
370 int wd; 450 int wd;
371 451
372 spin_lock(&group->inotify_data.idr_lock); 452 spin_lock(idr_lock);
373 idr = &group->inotify_data.idr; 453 wd = i_mark->wd;
374 wd = ientry->wd;
375 454
376 if (wd == -1) 455 /*
456 * does this i_mark think it is in the idr? we shouldn't get called
457 * if it wasn't....
458 */
459 if (wd == -1) {
460 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
461 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
462 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
377 goto out; 463 goto out;
464 }
378 465
 379 entry = idr_find(&group->inotify_data.idr, wd); 466 /* Let's look in the idr to see if we find it */
380 if (unlikely(!entry)) 467 found_i_mark = inotify_idr_find_locked(group, wd);
468 if (unlikely(!found_i_mark)) {
469 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
470 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
471 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
381 goto out; 472 goto out;
473 }
382 474
383 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 475 /*
 384 if (unlikely(found_ientry != ientry)) { 476 * We found a mark in the idr at the right wd, but it's
385 /* We found an entry in the idr with the right wd, but it's 477 * not the mark we were told to remove. eparis seriously
386 * not the entry we were told to remove. eparis seriously 478 * fucked up somewhere.
387 * fucked up somewhere. */ 479 */
388 WARN_ON(1); 480 if (unlikely(found_i_mark != i_mark)) {
389 ientry->wd = -1; 481 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
482 "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
483 "found_i_mark->group=%p found_i_mark->inode=%p\n",
484 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
485 i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd,
486 found_i_mark->fsn_mark.group,
487 found_i_mark->fsn_mark.i.inode);
390 goto out; 488 goto out;
391 } 489 }
392 490
393 /* One ref for being in the idr, one ref held by the caller */ 491 /*
394 BUG_ON(atomic_read(&entry->refcnt) < 2); 492 * One ref for being in the idr
395 493 * one ref held by the caller trying to kill us
396 idr_remove(idr, wd); 494 * one ref grabbed by inotify_idr_find
397 ientry->wd = -1; 495 */
496 if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
497 printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
498 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
499 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
 500 /* we can't really recover with bad refcounting... */
501 BUG();
502 }
398 503
399 /* removed from the idr, drop that ref */ 504 do_inotify_remove_from_idr(group, i_mark);
400 fsnotify_put_mark(entry);
401out: 505out:
402 spin_unlock(&group->inotify_data.idr_lock); 506 /* match the ref taken by inotify_idr_find_locked() */
507 if (found_i_mark)
508 fsnotify_put_mark(&found_i_mark->fsn_mark);
509 i_mark->wd = -1;
510 spin_unlock(idr_lock);
403} 511}
404 512
405/* 513/*
406 * Send IN_IGNORED for this wd, remove this wd from the idr. 514 * Send IN_IGNORED for this wd, remove this wd from the idr.
407 */ 515 */
408void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 516void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
409 struct fsnotify_group *group) 517 struct fsnotify_group *group)
410{ 518{
411 struct inotify_inode_mark_entry *ientry; 519 struct inotify_inode_mark *i_mark;
412 struct fsnotify_event *ignored_event; 520 struct fsnotify_event *ignored_event, *notify_event;
413 struct inotify_event_private_data *event_priv; 521 struct inotify_event_private_data *event_priv;
414 struct fsnotify_event_private_data *fsn_event_priv; 522 struct fsnotify_event_private_data *fsn_event_priv;
415 int ret; 523 int ret;
@@ -420,7 +528,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
420 if (!ignored_event) 528 if (!ignored_event)
421 return; 529 return;
422 530
423 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 531 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
424 532
425 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); 533 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
426 if (unlikely(!event_priv)) 534 if (unlikely(!event_priv))
@@ -429,37 +537,44 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
429 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 537 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
430 538
431 fsn_event_priv->group = group; 539 fsn_event_priv->group = group;
432 event_priv->wd = ientry->wd; 540 event_priv->wd = i_mark->wd;
433 541
434 ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); 542 notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
435 if (ret) 543 if (notify_event) {
544 if (IS_ERR(notify_event))
545 ret = PTR_ERR(notify_event);
546 else
547 fsnotify_put_event(notify_event);
436 inotify_free_event_priv(fsn_event_priv); 548 inotify_free_event_priv(fsn_event_priv);
549 }
437 550
438skip_send_ignore: 551skip_send_ignore:
439 552
440 /* matches the reference taken when the event was created */ 553 /* matches the reference taken when the event was created */
441 fsnotify_put_event(ignored_event); 554 fsnotify_put_event(ignored_event);
442 555
443 /* remove this entry from the idr */ 556 /* remove this mark from the idr */
444 inotify_remove_from_idr(group, ientry); 557 inotify_remove_from_idr(group, i_mark);
445 558
446 atomic_dec(&group->inotify_data.user->inotify_watches); 559 atomic_dec(&group->inotify_data.user->inotify_watches);
447} 560}
448 561
449/* ding dong the mark is dead */ 562/* ding dong the mark is dead */
450static void inotify_free_mark(struct fsnotify_mark_entry *entry) 563static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
451{ 564{
452 struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; 565 struct inotify_inode_mark *i_mark;
566
567 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
453 568
454 kmem_cache_free(inotify_inode_mark_cachep, ientry); 569 kmem_cache_free(inotify_inode_mark_cachep, i_mark);
455} 570}
456 571
457static int inotify_update_existing_watch(struct fsnotify_group *group, 572static int inotify_update_existing_watch(struct fsnotify_group *group,
458 struct inode *inode, 573 struct inode *inode,
459 u32 arg) 574 u32 arg)
460{ 575{
461 struct fsnotify_mark_entry *entry; 576 struct fsnotify_mark *fsn_mark;
462 struct inotify_inode_mark_entry *ientry; 577 struct inotify_inode_mark *i_mark;
463 __u32 old_mask, new_mask; 578 __u32 old_mask, new_mask;
464 __u32 mask; 579 __u32 mask;
465 int add = (arg & IN_MASK_ADD); 580 int add = (arg & IN_MASK_ADD);
@@ -467,52 +582,43 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
467 582
468 /* don't allow invalid bits: we don't want flags set */ 583 /* don't allow invalid bits: we don't want flags set */
469 mask = inotify_arg_to_mask(arg); 584 mask = inotify_arg_to_mask(arg);
470 if (unlikely(!mask)) 585 if (unlikely(!(mask & IN_ALL_EVENTS)))
471 return -EINVAL; 586 return -EINVAL;
472 587
473 spin_lock(&inode->i_lock); 588 fsn_mark = fsnotify_find_inode_mark(group, inode);
474 entry = fsnotify_find_mark_entry(group, inode); 589 if (!fsn_mark)
475 spin_unlock(&inode->i_lock);
476 if (!entry)
477 return -ENOENT; 590 return -ENOENT;
478 591
479 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 592 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
480 593
481 spin_lock(&entry->lock); 594 spin_lock(&fsn_mark->lock);
482 595
483 old_mask = entry->mask; 596 old_mask = fsn_mark->mask;
484 if (add) { 597 if (add)
485 entry->mask |= mask; 598 fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
486 new_mask = entry->mask; 599 else
487 } else { 600 fsnotify_set_mark_mask_locked(fsn_mark, mask);
488 entry->mask = mask; 601 new_mask = fsn_mark->mask;
489 new_mask = entry->mask;
490 }
491 602
492 spin_unlock(&entry->lock); 603 spin_unlock(&fsn_mark->lock);
493 604
494 if (old_mask != new_mask) { 605 if (old_mask != new_mask) {
495 /* more bits in old than in new? */ 606 /* more bits in old than in new? */
496 int dropped = (old_mask & ~new_mask); 607 int dropped = (old_mask & ~new_mask);
497 /* more bits in this entry than the inode's mask? */ 608 /* more bits in this fsn_mark than the inode's mask? */
498 int do_inode = (new_mask & ~inode->i_fsnotify_mask); 609 int do_inode = (new_mask & ~inode->i_fsnotify_mask);
499 /* more bits in this entry than the group? */
500 int do_group = (new_mask & ~group->mask);
501 610
502 /* update the inode with this new entry */ 611 /* update the inode with this new fsn_mark */
503 if (dropped || do_inode) 612 if (dropped || do_inode)
504 fsnotify_recalc_inode_mask(inode); 613 fsnotify_recalc_inode_mask(inode);
505 614
506 /* update the group mask with the new mask */
507 if (dropped || do_group)
508 fsnotify_recalc_group_mask(group);
509 } 615 }
510 616
511 /* return the wd */ 617 /* return the wd */
512 ret = ientry->wd; 618 ret = i_mark->wd;
513 619
514 /* match the get from fsnotify_find_mark_entry() */ 620 /* match the get from fsnotify_find_mark() */
515 fsnotify_put_mark(entry); 621 fsnotify_put_mark(fsn_mark);
516 622
517 return ret; 623 return ret;
518} 624}
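
The add-versus-replace split above is the whole of the IN_MASK_ADD contract, and the recalculation condition is worth stating explicitly: the inode's aggregate mask must be recomputed only when bits were dropped (old & ~new) or when the new mask carries bits the inode does not yet aggregate (new & ~i_fsnotify_mask). A minimal sketch of the same decision, isolated from the locking, with hypothetical helper names:

	/* illustration only: the IN_MASK_ADD semantics of the function above */
	static __u32 example_apply_mask(__u32 old_mask, __u32 arg_mask, int mask_add)
	{
		return mask_add ? (old_mask | arg_mask)	/* IN_MASK_ADD: accumulate */
				: arg_mask;		/* default: replace outright */
	}

	static int example_needs_recalc(__u32 old_mask, __u32 new_mask, __u32 inode_mask)
	{
		return (old_mask & ~new_mask) ||	/* some bits were dropped */
		       (new_mask & ~inode_mask);	/* some bits are new to the inode */
	}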
@@ -521,73 +627,51 @@ static int inotify_new_watch(struct fsnotify_group *group,
521 struct inode *inode, 627 struct inode *inode,
522 u32 arg) 628 u32 arg)
523{ 629{
524 struct inotify_inode_mark_entry *tmp_ientry; 630 struct inotify_inode_mark *tmp_i_mark;
525 __u32 mask; 631 __u32 mask;
526 int ret; 632 int ret;
633 struct idr *idr = &group->inotify_data.idr;
634 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
527 635
528 /* don't allow invalid bits: we don't want flags set */ 636 /* don't allow invalid bits: we don't want flags set */
529 mask = inotify_arg_to_mask(arg); 637 mask = inotify_arg_to_mask(arg);
530 if (unlikely(!mask)) 638 if (unlikely(!(mask & IN_ALL_EVENTS)))
531 return -EINVAL; 639 return -EINVAL;
532 640
533 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); 641 tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
534 if (unlikely(!tmp_ientry)) 642 if (unlikely(!tmp_i_mark))
535 return -ENOMEM; 643 return -ENOMEM;
536 644
537 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); 645 fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
538 tmp_ientry->fsn_entry.mask = mask; 646 tmp_i_mark->fsn_mark.mask = mask;
539 tmp_ientry->wd = -1; 647 tmp_i_mark->wd = -1;
540 648
541 ret = -ENOSPC; 649 ret = -ENOSPC;
542 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) 650 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
543 goto out_err; 651 goto out_err;
544retry:
545 ret = -ENOMEM;
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err;
548 652
549 /* we are putting the mark on the idr, take a reference */ 653 ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd,
550 fsnotify_get_mark(&tmp_ientry->fsn_entry); 654 tmp_i_mark);
551 655 if (ret)
552 spin_lock(&group->inotify_data.idr_lock);
553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
554 group->inotify_data.last_wd+1,
555 &tmp_ientry->wd);
556 spin_unlock(&group->inotify_data.idr_lock);
557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
561 /* idr was out of memory allocate and try again */
562 if (ret == -EAGAIN)
563 goto retry;
564 goto out_err; 656 goto out_err;
565 }
566 657
567 /* we are on the idr, now get on the inode */ 658 /* we are on the idr, now get on the inode */
568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 659 ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
569 if (ret) { 660 if (ret) {
570 /* we failed to get on the inode, get off the idr */ 661 /* we failed to get on the inode, get off the idr */
571 inotify_remove_from_idr(group, tmp_ientry); 662 inotify_remove_from_idr(group, tmp_i_mark);
572 goto out_err; 663 goto out_err;
573 } 664 }
574 665
575 /* update the idr hint, who cares about races, it's just a hint */
576 group->inotify_data.last_wd = tmp_ientry->wd;
577
578 /* increment the number of watches the user has */ 666 /* increment the number of watches the user has */
579 atomic_inc(&group->inotify_data.user->inotify_watches); 667 atomic_inc(&group->inotify_data.user->inotify_watches);
580 668
581 /* return the watch descriptor for this new entry */ 669 /* return the watch descriptor for this new mark */
582 ret = tmp_ientry->wd; 670 ret = tmp_i_mark->wd;
583
584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group);
587 671
588out_err: 672out_err:
589 /* match the ref from fsnotify_init_markentry() */ 673 /* match the ref from fsnotify_init_mark() */
590 fsnotify_put_mark(&tmp_ientry->fsn_entry); 674 fsnotify_put_mark(&tmp_i_mark->fsn_mark);
591 675
592 return ret; 676 return ret;
593} 677}
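
The helper that replaces the removed retry loop is defined outside this hunk; judging from the call site and the deleted code, a plausible shape is the classic two-step idr allocation of this era (idr_pre_get() to stock the free list, idr_get_new_above() under the spinlock, retry on -EAGAIN), plus taking the idr's reference and updating last_wd while still holding idr_lock. A hedged reconstruction, not the actual implementation:

	static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
				      int *last_wd,
				      struct inotify_inode_mark *i_mark)
	{
		int ret;

		do {
			/* preload the idr's free list; 0 means allocation failed */
			if (unlikely(!idr_pre_get(idr, GFP_KERNEL)))
				return -ENOMEM;

			spin_lock(idr_lock);
			ret = idr_get_new_above(idr, &i_mark->fsn_mark, *last_wd + 1,
						&i_mark->wd);
			if (!ret) {
				/* the idr now points at the mark: take its reference */
				fsnotify_get_mark(&i_mark->fsn_mark);
				/* hint for the next allocation; races are harmless */
				*last_wd = i_mark->wd;
			}
			spin_unlock(idr_lock);
		} while (ret == -EAGAIN);	/* preloaded memory was used up, retry */

		return ret;
	}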
@@ -616,11 +700,8 @@ retry:
616static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) 700static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
617{ 701{
618 struct fsnotify_group *group; 702 struct fsnotify_group *group;
619 unsigned int grp_num;
620 703
621 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ 704 group = fsnotify_alloc_group(&inotify_fsnotify_ops);
622 grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
623 group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
624 if (IS_ERR(group)) 705 if (IS_ERR(group))
625 return group; 706 return group;
626 707
@@ -726,7 +807,7 @@ fput_and_out:
726SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) 807SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
727{ 808{
728 struct fsnotify_group *group; 809 struct fsnotify_group *group;
729 struct fsnotify_mark_entry *entry; 810 struct inotify_inode_mark *i_mark;
730 struct file *filp; 811 struct file *filp;
731 int ret = 0, fput_needed; 812 int ret = 0, fput_needed;
732 813
@@ -735,25 +816,23 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
735 return -EBADF; 816 return -EBADF;
736 817
737 /* verify that this is indeed an inotify instance */ 818 /* verify that this is indeed an inotify instance */
738 if (unlikely(filp->f_op != &inotify_fops)) { 819 ret = -EINVAL;
739 ret = -EINVAL; 820 if (unlikely(filp->f_op != &inotify_fops))
740 goto out; 821 goto out;
741 }
742 822
743 group = filp->private_data; 823 group = filp->private_data;
744 824
745 spin_lock(&group->inotify_data.idr_lock); 825 ret = -EINVAL;
746 entry = idr_find(&group->inotify_data.idr, wd); 826 i_mark = inotify_idr_find(group, wd);
747 if (unlikely(!entry)) { 827 if (unlikely(!i_mark))
748 spin_unlock(&group->inotify_data.idr_lock);
749 ret = -EINVAL;
750 goto out; 828 goto out;
751 }
752 fsnotify_get_mark(entry);
753 spin_unlock(&group->inotify_data.idr_lock);
754 829
755 fsnotify_destroy_mark_by_entry(entry); 830 ret = 0;
756 fsnotify_put_mark(entry); 831
832 fsnotify_destroy_mark(&i_mark->fsn_mark);
833
834 /* match ref taken by inotify_idr_find */
835 fsnotify_put_mark(&i_mark->fsn_mark);
757 836
758out: 837out:
759 fput_light(filp, fput_needed); 838 fput_light(filp, fput_needed);
@@ -767,7 +846,28 @@ out:
767 */ 846 */
768static int __init inotify_user_setup(void) 847static int __init inotify_user_setup(void)
769{ 848{
770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 849 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
850 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
851 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
852 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
853 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
854 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
855 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
856 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
857 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
858 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
859 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
860 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
861 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
862 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
863 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
864 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
865 BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
866 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
867
868 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
869
870 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 871 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
772 872
773 inotify_max_queued_events = 16384; 873 inotify_max_queued_events = 16384;
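
The BUILD_BUG_ON() wall above encodes the assumption that every userspace IN_* flag is bit-identical to its in-kernel FS_* counterpart, so a mismatch fails the build rather than corrupting masks at runtime. The payoff is that converting a userspace argument needs no translation table; a sketch of the likely shape of inotify_arg_to_mask() under that assumption (reconstructed, since the real helper sits outside this hunk):

	static inline __u32 example_arg_to_mask(u32 arg)
	{
		__u32 mask;

		/* every watch implicitly wants its own IN_IGNORED and child events */
		mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
		/* the requested event bits pass through untranslated: IN_* == FS_* */
		mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));

		return mask;
	}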
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
new file mode 100644
index 00000000000..325185e514b
--- /dev/null
+++ b/fs/notify/mark.c
@@ -0,0 +1,371 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object while holding the appropriate locks can take a reference
27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * mark->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * mark->lock protects 2 things, mark->group and mark->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the marks_list anchored inside a given group
42 * and each mark is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
47 * given inode and each mark is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
66 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just detached from the original inode. For each mark on the private list
72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop our reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) of the other teardown paths.
83 */
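
Condensed into code, the ordering rule above means any path that needs more than one of the three locks must acquire them strictly as mark, then group, then inode; a task taking them in any other order can deadlock against fsnotify_destroy_mark(). An illustrative (non-upstream) critical section obeying the order:

	/* illustration only: the mandatory mark -> group -> inode lock order */
	static void example_ordered_section(struct fsnotify_mark *mark)
	{
		struct fsnotify_group *group;
		struct inode *inode;

		spin_lock(&mark->lock);			/* 1: pins mark->group/inode */
		group = mark->group;
		inode = mark->i.inode;
		if (group && inode) {
			spin_lock(&group->mark_lock);	/* 2: guards group->marks_list */
			spin_lock(&inode->i_lock);	/* 3: guards i_fsnotify_marks */
			/* ... operate on all three objects at once ... */
			spin_unlock(&inode->i_lock);
			spin_unlock(&group->mark_lock);
		}
		spin_unlock(&mark->lock);
	}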
84
85#include <linux/fs.h>
86#include <linux/init.h>
87#include <linux/kernel.h>
88#include <linux/kthread.h>
89#include <linux/module.h>
90#include <linux/mutex.h>
91#include <linux/slab.h>
92#include <linux/spinlock.h>
93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95
96#include <asm/atomic.h>
97
98#include <linux/fsnotify_backend.h>
99#include "fsnotify.h"
100
101struct srcu_struct fsnotify_mark_srcu;
102static DEFINE_SPINLOCK(destroy_lock);
103static LIST_HEAD(destroy_list);
104static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
105
106void fsnotify_get_mark(struct fsnotify_mark *mark)
107{
108 atomic_inc(&mark->refcnt);
109}
110
111void fsnotify_put_mark(struct fsnotify_mark *mark)
112{
113 if (atomic_dec_and_test(&mark->refcnt))
114 mark->free_mark(mark);
115}
116
117/*
118 * Any time a mark is getting freed we end up here.
119 * The caller had better be holding a reference to this mark so we don't actually
120 * do the final put under the mark->lock
121 */
122void fsnotify_destroy_mark(struct fsnotify_mark *mark)
123{
124 struct fsnotify_group *group;
125 struct inode *inode = NULL;
126
127 spin_lock(&mark->lock);
128
129 group = mark->group;
130
131 /* something else already called this function on this mark */
132 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
133 spin_unlock(&mark->lock);
134 return;
135 }
136
137 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
138
139 /* 1 from caller and 1 for being on i_list/g_list */
140 BUG_ON(atomic_read(&mark->refcnt) < 2);
141
142 spin_lock(&group->mark_lock);
143
144 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
145 inode = mark->i.inode;
146 fsnotify_destroy_inode_mark(mark);
147 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
148 fsnotify_destroy_vfsmount_mark(mark);
149 else
150 BUG();
151
152 list_del_init(&mark->g_list);
153
154 spin_unlock(&group->mark_lock);
155 spin_unlock(&mark->lock);
156
157 spin_lock(&destroy_lock);
158 list_add(&mark->destroy_list, &destroy_list);
159 spin_unlock(&destroy_lock);
160 wake_up(&destroy_waitq);
161
162 /*
163 * Some groups like to know that marks are being freed. This is a
164 * callback to the group function to let it know that this mark
165 * is being freed.
166 */
167 if (group->ops->freeing_mark)
168 group->ops->freeing_mark(mark, group);
169
170 /*
171 * __fsnotify_update_child_dentry_flags(inode);
172 *
173 * I really want to call that, but we can't, we have no idea if the inode
174 * still exists the second we drop the mark->lock.
175 *
176	 * The next time an event arrives at this inode from one of its children
177	 * __fsnotify_parent will see that the inode doesn't care about its
178 * children and will update all of these flags then. So really this
179 * is just a lazy update (and could be a perf win...)
180 */
181
182 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
183 iput(inode);
184
185 /*
186 * it's possible that this group tried to destroy itself, but this
187	 * mark was simultaneously being freed by the inode. If that's the
188 * case, we finish freeing the group here.
189 */
190 if (unlikely(atomic_dec_and_test(&group->num_marks)))
191 fsnotify_final_destroy_group(group);
192}
193
194void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
195{
196 assert_spin_locked(&mark->lock);
197
198 mark->mask = mask;
199
200 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
201 fsnotify_set_inode_mark_mask_locked(mark, mask);
202}
203
204void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
205{
206 assert_spin_locked(&mark->lock);
207
208 mark->ignored_mask = mask;
209}
210
211/*
212 * Attach an initialized mark to a given group and fs object.
213 * These marks may be used by the fsnotify backend to determine which
214 * event types should be delivered to which group.
215 */
216int fsnotify_add_mark(struct fsnotify_mark *mark,
217 struct fsnotify_group *group, struct inode *inode,
218 struct vfsmount *mnt, int allow_dups)
219{
220 int ret = 0;
221
222 BUG_ON(inode && mnt);
223 BUG_ON(!inode && !mnt);
224
225 /*
226 * LOCKING ORDER!!!!
227 * mark->lock
228 * group->mark_lock
229 * inode->i_lock
230 */
231 spin_lock(&mark->lock);
232 spin_lock(&group->mark_lock);
233
234 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
235
236 mark->group = group;
237 list_add(&mark->g_list, &group->marks_list);
238 atomic_inc(&group->num_marks);
239 fsnotify_get_mark(mark); /* for i_list and g_list */
240
241 if (inode) {
242 ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
243 if (ret)
244 goto err;
245 } else if (mnt) {
246 ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
247 if (ret)
248 goto err;
249 } else {
250 BUG();
251 }
252
253 spin_unlock(&group->mark_lock);
254
255 /* this will pin the object if appropriate */
256 fsnotify_set_mark_mask_locked(mark, mark->mask);
257
258 spin_unlock(&mark->lock);
259
260 if (inode)
261 __fsnotify_update_child_dentry_flags(inode);
262
263 return ret;
264err:
265 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
266 list_del_init(&mark->g_list);
267 mark->group = NULL;
268 atomic_dec(&group->num_marks);
269
270 spin_unlock(&group->mark_lock);
271 spin_unlock(&mark->lock);
272
273 spin_lock(&destroy_lock);
274 list_add(&mark->destroy_list, &destroy_list);
275 spin_unlock(&destroy_lock);
276 wake_up(&destroy_waitq);
277
278 return ret;
279}
280
281/*
282 * clear any marks in a group in which mark->flags & flags is true
283 */
284void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
285 unsigned int flags)
286{
287 struct fsnotify_mark *lmark, *mark;
288 LIST_HEAD(free_list);
289
290 spin_lock(&group->mark_lock);
291 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
292 if (mark->flags & flags) {
293 list_add(&mark->free_g_list, &free_list);
294 list_del_init(&mark->g_list);
295 fsnotify_get_mark(mark);
296 }
297 }
298 spin_unlock(&group->mark_lock);
299
300 list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
301 fsnotify_destroy_mark(mark);
302 fsnotify_put_mark(mark);
303 }
304}
305
306/*
307 * Given a group, destroy all of the marks associated with that group.
308 */
309void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
310{
311 fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
312}
313
314void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
315{
316 assert_spin_locked(&old->lock);
317 new->i.inode = old->i.inode;
318 new->m.mnt = old->m.mnt;
319 new->group = old->group;
320 new->mask = old->mask;
321 new->free_mark = old->free_mark;
322}
323
324/*
325 * Nothing fancy, just initialize lists and locks and counters.
326 */
327void fsnotify_init_mark(struct fsnotify_mark *mark,
328 void (*free_mark)(struct fsnotify_mark *mark))
329{
330 memset(mark, 0, sizeof(*mark));
331 spin_lock_init(&mark->lock);
332 atomic_set(&mark->refcnt, 1);
333 mark->free_mark = free_mark;
334}
335
336static int fsnotify_mark_destroy(void *ignored)
337{
338 struct fsnotify_mark *mark, *next;
339 LIST_HEAD(private_destroy_list);
340
341 for (;;) {
342 spin_lock(&destroy_lock);
343 /* exchange the list head */
344 list_replace_init(&destroy_list, &private_destroy_list);
345 spin_unlock(&destroy_lock);
346
347 synchronize_srcu(&fsnotify_mark_srcu);
348
349 list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) {
350 list_del_init(&mark->destroy_list);
351 fsnotify_put_mark(mark);
352 }
353
354 wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
355 }
356
357 return 0;
358}
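
One pass of the loop above is a textbook deferred-reclaim sequence: swap the shared list head for a private one in O(1) so producers never block on the drain, wait out one SRCU grace period so lockless readers still walking the mark lists cannot touch a freed mark, and only then drop what may be the final reference. The same pass isolated and annotated (a restatement for clarity, not new behavior):

	static void example_drain_once(void)
	{
		struct fsnotify_mark *mark, *next;
		LIST_HEAD(private_destroy_list);

		spin_lock(&destroy_lock);
		/* O(1) swap: new victims keep queueing on the now-empty shared head */
		list_replace_init(&destroy_list, &private_destroy_list);
		spin_unlock(&destroy_lock);

		/* no reader that started before the swap can still hold a pointer */
		synchronize_srcu(&fsnotify_mark_srcu);

		list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) {
			list_del_init(&mark->destroy_list);
			fsnotify_put_mark(mark);	/* possibly the final reference */
		}
	}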
359
360static int __init fsnotify_mark_init(void)
361{
362 struct task_struct *thread;
363
364 thread = kthread_run(fsnotify_mark_destroy, NULL,
365 "fsnotify_mark");
366 if (IS_ERR(thread))
367 panic("unable to start fsnotify mark destruction thread.");
368
369 return 0;
370}
371device_initcall(fsnotify_mark_init);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index b8bf53b4c10..f39260f8f86 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -56,7 +56,7 @@ static struct kmem_cache *fsnotify_event_holder_cachep;
56 * it is needed. Its refcnt is set to 1 at kernel init time and will never 56 * it is needed. Its refcnt is set to 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed' 57 * get set to 0 so it will never get 'freed'
58 */ 58 */
59static struct fsnotify_event q_overflow_event; 59static struct fsnotify_event *q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); 60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61 61
62/** 62/**
@@ -87,12 +87,15 @@ void fsnotify_put_event(struct fsnotify_event *event)
87 return; 87 return;
88 88
89 if (atomic_dec_and_test(&event->refcnt)) { 89 if (atomic_dec_and_test(&event->refcnt)) {
90 pr_debug("%s: event=%p\n", __func__, event);
91
90 if (event->data_type == FSNOTIFY_EVENT_PATH) 92 if (event->data_type == FSNOTIFY_EVENT_PATH)
91 path_put(&event->path); 93 path_put(&event->path);
92 94
93 BUG_ON(!list_empty(&event->private_data_list)); 95 BUG_ON(!list_empty(&event->private_data_list));
94 96
95 kfree(event->file_name); 97 kfree(event->file_name);
98 put_pid(event->tgid);
96 kmem_cache_free(fsnotify_event_cachep, event); 99 kmem_cache_free(fsnotify_event_cachep, event);
97 } 100 }
98} 101}
@@ -104,7 +107,8 @@ struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
104 107
105void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) 108void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
106{ 109{
107 kmem_cache_free(fsnotify_event_holder_cachep, holder); 110 if (holder)
111 kmem_cache_free(fsnotify_event_holder_cachep, holder);
108} 112}
109 113
110/* 114/*
@@ -129,53 +133,20 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
129} 133}
130 134
131/* 135/*
132 * Check if 2 events contain the same information. We do not compare private data
133 * but at this moment that isn't a problem for any known fsnotify listeners.
134 */
135static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
136{
137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type) &&
140 (old->name_len == new->name_len)) {
141 switch (old->data_type) {
142 case (FSNOTIFY_EVENT_INODE):
143 /* remember, after old was put on the wait_q we aren't
144 * allowed to look at the inode any more, only thing
145 * left to check was if the file_name is the same */
146 if (!old->name_len ||
147 !strcmp(old->file_name, new->file_name))
148 return true;
149 break;
150 case (FSNOTIFY_EVENT_PATH):
151 if ((old->path.mnt == new->path.mnt) &&
152 (old->path.dentry == new->path.dentry))
153 return true;
154 break;
155 case (FSNOTIFY_EVENT_NONE):
156 if (old->mask & FS_Q_OVERFLOW)
157 return true;
158 else if (old->mask & FS_IN_IGNORED)
159 return false;
160 return false;
161 };
162 }
163 return false;
164}
165
166/*
167 * Add an event to the group notification queue. The group can later pull this 136 * Add an event to the group notification queue. The group can later pull this
168 * event off the queue to deal with. If the event is successfully added to the 137 * event off the queue to deal with. If the event is successfully added to the
169 * group's notification queue, a reference is taken on event. 138 * group's notification queue, a reference is taken on event.
170 */ 139 */
171int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 140struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
172 struct fsnotify_event_private_data *priv) 141 struct fsnotify_event_private_data *priv,
142 struct fsnotify_event *(*merge)(struct list_head *,
143 struct fsnotify_event *))
173{ 144{
145 struct fsnotify_event *return_event = NULL;
174 struct fsnotify_event_holder *holder = NULL; 146 struct fsnotify_event_holder *holder = NULL;
175 struct list_head *list = &group->notification_list; 147 struct list_head *list = &group->notification_list;
176 struct fsnotify_event_holder *last_holder; 148
177 struct fsnotify_event *last_event; 149 pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
178 int ret = 0;
179 150
180 /* 151 /*
181 * There is one fsnotify_event_holder embedded inside each fsnotify_event. 152 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -189,18 +160,40 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
189alloc_holder: 160alloc_holder:
190 holder = fsnotify_alloc_event_holder(); 161 holder = fsnotify_alloc_event_holder();
191 if (!holder) 162 if (!holder)
192 return -ENOMEM; 163 return ERR_PTR(-ENOMEM);
193 } 164 }
194 165
195 mutex_lock(&group->notification_mutex); 166 mutex_lock(&group->notification_mutex);
196 167
197 if (group->q_len >= group->max_events) { 168 if (group->q_len >= group->max_events) {
198 event = &q_overflow_event; 169 event = q_overflow_event;
199 ret = -EOVERFLOW; 170
171 /*
172 * we need to return the overflow event
173 * which means we need a ref
174 */
175 fsnotify_get_event(event);
176 return_event = event;
177
200 /* sorry, no private data on the overflow event */ 178 /* sorry, no private data on the overflow event */
201 priv = NULL; 179 priv = NULL;
202 } 180 }
203 181
182 if (!list_empty(list) && merge) {
183 struct fsnotify_event *tmp;
184
185 tmp = merge(list, event);
186 if (tmp) {
187 mutex_unlock(&group->notification_mutex);
188
189 if (return_event)
190 fsnotify_put_event(return_event);
191 if (holder != &event->holder)
192 fsnotify_destroy_event_holder(holder);
193 return tmp;
194 }
195 }
196
204 spin_lock(&event->lock); 197 spin_lock(&event->lock);
205 198
206 if (list_empty(&event->holder.event_list)) { 199 if (list_empty(&event->holder.event_list)) {
@@ -212,19 +205,13 @@ alloc_holder:
212 * event holder was used, go back and get a new one */ 205 * event holder was used, go back and get a new one */
213 spin_unlock(&event->lock); 206 spin_unlock(&event->lock);
214 mutex_unlock(&group->notification_mutex); 207 mutex_unlock(&group->notification_mutex);
215 goto alloc_holder;
216 }
217 208
218 if (!list_empty(list)) { 209 if (return_event) {
219 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); 210 fsnotify_put_event(return_event);
220 last_event = last_holder->event; 211 return_event = NULL;
221 if (event_compare(last_event, event)) {
222 spin_unlock(&event->lock);
223 mutex_unlock(&group->notification_mutex);
224 if (holder != &event->holder)
225 fsnotify_destroy_event_holder(holder);
226 return -EEXIST;
227 } 212 }
213
214 goto alloc_holder;
228 } 215 }
229 216
230 group->q_len++; 217 group->q_len++;
@@ -238,7 +225,7 @@ alloc_holder:
238 mutex_unlock(&group->notification_mutex); 225 mutex_unlock(&group->notification_mutex);
239 226
240 wake_up(&group->notification_waitq); 227 wake_up(&group->notification_waitq);
241 return ret; 228 return return_event;
242} 229}
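
With this change the queue no longer hard-codes "is the new event identical to the tail" logic; the caller passes a merge() callback that may coalesce the new event with something already queued and return the surviving event. A hedged sketch of the simplest such callback, collapsing exact tail duplicates (the real inotify/fanotify merge functions live in their own files and are more elaborate):

	static struct fsnotify_event *example_merge(struct list_head *list,
						    struct fsnotify_event *event)
	{
		struct fsnotify_event_holder *last_holder;
		struct fsnotify_event *last_event;

		/* called with group->notification_mutex held and a non-empty list */
		last_holder = list_entry(list->prev, struct fsnotify_event_holder,
					 event_list);
		last_event = last_holder->event;

		if (last_event->mask == event->mask &&
		    last_event->to_tell == event->to_tell) {
			/* returned events carry a reference for the caller */
			fsnotify_get_event(last_event);
			return last_event;	/* merged: nothing new gets queued */
		}

		return NULL;			/* no merge: queue the new event */
	}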
243 230
244/* 231/*
@@ -253,6 +240,8 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
253 240
254 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 241 BUG_ON(!mutex_is_locked(&group->notification_mutex));
255 242
243 pr_debug("%s: group=%p\n", __func__, group);
244
256 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 245 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
257 246
258 event = holder->event; 247 event = holder->event;
@@ -314,25 +303,82 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
314 303
315static void initialize_event(struct fsnotify_event *event) 304static void initialize_event(struct fsnotify_event *event)
316{ 305{
317 event->holder.event = NULL;
318 INIT_LIST_HEAD(&event->holder.event_list); 306 INIT_LIST_HEAD(&event->holder.event_list);
319 atomic_set(&event->refcnt, 1); 307 atomic_set(&event->refcnt, 1);
320 308
321 spin_lock_init(&event->lock); 309 spin_lock_init(&event->lock);
322 310
323 event->path.dentry = NULL;
324 event->path.mnt = NULL;
325 event->inode = NULL;
326 event->data_type = FSNOTIFY_EVENT_NONE;
327
328 INIT_LIST_HEAD(&event->private_data_list); 311 INIT_LIST_HEAD(&event->private_data_list);
312}
313
314/*
315 * Caller damn well better be holding whatever mutex is protecting the
316 * old_holder->event_list, and the new_event must be a clean event which
317 * cannot be found anywhere else in the kernel.
318 */
319int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
320 struct fsnotify_event *new_event)
321{
322 struct fsnotify_event *old_event = old_holder->event;
323 struct fsnotify_event_holder *new_holder = &new_event->holder;
329 324
330 event->to_tell = NULL; 325 enum event_spinlock_class {
326 SPINLOCK_OLD,
327 SPINLOCK_NEW,
328 };
331 329
332 event->file_name = NULL; 330 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
333 event->name_len = 0;
334 331
335 event->sync_cookie = 0; 332 /*
333 * if the new_event's embedded holder is in use someone
334 * screwed up and didn't give us a clean new event.
335 */
336 BUG_ON(!list_empty(&new_holder->event_list));
337
338 spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
339 spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
340
341 new_holder->event = new_event;
342 list_replace_init(&old_holder->event_list, &new_holder->event_list);
343
344 spin_unlock(&new_event->lock);
345 spin_unlock(&old_event->lock);
346
347	/* if the event rode in a standalone holder (not its embedded one), free it */
348 if (old_holder != &old_event->holder)
349 fsnotify_destroy_event_holder(old_holder);
350
351 fsnotify_get_event(new_event); /* on the list take reference */
352 fsnotify_put_event(old_event); /* off the list, drop reference */
353
354 return 0;
355}
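
Both event locks belong to the same lock class, so taking them back to back would normally look like recursive locking to lockdep. spin_lock_nested() with two distinct subclass values declares the intentional ordering; what actually prevents an ABBA deadlock is the caller-side rule that the old event is always locked first. The annotation pattern in isolation (illustrative, not from this file):

	enum pair_subclass {
		PAIR_LOCK_FIRST,	/* by convention, always taken first */
		PAIR_LOCK_SECOND,	/* by convention, always taken second */
	};

	static void example_lock_pair(spinlock_t *first, spinlock_t *second)
	{
		/* same lock class twice: tell lockdep the nesting is deliberate */
		spin_lock_nested(first, PAIR_LOCK_FIRST);
		spin_lock_nested(second, PAIR_LOCK_SECOND);
		/* ... critical section spanning both objects ... */
		spin_unlock(second);
		spin_unlock(first);
	}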
356
357struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
358{
359 struct fsnotify_event *event;
360
361 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
362 if (!event)
363 return NULL;
364
365 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
366
367 memcpy(event, old_event, sizeof(*event));
368 initialize_event(event);
369
370 if (event->name_len) {
371 event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
372 if (!event->file_name) {
373 kmem_cache_free(fsnotify_event_cachep, event);
374 return NULL;
375 }
376 }
377 event->tgid = get_pid(old_event->tgid);
378 if (event->data_type == FSNOTIFY_EVENT_PATH)
379 path_get(&event->path);
380
381 return event;
336} 382}
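
fsnotify_clone_event() is the shallow-copy-then-fix-ownership idiom: memcpy() duplicates every field, initialize_event() resets the state that must be private to the copy (refcount, holder linkage, lock), and each heap-owned or reference-counted field is then re-acquired so the clone never aliases the original's resources. The skeleton of the idiom, shown on a hypothetical structure:

	struct example_obj {
		atomic_t refcnt;	/* private: reset in the clone */
		char *name;		/* heap-owned: must be duplicated */
		struct pid *tgid;	/* refcounted: must take a new reference */
	};

	static struct example_obj *example_clone(struct example_obj *old, gfp_t gfp)
	{
		struct example_obj *new = kmalloc(sizeof(*new), gfp);

		if (!new)
			return NULL;

		memcpy(new, old, sizeof(*new));		/* shallow copy of everything */
		atomic_set(&new->refcnt, 1);		/* fresh private state */

		if (old->name) {
			new->name = kstrdup(old->name, gfp);	/* deep-copy owned memory */
			if (!new->name) {
				kfree(new);
				return NULL;
			}
		}
		new->tgid = get_pid(old->tgid);		/* pin shared state */

		return new;
	}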
337 383
338/* 384/*
@@ -348,15 +394,18 @@ static void initialize_event(struct fsnotify_event *event)
348 * @name the filename, if available 394 * @name the filename, if available
349 */ 395 */
350struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 396struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
351 int data_type, const char *name, u32 cookie, 397 int data_type, const unsigned char *name,
352 gfp_t gfp) 398 u32 cookie, gfp_t gfp)
353{ 399{
354 struct fsnotify_event *event; 400 struct fsnotify_event *event;
355 401
356 event = kmem_cache_alloc(fsnotify_event_cachep, gfp); 402 event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
357 if (!event) 403 if (!event)
358 return NULL; 404 return NULL;
359 405
406 pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
407 __func__, event, to_tell, mask, data, data_type);
408
360 initialize_event(event); 409 initialize_event(event);
361 410
362 if (name) { 411 if (name) {
@@ -368,30 +417,21 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
368 event->name_len = strlen(event->file_name); 417 event->name_len = strlen(event->file_name);
369 } 418 }
370 419
420 event->tgid = get_pid(task_tgid(current));
371 event->sync_cookie = cookie; 421 event->sync_cookie = cookie;
372 event->to_tell = to_tell; 422 event->to_tell = to_tell;
423 event->data_type = data_type;
373 424
374 switch (data_type) { 425 switch (data_type) {
375 case FSNOTIFY_EVENT_FILE: {
376 struct file *file = data;
377 struct path *path = &file->f_path;
378 event->path.dentry = path->dentry;
379 event->path.mnt = path->mnt;
380 path_get(&event->path);
381 event->data_type = FSNOTIFY_EVENT_PATH;
382 break;
383 }
384 case FSNOTIFY_EVENT_PATH: { 426 case FSNOTIFY_EVENT_PATH: {
385 struct path *path = data; 427 struct path *path = data;
386 event->path.dentry = path->dentry; 428 event->path.dentry = path->dentry;
387 event->path.mnt = path->mnt; 429 event->path.mnt = path->mnt;
388 path_get(&event->path); 430 path_get(&event->path);
389 event->data_type = FSNOTIFY_EVENT_PATH;
390 break; 431 break;
391 } 432 }
392 case FSNOTIFY_EVENT_INODE: 433 case FSNOTIFY_EVENT_INODE:
393 event->inode = data; 434 event->inode = data;
394 event->data_type = FSNOTIFY_EVENT_INODE;
395 break; 435 break;
396 case FSNOTIFY_EVENT_NONE: 436 case FSNOTIFY_EVENT_NONE:
397 event->inode = NULL; 437 event->inode = NULL;
@@ -412,8 +452,11 @@ __init int fsnotify_notification_init(void)
412 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); 452 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
413 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); 453 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
414 454
415 initialize_event(&q_overflow_event); 455 q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
416 q_overflow_event.mask = FS_Q_OVERFLOW; 456 FSNOTIFY_EVENT_NONE, NULL, 0,
457 GFP_KERNEL);
458 if (!q_overflow_event)
459 panic("unable to allocate fsnotify q_overflow_event\n");
417 460
418 return 0; 461 return 0;
419} 462}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
new file mode 100644
index 00000000000..85eebff6d0d
--- /dev/null
+++ b/fs/notify/vfsmount_mark.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/init.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/mount.h>
24#include <linux/mutex.h>
25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27
28#include <asm/atomic.h>
29
30#include <linux/fsnotify_backend.h>
31#include "fsnotify.h"
32
33void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
34{
35 struct fsnotify_mark *mark, *lmark;
36 struct hlist_node *pos, *n;
37 LIST_HEAD(free_list);
38
39 spin_lock(&mnt->mnt_root->d_lock);
40 hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) {
41 list_add(&mark->m.free_m_list, &free_list);
42 hlist_del_init_rcu(&mark->m.m_list);
43 fsnotify_get_mark(mark);
44 }
45 spin_unlock(&mnt->mnt_root->d_lock);
46
47 list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
48 fsnotify_destroy_mark(mark);
49 fsnotify_put_mark(mark);
50 }
51}
52
53void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
54{
55 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
56}
57
58/*
59 * Recalculate the mask of events relevant to a given vfsmount; caller holds the lock.
60 */
61static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
62{
63 struct fsnotify_mark *mark;
64 struct hlist_node *pos;
65 __u32 new_mask = 0;
66
67 assert_spin_locked(&mnt->mnt_root->d_lock);
68
69 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list)
70 new_mask |= mark->mask;
71 mnt->mnt_fsnotify_mask = new_mask;
72}
73
74/*
75 * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
76 * any notifier is interested in hearing about on this mount point
77 */
78void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
79{
80 spin_lock(&mnt->mnt_root->d_lock);
81 fsnotify_recalc_vfsmount_mask_locked(mnt);
82 spin_unlock(&mnt->mnt_root->d_lock);
83}
84
85void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
86{
87 struct vfsmount *mnt = mark->m.mnt;
88
89 assert_spin_locked(&mark->lock);
90 assert_spin_locked(&mark->group->mark_lock);
91
92 spin_lock(&mnt->mnt_root->d_lock);
93
94 hlist_del_init_rcu(&mark->m.m_list);
95 mark->m.mnt = NULL;
96
97 fsnotify_recalc_vfsmount_mask_locked(mnt);
98
99 spin_unlock(&mnt->mnt_root->d_lock);
100}
101
102static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
103 struct vfsmount *mnt)
104{
105 struct fsnotify_mark *mark;
106 struct hlist_node *pos;
107
108 assert_spin_locked(&mnt->mnt_root->d_lock);
109
110 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) {
111 if (mark->group == group) {
112 fsnotify_get_mark(mark);
113 return mark;
114 }
115 }
116 return NULL;
117}
118
119/*
120 * given a group and vfsmount, find the mark associated with that combination.
121 * if found take a reference to that mark and return it, else return NULL
122 */
123struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
124 struct vfsmount *mnt)
125{
126 struct fsnotify_mark *mark;
127
128 spin_lock(&mnt->mnt_root->d_lock);
129 mark = fsnotify_find_vfsmount_mark_locked(group, mnt);
130 spin_unlock(&mnt->mnt_root->d_lock);
131
132 return mark;
133}
134
135/*
136 * Attach an initialized mark to a given group and vfsmount.
137 * These marks may be used by the fsnotify backend to determine which
138 * event types should be delivered to which groups.
139 */
140int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
141 struct fsnotify_group *group, struct vfsmount *mnt,
142 int allow_dups)
143{
144 struct fsnotify_mark *lmark;
145 struct hlist_node *node, *last = NULL;
146 int ret = 0;
147
148 mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
149
150 assert_spin_locked(&mark->lock);
151 assert_spin_locked(&group->mark_lock);
152
153 spin_lock(&mnt->mnt_root->d_lock);
154
155 mark->m.mnt = mnt;
156
157 /* is mark the first mark? */
158 if (hlist_empty(&mnt->mnt_fsnotify_marks)) {
159 hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks);
160 goto out;
161 }
162
163 /* should mark be in the middle of the current list? */
164 hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
165 last = node;
166
167 if ((lmark->group == group) && !allow_dups) {
168 ret = -EEXIST;
169 goto out;
170 }
171
172 if (mark->group->priority < lmark->group->priority)
173 continue;
174
175 if ((mark->group->priority == lmark->group->priority) &&
176 (mark->group < lmark->group))
177 continue;
178
179 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
180 goto out;
181 }
182
183 BUG_ON(last == NULL);
184 /* mark should be the last entry. last is the current last entry */
185 hlist_add_after_rcu(last, &mark->m.m_list);
186out:
187 fsnotify_recalc_vfsmount_mask_locked(mnt);
188 spin_unlock(&mnt->mnt_root->d_lock);
189
190 return ret;
191}
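
The insertion walk above keeps the per-mount list sorted so higher-priority groups see events first: remember the last node visited, skip past every entry that should stay ahead of the new mark, insert before the first that should not, and fall through to tail insertion when the walk exhausts the list. The same idiom reduced to a generic priority-sorted hlist insert (illustrative types, era-appropriate hlist API):

	struct example_item {
		int prio;			/* larger value sorts earlier */
		struct hlist_node list;
	};

	static void example_insert_sorted(struct hlist_head *head,
					  struct example_item *new)
	{
		struct example_item *cur;
		struct hlist_node *node, *last = NULL;

		if (hlist_empty(head)) {
			hlist_add_head_rcu(&new->list, head);
			return;
		}

		hlist_for_each_entry(cur, node, head, list) {
			last = node;
			if (new->prio <= cur->prio)	/* cur stays ahead: keep going */
				continue;
			hlist_add_before_rcu(&new->list, &cur->list);
			return;
		}

		/* everything outranked the new item; last is the current tail */
		hlist_add_after_rcu(last, &new->list);
	}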
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4b57fb1eac2..93622b175fc 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2238,7 +2238,7 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
2238} 2238}
2239 2239
2240/** 2240/**
2241 * ntfs_clear_big_inode - clean up the ntfs specific part of an inode 2241 * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
2242 * @vi: vfs inode pending annihilation 2242 * @vi: vfs inode pending annihilation
2243 * 2243 *
2244 * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode() 2244 * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
@@ -2247,10 +2247,13 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
2247 * 2247 *
2248 * If the MFT record is dirty, we commit it before doing anything else. 2248 * If the MFT record is dirty, we commit it before doing anything else.
2249 */ 2249 */
2250void ntfs_clear_big_inode(struct inode *vi) 2250void ntfs_evict_big_inode(struct inode *vi)
2251{ 2251{
2252 ntfs_inode *ni = NTFS_I(vi); 2252 ntfs_inode *ni = NTFS_I(vi);
2253 2253
2254 truncate_inode_pages(&vi->i_data, 0);
2255 end_writeback(vi);
2256
2254#ifdef NTFS_RW 2257#ifdef NTFS_RW
2255 if (NInoDirty(ni)) { 2258 if (NInoDirty(ni)) {
2256 bool was_bad = (is_bad_inode(vi)); 2259 bool was_bad = (is_bad_inode(vi));
@@ -2879,9 +2882,6 @@ void ntfs_truncate_vfs(struct inode *vi) {
2879 * 2882 *
2880 * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also 2883 * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also
2881 * called with ->i_alloc_sem held for writing. 2884 * called with ->i_alloc_sem held for writing.
2882 *
2883 * Basically this is a copy of generic notify_change() and inode_setattr()
2884 * functionality, except we intercept and abort changes in i_size.
2885 */ 2885 */
2886int ntfs_setattr(struct dentry *dentry, struct iattr *attr) 2886int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2887{ 2887{
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 9a113544605..2dabf813456 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -279,7 +279,7 @@ extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
279 279
280extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); 280extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
281extern void ntfs_destroy_big_inode(struct inode *inode); 281extern void ntfs_destroy_big_inode(struct inode *inode);
282extern void ntfs_clear_big_inode(struct inode *vi); 282extern void ntfs_evict_big_inode(struct inode *vi);
283 283
284extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); 284extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
285 285
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0de1db6cddb..a30ecacc01f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -30,7 +30,6 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h>
34#include <linux/bitmap.h> 33#include <linux/bitmap.h>
35 34
36#include "sysctl.h" 35#include "sysctl.h"
@@ -445,7 +444,6 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
445 444
446 ntfs_debug("Entering with remount options string: %s", opt); 445 ntfs_debug("Entering with remount options string: %s", opt);
447 446
448 lock_kernel();
449#ifndef NTFS_RW 447#ifndef NTFS_RW
450 /* For read-only compiled driver, enforce read-only flag. */ 448 /* For read-only compiled driver, enforce read-only flag. */
451 *flags |= MS_RDONLY; 449 *flags |= MS_RDONLY;
@@ -469,18 +467,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
469 if (NVolErrors(vol)) { 467 if (NVolErrors(vol)) {
470 ntfs_error(sb, "Volume has errors and is read-only%s", 468 ntfs_error(sb, "Volume has errors and is read-only%s",
471 es); 469 es);
472 unlock_kernel();
473 return -EROFS; 470 return -EROFS;
474 } 471 }
475 if (vol->vol_flags & VOLUME_IS_DIRTY) { 472 if (vol->vol_flags & VOLUME_IS_DIRTY) {
476 ntfs_error(sb, "Volume is dirty and read-only%s", es); 473 ntfs_error(sb, "Volume is dirty and read-only%s", es);
477 unlock_kernel();
478 return -EROFS; 474 return -EROFS;
479 } 475 }
480 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { 476 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
481 ntfs_error(sb, "Volume has been modified by chkdsk " 477 ntfs_error(sb, "Volume has been modified by chkdsk "
482 "and is read-only%s", es); 478 "and is read-only%s", es);
483 unlock_kernel();
484 return -EROFS; 479 return -EROFS;
485 } 480 }
486 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { 481 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -488,13 +483,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
488 "(0x%x) and is read-only%s", 483 "(0x%x) and is read-only%s",
489 (unsigned)le16_to_cpu(vol->vol_flags), 484 (unsigned)le16_to_cpu(vol->vol_flags),
490 es); 485 es);
491 unlock_kernel();
492 return -EROFS; 486 return -EROFS;
493 } 487 }
494 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { 488 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
495 ntfs_error(sb, "Failed to set dirty bit in volume " 489 ntfs_error(sb, "Failed to set dirty bit in volume "
496 "information flags%s", es); 490 "information flags%s", es);
497 unlock_kernel();
498 return -EROFS; 491 return -EROFS;
499 } 492 }
500#if 0 493#if 0
@@ -514,21 +507,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
514 ntfs_error(sb, "Failed to empty journal $LogFile%s", 507 ntfs_error(sb, "Failed to empty journal $LogFile%s",
515 es); 508 es);
516 NVolSetErrors(vol); 509 NVolSetErrors(vol);
517 unlock_kernel();
518 return -EROFS; 510 return -EROFS;
519 } 511 }
520 if (!ntfs_mark_quotas_out_of_date(vol)) { 512 if (!ntfs_mark_quotas_out_of_date(vol)) {
521 ntfs_error(sb, "Failed to mark quotas out of date%s", 513 ntfs_error(sb, "Failed to mark quotas out of date%s",
522 es); 514 es);
523 NVolSetErrors(vol); 515 NVolSetErrors(vol);
524 unlock_kernel();
525 return -EROFS; 516 return -EROFS;
526 } 517 }
527 if (!ntfs_stamp_usnjrnl(vol)) { 518 if (!ntfs_stamp_usnjrnl(vol)) {
528			ntfs_error(sb, "Failed to stamp transaction log "	519				"($UsnJrnl)%s", es);
529 "($UsnJrnl)%s", es); 520 "($UsnJrnl)%s", es);
530 NVolSetErrors(vol); 521 NVolSetErrors(vol);
531 unlock_kernel();
532 return -EROFS; 522 return -EROFS;
533 } 523 }
534 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 524 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -544,11 +534,9 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
544 534
545 // TODO: Deal with *flags. 535 // TODO: Deal with *flags.
546 536
547 if (!parse_options(vol, opt)) { 537 if (!parse_options(vol, opt))
548 unlock_kernel();
549 return -EINVAL; 538 return -EINVAL;
550 } 539
551 unlock_kernel();
552 ntfs_debug("Done."); 540 ntfs_debug("Done.");
553 return 0; 541 return 0;
554} 542}
@@ -2261,8 +2249,6 @@ static void ntfs_put_super(struct super_block *sb)
2261 2249
2262 ntfs_debug("Entering."); 2250 ntfs_debug("Entering.");
2263 2251
2264 lock_kernel();
2265
2266#ifdef NTFS_RW 2252#ifdef NTFS_RW
2267 /* 2253 /*
2268 * Commit all inodes while they are still open in case some of them 2254 * Commit all inodes while they are still open in case some of them
@@ -2433,8 +2419,6 @@ static void ntfs_put_super(struct super_block *sb)
2433 2419
2434 sb->s_fs_info = NULL; 2420 sb->s_fs_info = NULL;
2435 kfree(vol); 2421 kfree(vol);
2436
2437 unlock_kernel();
2438} 2422}
2439 2423
2440/** 2424/**
@@ -2700,7 +2684,7 @@ static const struct super_operations ntfs_sops = {
2700 .put_super = ntfs_put_super, /* Syscall: umount. */ 2684 .put_super = ntfs_put_super, /* Syscall: umount. */
2701 .statfs = ntfs_statfs, /* Syscall: statfs */ 2685 .statfs = ntfs_statfs, /* Syscall: statfs */
 	.remount_fs	= ntfs_remount,		/* Syscall: mount -o remount. */
-	.clear_inode	= ntfs_clear_big_inode,	/* VFS: Called when an inode is
+	.evict_inode	= ntfs_evict_big_inode,	/* VFS: Called when an inode is
 						   removed from memory. */
 	//.umount_begin	= NULL,			/* Forced umount. */
 	.show_options	= ntfs_show_options,	/* Show mount options in
@@ -2772,8 +2756,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	init_rwsem(&vol->mftbmp_lock);
 	init_rwsem(&vol->lcnbmp_lock);
 
-	unlock_kernel();
-
 	/* By default, enable sparse support. */
 	NVolSetSparseEnabled(vol);
 
@@ -2929,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 			goto unl_upcase_iput_tmp_ino_err_out_now;
 	}
 	if ((sb->s_root = d_alloc_root(vol->root_ino))) {
-		/* We increment i_count simulating an ntfs_iget(). */
-		atomic_inc(&vol->root_ino->i_count);
+		/* We grab a reference, simulating an ntfs_iget(). */
+		ihold(vol->root_ino);
 		ntfs_debug("Exiting, status successful.");
 		/* Release the default upcase if it has no users. */
 		mutex_lock(&ntfs_lock);
@@ -2940,7 +2922,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		}
 		mutex_unlock(&ntfs_lock);
 		sb->s_export_op = &ntfs_export_ops;
-		lock_kernel();
 		lockdep_on();
 		return 0;
 	}
@@ -3040,24 +3021,8 @@ iput_tmp_ino_err_out_now:
 	if (vol->mft_ino && vol->mft_ino != tmp_ino)
 		iput(vol->mft_ino);
 	vol->mft_ino = NULL;
-	/*
-	 * This is needed to get ntfs_clear_extent_inode() called for each
-	 * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
-	 * leak resources and B) a subsequent mount fails automatically due to
-	 * ntfs_iget() never calling down into our ntfs_read_locked_inode()
-	 * method again... FIXME: Do we need to do this twice now because of
-	 * attribute inodes? I think not, so leave as is for now... (AIA)
-	 */
-	if (invalidate_inodes(sb)) {
-		ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
-				"driver bug.");
-		/* Copied from fs/super.c. I just love this message. (-; */
-		printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
-				"seconds. Have a nice day...\n");
-	}
 	/* Errors at this stage are irrelevant. */
 err_out_now:
-	lock_kernel();
 	sb->s_fs_info = NULL;
 	kfree(vol);
 	ntfs_debug("Failed, returning -EINVAL.");
@@ -3094,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
 
-static int ntfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ntfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
}
 
 static struct file_system_type ntfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ntfs",
-	.get_sb		= ntfs_get_sb,
+	.mount		= ntfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
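
Note on the hunk above: this tracks the VFS-wide conversion that replaced the .get_sb operation (which filled in a struct vfsmount) with .mount (which returns the root dentry or an ERR_PTR). A minimal sketch of the converted pattern for a block-device filesystem; myfs_fill_super is a hypothetical fill_super callback, not part of the patch:

#include <linux/fs.h>

/* Hypothetical fill_super callback; parses options and sets up sb. */
static int myfs_fill_super(struct super_block *sb, void *data, int silent);

/* New-style mount op: returns the root dentry (or ERR_PTR) instead of
 * filling in a struct vfsmount as the old .get_sb did. */
static struct dentry *myfs_mount(struct file_system_type *fs_type,
				 int flags, const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, myfs_fill_super);
}

static struct file_system_type myfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "myfs",
	.mount		= myfs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};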
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index da702294d7e..391915093fe 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
 	}
 
 	inode->i_mode = new_mode;
+	inode->i_ctime = CURRENT_TIME;
 	di->i_mode = cpu_to_le16(inode->i_mode);
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
@@ -290,12 +293,30 @@ static int ocfs2_set_acl(handle_t *handle,
 
 int ocfs2_check_acl(struct inode *inode, int mask)
 {
-	struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct posix_acl *acl;
+	int ret = -EAGAIN;
 
-	if (IS_ERR(acl))
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return ret;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
+
+	brelse(di_bh);
+
+	if (IS_ERR(acl)) {
+		mlog_errno(PTR_ERR(acl));
 		return PTR_ERR(acl);
+	}
 	if (acl) {
-		int ret = posix_acl_permission(inode, acl, mask);
+		ret = posix_acl_permission(inode, acl, mask);
 		posix_acl_release(acl);
 		return ret;
 	}
@@ -344,7 +365,7 @@ int ocfs2_init_acl(handle_t *handle,
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct posix_acl *acl = NULL;
-	int ret = 0;
+	int ret = 0, ret2;
 	mode_t mode;
 
 	if (!S_ISLNK(inode->i_mode)) {
@@ -381,7 +402,12 @@ int ocfs2_init_acl(handle_t *handle,
 		mode = inode->i_mode;
 		ret = posix_acl_create_masq(clone, &mode);
 		if (ret >= 0) {
-			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+			ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+			if (ret2) {
+				mlog_errno(ret2);
+				ret = ret2;
+				goto cleanup;
+			}
 			if (ret > 0) {
 				ret = ocfs2_set_acl(handle, inode,
 						    di_bh, ACL_TYPE_ACCESS,
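
The first hunk above updates ctime in the in-core inode and mirrors it into the on-disk dinode inside the same journaled update. A condensed sketch of that pairing, assuming ocfs2's little-endian dinode layout and a buffer the caller has already taken journal access on:

/* Sketch: keep in-core and on-disk ctime in step within one
 * journaled update; di points into a journal-accessed buffer. */
static void sketch_touch_ctime(struct inode *inode, struct ocfs2_dinode *di)
{
	inode->i_ctime = CURRENT_TIME;
	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
	/* caller then marks the buffer dirty, e.g. ocfs2_journal_dirty() */
}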
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 215e12ce1d8..592fae5007d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
 	last_page_bytes = PAGE_ALIGN(end);
 	index = start >> PAGE_CACHE_SHIFT;
 	do {
-		pages[numpages] = grab_cache_page(mapping, index);
+		pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
 		if (!pages[numpages]) {
 			ret = -ENOMEM;
 			mlog_errno(ret);
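
The switch from grab_cache_page() to find_or_create_page() lets the caller pass an explicit gfp mask: GFP_NOFS keeps the page allocation from recursing into filesystem reclaim, which matters when cluster or journal locks are held. A sketch of the idiom, with hypothetical names:

#include <linux/pagemap.h>

/* Sketch: pin or create a page cache page without __GFP_FS, so
 * memory reclaim cannot re-enter the filesystem under our locks. */
static struct page *sketch_grab_page_nofs(struct address_space *mapping,
					  pgoff_t index)
{
	return find_or_create_page(mapping, index, GFP_NOFS);
}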
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 356e976772b..f1e962cb3b7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
 	 * ocfs2 never allocates in this function - the only time we
 	 * need to use BH_New is when we're extending i_size on a file
 	 * system which doesn't support holes, in which case BH_New
-	 * allows block_prepare_write() to zero.
+	 * allows __block_write_begin() to zero.
 	 *
 	 * If we see this on a sparse file system, then a truncate has
 	 * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 	return ret;
 }
 
-/*
- * This is called from ocfs2_write_zero_page() which has handled it's
- * own cluster locking and has ensured allocation exists for those
- * blocks to be written.
- */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-			       unsigned from, unsigned to)
-{
-	int ret;
-
-	ret = block_prepare_write(page, from, to, ocfs2_get_block);
-
-	return ret;
-}
-
 /* Taken from ext3. We don't necessarily need the full blown
  * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
@@ -578,7 +563,9 @@ bail:
 static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     loff_t offset,
 			     ssize_t bytes,
-			     void *private)
+			     void *private,
+			     int ret,
+			     bool is_async)
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 	int level;
@@ -592,6 +579,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	if (!level)
 		up_read(&inode->i_alloc_sem);
 	ocfs2_rw_unlock(inode, level);
+
+	if (is_async)
+		aio_complete(iocb, ret, 0);
 }
 
 /*
@@ -638,11 +628,10 @@ static ssize_t ocfs2_direct_IO(int rw,
 	if (i_size_read(inode) <= offset)
 		return 0;
 
-	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
-					    inode->i_sb->s_bdev, iov, offset,
-					    nr_segs,
-					    ocfs2_direct_IO_get_blocks,
-					    ocfs2_dio_end_io);
+	ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+				   iov, offset, nr_segs,
+				   ocfs2_direct_IO_get_blocks,
+				   ocfs2_dio_end_io, NULL, 0);
 
 	mlog_exit(ret);
 	return ret;
@@ -728,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
 }
 
 /*
- * Some of this taken from block_prepare_write(). We already have our
+ * Some of this taken from __block_write_begin(). We already have our
 * mapping by now though, and the entire write will be allocating or
 * it won't, so not much need to use BH_New.
 *
@@ -879,8 +868,8 @@ struct ocfs2_write_ctxt {
 	 * out in so that future reads from that region will get
 	 * zero's.
 	 */
-	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
 	unsigned int			w_num_pages;
+	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
 	struct page			*w_target_page;
 
 	/*
@@ -1638,7 +1627,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
 	return ret;
 }
 
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page)
@@ -1688,7 +1678,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		mlog_errno(ret);
 		goto out;
 	} else if (ret == 1) {
-		ret = ocfs2_refcount_cow(inode, di_bh,
+		ret = ocfs2_refcount_cow(inode, filp, di_bh,
 					 wc->w_cpos, wc->w_clen, UINT_MAX);
 		if (ret) {
 			mlog_errno(ret);
@@ -1850,7 +1840,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
 				       fsdata, di_bh, NULL);
 	if (ret) {
 		mlog_errno(ret);
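
The dio hunks above track the direct-IO API change: the completion callback now receives the request status and an is_async flag, and the filesystem is expected to call aio_complete() itself, only after it has dropped the locks it took at submit time. A condensed sketch of that ordering, not the full ocfs2 implementation:

/* Sketch: dio completion mirroring the ocfs2_dio_end_io() ordering.
 * Locks taken at submit time are dropped before the iocb is completed. */
static void sketch_dio_end_io(struct kiocb *iocb, loff_t offset,
			      ssize_t bytes, void *private,
			      int ret, bool is_async)
{
	/* 1. release the fs-specific rw/alloc locks taken at submit */
	/* 2. only then signal async submitters */
	if (is_async)
		aio_complete(iocb, ret, 0);
}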
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc51..76bfdfda691 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
 
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-			       unsigned from, unsigned to);
-
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 				      struct page *page,
 				      unsigned from,
@@ -48,7 +45,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 			   loff_t pos, unsigned len, unsigned copied,
 			   struct page *page, void *fsdata);
 
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index ec6d1233959..c7ee03c2222 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 
 	ocfs2_blockcheck_inc_failure(stats);
 	mlog(ML_ERROR,
-	     "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
+	     "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
 
 	/* Ok, try ECC fixups */
@@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 		goto out;
 	}
 
-	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
 
 	rc = -EIO;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f92d5..52c7557f3e2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,10 +62,51 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
 
+/*
+ * In global heartbeat, we maintain a series of region bitmaps.
+ *	- o2hb_region_bitmap allows us to limit the region number to max region.
+ *	- o2hb_live_region_bitmap tracks live regions (seen steady iterations).
+ *	- o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
+ *	  heartbeat on it.
+ *	- o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
+ */
+static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+
+#define O2HB_DB_TYPE_LIVENODES		0
+#define O2HB_DB_TYPE_LIVEREGIONS	1
+#define O2HB_DB_TYPE_QUORUMREGIONS	2
+#define O2HB_DB_TYPE_FAILEDREGIONS	3
+#define O2HB_DB_TYPE_REGION_LIVENODES	4
+#define O2HB_DB_TYPE_REGION_NUMBER	5
+#define O2HB_DB_TYPE_REGION_ELAPSED_TIME	6
+struct o2hb_debug_buf {
+	int db_type;
+	int db_size;
+	int db_len;
+	void *db_data;
+};
+
+static struct o2hb_debug_buf *o2hb_db_livenodes;
+static struct o2hb_debug_buf *o2hb_db_liveregions;
+static struct o2hb_debug_buf *o2hb_db_quorumregions;
+static struct o2hb_debug_buf *o2hb_db_failedregions;
+
 #define O2HB_DEBUG_DIR			"o2hb"
 #define O2HB_DEBUG_LIVENODES		"livenodes"
+#define O2HB_DEBUG_LIVEREGIONS		"live_regions"
+#define O2HB_DEBUG_QUORUMREGIONS	"quorum_regions"
+#define O2HB_DEBUG_FAILEDREGIONS	"failed_regions"
+#define O2HB_DEBUG_REGION_NUMBER	"num"
+#define O2HB_DEBUG_REGION_ELAPSED_TIME	"elapsed_time_in_ms"
+
 static struct dentry *o2hb_debug_dir;
 static struct dentry *o2hb_debug_livenodes;
+static struct dentry *o2hb_debug_liveregions;
+static struct dentry *o2hb_debug_quorumregions;
+static struct dentry *o2hb_debug_failedregions;
 
 static LIST_HEAD(o2hb_all_regions);
 
@@ -77,7 +118,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
 
 #define O2HB_DEFAULT_BLOCK_BITS       9
 
+enum o2hb_heartbeat_modes {
+	O2HB_HEARTBEAT_LOCAL		= 0,
+	O2HB_HEARTBEAT_GLOBAL,
+	O2HB_HEARTBEAT_NUM_MODES,
+};
+
+char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+		"local",	/* O2HB_HEARTBEAT_LOCAL */
+		"global",	/* O2HB_HEARTBEAT_GLOBAL */
+};
+
 unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
 
 /* Only sets a new threshold if there are no active regions.
  *
@@ -94,6 +147,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
 	}
 }
 
+static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+{
+	int ret = -1;
+
+	if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
+		spin_lock(&o2hb_live_lock);
+		if (list_empty(&o2hb_all_regions)) {
+			o2hb_heartbeat_mode = hb_mode;
+			ret = 0;
+		}
+		spin_unlock(&o2hb_live_lock);
+	}
+
+	return ret;
+}
+
 struct o2hb_node_event {
 	struct list_head        hn_item;
 	enum o2hb_callback_type hn_event_type;
@@ -135,6 +204,18 @@ struct o2hb_region {
 	struct block_device	*hr_bdev;
 	struct o2hb_disk_slot	*hr_slots;
 
+	/* live node map of this region */
+	unsigned long		hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned int		hr_region_num;
+
+	struct dentry		*hr_debug_dir;
+	struct dentry		*hr_debug_livenodes;
+	struct dentry		*hr_debug_regnum;
+	struct dentry		*hr_debug_elapsed_time;
+	struct o2hb_debug_buf	*hr_db_livenodes;
+	struct o2hb_debug_buf	*hr_db_regnum;
+	struct o2hb_debug_buf	*hr_db_elapsed_time;
+
 	/* let the person setting up hb wait for it to return until it
 	 * has reached a 'steady' state. This will be fixed when we have
 	 * a more complete api that doesn't lead to this sort of fragility. */
@@ -163,8 +244,19 @@ struct o2hb_bio_wait_ctxt {
 	int               wc_error;
 };
 
+static int o2hb_pop_count(void *map, int count)
+{
+	int i = -1, pop = 0;
+
+	while ((i = find_next_bit(map, count, i + 1)) < count)
+		pop++;
+	return pop;
+}
+
 static void o2hb_write_timeout(struct work_struct *work)
 {
+	int failed, quorum;
+	unsigned long flags;
 	struct o2hb_region *reg =
 		container_of(work, struct o2hb_region,
 			     hr_write_timeout_work.work);
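
The o2hb_pop_count() helper added above open-codes a population count with find_next_bit(); the generic bitmap API very likely provides the same result via bitmap_weight(), as in this sketch:

#include <linux/bitmap.h>

/* Sketch: same result as o2hb_pop_count(), via the generic helper. */
static inline int sketch_pop_count(const unsigned long *map, int bits)
{
	return bitmap_weight(map, bits);
}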
@@ -172,6 +264,28 @@ static void o2hb_write_timeout(struct work_struct *work)
 	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
 	     "milliseconds\n", reg->hr_dev_name,
 	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock_irqsave(&o2hb_live_lock, flags);
+		if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+			set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+		failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+					O2NM_MAX_REGIONS);
+		quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+					O2NM_MAX_REGIONS);
+		spin_unlock_irqrestore(&o2hb_live_lock, flags);
+
+		mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
+		     quorum, failed);
+
+		/*
+		 * Fence if the number of failed regions >= half the number
+		 * of quorum regions
+		 */
+		if ((failed << 1) < quorum)
+			return;
+	}
+
 	o2quo_disk_timeout();
 }
 
@@ -180,6 +294,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 	mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
 	     O2HB_MAX_WRITE_TIMEOUT_MS);
 
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock(&o2hb_live_lock);
+		clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+		spin_unlock(&o2hb_live_lock);
+	}
 	cancel_delayed_work(&reg->hr_write_timeout_work);
 	reg->hr_last_timeout_start = jiffies;
 	schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -513,6 +632,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
 {
 	assert_spin_locked(&o2hb_live_lock);
 
+	BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
+
 	event->hn_event_type = type;
 	event->hn_node = node;
 	event->hn_node_num = node_num;
@@ -554,6 +675,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
 	o2nm_node_put(node);
 }
 
+static void o2hb_set_quorum_device(struct o2hb_region *reg,
+				   struct o2hb_disk_slot *slot)
+{
+	assert_spin_locked(&o2hb_live_lock);
+
+	if (!o2hb_global_heartbeat_active())
+		return;
+
+	if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+		return;
+
+	/*
+	 * A region can be added to the quorum only when it sees all
+	 * live nodes heartbeat on it. In other words, the region has been
+	 * added to all nodes.
+	 */
+	if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+		   sizeof(o2hb_live_node_bitmap)))
+		return;
+
+	if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
+		return;
+
+	printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
+	       config_item_name(&reg->hr_item));
+
+	set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+}
+
 static int o2hb_check_slot(struct o2hb_region *reg,
 			   struct o2hb_disk_slot *slot)
 {
@@ -565,14 +715,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
 	u64 cputime;
 	unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
 	unsigned int slot_dead_ms;
+	int tmp;
 
 	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
 
-	/* Is this correct? Do we assume that the node doesn't exist
-	 * if we're not configured for him? */
+	/*
+	 * If a node is no longer configured but is still in the livemap, we
+	 * may need to clear that bit from the livemap.
+	 */
 	node = o2nm_get_node_by_num(slot->ds_node_num);
-	if (!node)
-		return 0;
+	if (!node) {
+		spin_lock(&o2hb_live_lock);
+		tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+		spin_unlock(&o2hb_live_lock);
+		if (!tmp)
+			return 0;
+	}
 
 	if (!o2hb_verify_crc(reg, hb_block)) {
 		/* all paths from here will drop o2hb_live_lock for
@@ -639,8 +797,12 @@ fire_callbacks:
 		mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
 		     slot->ds_node_num, (long long)slot->ds_last_generation);
 
+		set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
 		/* first on the list generates a callback */
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
+			     "bitmap\n", slot->ds_node_num);
 			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
 
 			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -684,13 +846,18 @@ fire_callbacks:
 		mlog(ML_HEARTBEAT, "Node %d left my region\n",
 		     slot->ds_node_num);
 
+		clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
 		/* last off the live_slot generates a callback */
 		list_del_init(&slot->ds_live_item);
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
+			     "nodes bitmap\n", slot->ds_node_num);
 			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
 
-			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
-					      slot->ds_node_num);
+			/* node can be null */
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+					      node, slot->ds_node_num);
 
 			changed = 1;
 		}
@@ -706,11 +873,14 @@ fire_callbacks:
 		slot->ds_equal_samples = 0;
 	}
 out:
+	o2hb_set_quorum_device(reg, slot);
+
 	spin_unlock(&o2hb_live_lock);
 
 	o2hb_run_event_list(&event);
 
-	o2nm_node_put(node);
+	if (node)
+		o2nm_node_put(node);
 	return changed;
 }
 
@@ -737,6 +907,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 {
 	int i, ret, highest_node, change = 0;
 	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct o2hb_bio_wait_ctxt write_wc;
 
 	ret = o2nm_configured_node_map(configured_nodes,
@@ -746,6 +917,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 		return ret;
 	}
 
+	/*
+	 * If a node is not configured but is in the livemap, we still need
+	 * to read the slot so as to be able to remove it from the livemap.
+	 */
+	o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+	i = -1;
+	while ((i = find_next_bit(live_node_bitmap,
+				  O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+		set_bit(i, configured_nodes);
+	}
+
 	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
 	if (highest_node >= O2NM_MAX_NODES) {
 		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
@@ -917,21 +1099,59 @@ static int o2hb_thread(void *data)
 #ifdef CONFIG_DEBUG_FS
 static int o2hb_debug_open(struct inode *inode, struct file *file)
 {
+	struct o2hb_debug_buf *db = inode->i_private;
+	struct o2hb_region *reg;
 	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	char *buf = NULL;
 	int i = -1;
 	int out = 0;
 
+	/* max_nodes should be the largest bitmap we pass here */
+	BUG_ON(sizeof(map) < db->db_size);
+
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!buf)
 		goto bail;
 
-	o2hb_fill_node_map(map, sizeof(map));
+	switch (db->db_type) {
+	case O2HB_DB_TYPE_LIVENODES:
+	case O2HB_DB_TYPE_LIVEREGIONS:
+	case O2HB_DB_TYPE_QUORUMREGIONS:
+	case O2HB_DB_TYPE_FAILEDREGIONS:
+		spin_lock(&o2hb_live_lock);
+		memcpy(map, db->db_data, db->db_size);
+		spin_unlock(&o2hb_live_lock);
+		break;
+
+	case O2HB_DB_TYPE_REGION_LIVENODES:
+		spin_lock(&o2hb_live_lock);
+		reg = (struct o2hb_region *)db->db_data;
+		memcpy(map, reg->hr_live_node_bitmap, db->db_size);
+		spin_unlock(&o2hb_live_lock);
+		break;
+
+	case O2HB_DB_TYPE_REGION_NUMBER:
+		reg = (struct o2hb_region *)db->db_data;
+		out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+				reg->hr_region_num);
+		goto done;
+
+	case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
+		reg = (struct o2hb_region *)db->db_data;
+		out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+				jiffies_to_msecs(jiffies -
+						 reg->hr_last_timeout_start));
+		goto done;
+
+	default:
+		goto done;
+	}
 
-	while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+	while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
 		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
 	out += snprintf(buf + out, PAGE_SIZE - out, "\n");
 
+done:
 	i_size_write(inode, out);
 
 	file->private_data = buf;
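
The reworked o2hb_debug_open() formats everything once at open time: it snapshots the requested bitmap under o2hb_live_lock, renders it into a page-sized buffer, and publishes the length with i_size_write() so read() and llseek() see a stable file. A minimal sketch of that debugfs pattern, with hypothetical lock and counter names:

static DEFINE_SPINLOCK(sketch_lock);	/* hypothetical */
static unsigned long sketch_counter;	/* hypothetical */

/* Sketch: render a locked snapshot at open(); reads serve the buffer. */
static int sketch_debug_open(struct inode *inode, struct file *file)
{
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	int out;

	if (!buf)
		return -ENOMEM;

	spin_lock(&sketch_lock);
	out = snprintf(buf, PAGE_SIZE, "%lu\n", sketch_counter);
	spin_unlock(&sketch_lock);

	i_size_write(inode, out);	/* length visible to read()/llseek() */
	file->private_data = buf;	/* freed in the ->release() hook */
	return 0;
}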
@@ -978,10 +1198,104 @@ static const struct file_operations o2hb_debug_fops = {
 
 void o2hb_exit(void)
 {
-	if (o2hb_debug_livenodes)
-		debugfs_remove(o2hb_debug_livenodes);
-	if (o2hb_debug_dir)
-		debugfs_remove(o2hb_debug_dir);
+	kfree(o2hb_db_livenodes);
+	kfree(o2hb_db_liveregions);
+	kfree(o2hb_db_quorumregions);
+	kfree(o2hb_db_failedregions);
+	debugfs_remove(o2hb_debug_failedregions);
+	debugfs_remove(o2hb_debug_quorumregions);
+	debugfs_remove(o2hb_debug_liveregions);
+	debugfs_remove(o2hb_debug_livenodes);
+	debugfs_remove(o2hb_debug_dir);
+}
+
+static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
+					struct o2hb_debug_buf **db, int db_len,
+					int type, int size, int len, void *data)
+{
+	*db = kmalloc(db_len, GFP_KERNEL);
+	if (!*db)
+		return NULL;
+
+	(*db)->db_type = type;
+	(*db)->db_size = size;
+	(*db)->db_len = len;
+	(*db)->db_data = data;
+
+	return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
+				   &o2hb_debug_fops);
+}
+
+static int o2hb_debug_init(void)
+{
+	int ret = -ENOMEM;
+
+	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+	if (!o2hb_debug_dir) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+						 o2hb_debug_dir,
+						 &o2hb_db_livenodes,
+						 sizeof(*o2hb_db_livenodes),
+						 O2HB_DB_TYPE_LIVENODES,
+						 sizeof(o2hb_live_node_bitmap),
+						 O2NM_MAX_NODES,
+						 o2hb_live_node_bitmap);
+	if (!o2hb_debug_livenodes) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
+						   o2hb_debug_dir,
+						   &o2hb_db_liveregions,
+						   sizeof(*o2hb_db_liveregions),
+						   O2HB_DB_TYPE_LIVEREGIONS,
+						   sizeof(o2hb_live_region_bitmap),
+						   O2NM_MAX_REGIONS,
+						   o2hb_live_region_bitmap);
+	if (!o2hb_debug_liveregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_quorumregions =
+			o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
+					  o2hb_debug_dir,
+					  &o2hb_db_quorumregions,
+					  sizeof(*o2hb_db_quorumregions),
+					  O2HB_DB_TYPE_QUORUMREGIONS,
+					  sizeof(o2hb_quorum_region_bitmap),
+					  O2NM_MAX_REGIONS,
+					  o2hb_quorum_region_bitmap);
+	if (!o2hb_debug_quorumregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_failedregions =
+			o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
+					  o2hb_debug_dir,
+					  &o2hb_db_failedregions,
+					  sizeof(*o2hb_db_failedregions),
+					  O2HB_DB_TYPE_FAILEDREGIONS,
+					  sizeof(o2hb_failed_region_bitmap),
+					  O2NM_MAX_REGIONS,
+					  o2hb_failed_region_bitmap);
+	if (!o2hb_debug_failedregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	if (ret)
+		o2hb_exit();
+
+	return ret;
 }
 
 int o2hb_init(void)
@@ -997,24 +1311,12 @@ int o2hb_init(void)
 	INIT_LIST_HEAD(&o2hb_node_events);
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+	memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+	memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+	memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+	memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
 
-	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
-	if (!o2hb_debug_dir) {
-		mlog_errno(-ENOMEM);
-		return -ENOMEM;
-	}
-
-	o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
-						   S_IFREG|S_IRUSR,
-						   o2hb_debug_dir, NULL,
-						   &o2hb_debug_fops);
-	if (!o2hb_debug_livenodes) {
-		mlog_errno(-ENOMEM);
-		debugfs_remove(o2hb_debug_dir);
-		return -ENOMEM;
-	}
-
-	return 0;
+	return o2hb_debug_init();
 }
 
 /* if we're already in a callback then we're already serialized by the sem */
@@ -1078,6 +1380,13 @@ static void o2hb_region_release(struct config_item *item)
 	if (reg->hr_slots)
 		kfree(reg->hr_slots);
 
+	kfree(reg->hr_db_regnum);
+	kfree(reg->hr_db_livenodes);
+	debugfs_remove(reg->hr_debug_livenodes);
+	debugfs_remove(reg->hr_debug_regnum);
+	debugfs_remove(reg->hr_debug_elapsed_time);
+	debugfs_remove(reg->hr_debug_dir);
+
 	spin_lock(&o2hb_live_lock);
 	list_del(&reg->hr_all_item);
 	spin_unlock(&o2hb_live_lock);
@@ -1441,6 +1750,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	/* Ok, we were woken. Make sure it wasn't by drop_item() */
 	spin_lock(&o2hb_live_lock);
 	hb_task = reg->hr_task;
+	if (o2hb_global_heartbeat_active())
+		set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
 	spin_unlock(&o2hb_live_lock);
 
 	if (hb_task)
@@ -1448,6 +1759,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	else
 		ret = -EIO;
 
+	if (hb_task && o2hb_global_heartbeat_active())
+		printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
+		       config_item_name(&reg->hr_item));
+
 out:
 	if (filp)
 		fput(filp);
@@ -1586,21 +1901,94 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
 		: NULL;
 }
 
+static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+{
+	int ret = -ENOMEM;
+
+	reg->hr_debug_dir =
+		debugfs_create_dir(config_item_name(&reg->hr_item), dir);
+	if (!reg->hr_debug_dir) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_livenodes =
+			o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_livenodes),
+					  sizeof(*(reg->hr_db_livenodes)),
+					  O2HB_DB_TYPE_REGION_LIVENODES,
+					  sizeof(reg->hr_live_node_bitmap),
+					  O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_livenodes) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_regnum =
+			o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_regnum),
+					  sizeof(*(reg->hr_db_regnum)),
+					  O2HB_DB_TYPE_REGION_NUMBER,
+					  0, O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_regnum) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_elapsed_time =
+			o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_elapsed_time),
+					  sizeof(*(reg->hr_db_elapsed_time)),
+					  O2HB_DB_TYPE_REGION_ELAPSED_TIME,
+					  0, 0, reg);
+	if (!reg->hr_debug_elapsed_time) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	return ret;
+}
+
 static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
 							  const char *name)
 {
 	struct o2hb_region *reg = NULL;
+	int ret;
 
 	reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
 	if (reg == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+	if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
 
 	spin_lock(&o2hb_live_lock);
+	reg->hr_region_num = 0;
+	if (o2hb_global_heartbeat_active()) {
+		reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
+							 O2NM_MAX_REGIONS);
+		if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
+			spin_unlock(&o2hb_live_lock);
+			return ERR_PTR(-EFBIG);
+		}
+		set_bit(reg->hr_region_num, o2hb_region_bitmap);
+	}
 	list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
 	spin_unlock(&o2hb_live_lock);
 
+	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
+	if (ret) {
+		config_item_put(&reg->hr_item);
+		return ERR_PTR(ret);
+	}
+
 	return &reg->hr_item;
 }
 
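With global heartbeat active, each new region claims the lowest free slot in o2hb_region_bitmap under the live lock; find_first_zero_bit() plus set_bit() is the usual fixed-size id allocator. A self-contained sketch of that allocator, with a hypothetical constant mirroring O2NM_MAX_REGIONS:

#include <linux/bitops.h>

#define SKETCH_MAX_IDS 32	/* hypothetical, mirrors O2NM_MAX_REGIONS */
static unsigned long sketch_id_map[BITS_TO_LONGS(SKETCH_MAX_IDS)];

/* Sketch: allocate the lowest free id, -1 when full. The caller
 * serializes with its own lock, as the hunk does with o2hb_live_lock. */
static int sketch_alloc_id(void)
{
	int id = find_first_zero_bit(sketch_id_map, SKETCH_MAX_IDS);

	if (id >= SKETCH_MAX_IDS)
		return -1;
	set_bit(id, sketch_id_map);
	return id;
}
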
@@ -1612,6 +2000,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 
 	/* stop the thread when the user removes the region dir */
 	spin_lock(&o2hb_live_lock);
+	if (o2hb_global_heartbeat_active()) {
+		clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+		clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+	}
 	hb_task = reg->hr_task;
 	reg->hr_task = NULL;
 	spin_unlock(&o2hb_live_lock);
@@ -1628,6 +2020,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 		wake_up(&o2hb_steady_queue);
 	}
 
+	if (o2hb_global_heartbeat_active())
+		printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
+		       config_item_name(&reg->hr_item));
 	config_item_put(item);
 }
 
@@ -1688,6 +2083,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
 	return count;
 }
 
+static
+ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
+				       char *page)
+{
+	return sprintf(page, "%s\n",
+		       o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
+}
+
+static
+ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
+					const char *page, size_t count)
+{
+	unsigned int i;
+	int ret;
+	size_t len;
+
+	len = (page[count - 1] == '\n') ? count - 1 : count;
+	if (!len)
+		return -EINVAL;
+
+	for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
+		if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
+			continue;
+
+		ret = o2hb_global_hearbeat_mode_set(i);
+		if (!ret)
+			printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
+			       o2hb_heartbeat_mode_desc[i]);
+		return count;
+	}
+
+	return -EINVAL;
+
+}
+
 static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
 	.attr	= { .ca_owner = THIS_MODULE,
 		    .ca_name = "dead_threshold",
@@ -1696,8 +2126,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
 	.store	= o2hb_heartbeat_group_threshold_store,
 };
 
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "mode",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_heartbeat_group_mode_show,
+	.store	= o2hb_heartbeat_group_mode_store,
+};
+
 static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
 	&o2hb_heartbeat_group_attr_threshold.attr,
+	&o2hb_heartbeat_group_attr_mode.attr,
 	NULL,
 };
 
@@ -1963,3 +2402,34 @@ void o2hb_stop_all_regions(void)
 	spin_unlock(&o2hb_live_lock);
 }
 EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
+
+int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
+{
+	struct o2hb_region *reg;
+	int numregs = 0;
+	char *p;
+
+	spin_lock(&o2hb_live_lock);
+
+	p = region_uuids;
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
+		if (numregs < max_regions) {
+			memcpy(p, config_item_name(&reg->hr_item),
+			       O2HB_MAX_REGION_NAME_LEN);
+			p += O2HB_MAX_REGION_NAME_LEN;
+		}
+		numregs++;
+	}
+
+	spin_unlock(&o2hb_live_lock);
+
+	return numregs;
+}
+EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
+
+int o2hb_global_heartbeat_active(void)
+{
+	return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
+}
+EXPORT_SYMBOL(o2hb_global_heartbeat_active);
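
o2hb_get_all_regions() copies at most max_regions fixed-width names but returns the total region count, so a caller can detect truncation by comparing the return value against the capacity it offered; the names are packed at O2HB_MAX_REGION_NAME_LEN bytes each and, at the maximum length, are not NUL-terminated. A hypothetical caller sketch:

/* Sketch: hypothetical caller; the buffer must hold max_regions
 * entries of O2HB_MAX_REGION_NAME_LEN bytes each. */
static void sketch_dump_regions(void)
{
	char names[O2NM_MAX_REGIONS * O2HB_MAX_REGION_NAME_LEN];
	int n = o2hb_get_all_regions(names, O2NM_MAX_REGIONS);

	if (n > O2NM_MAX_REGIONS)
		printk(KERN_INFO "o2hb: %d regions, only %d copied\n",
		       n, O2NM_MAX_REGIONS);
}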
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b4..00ad8e8fea5 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
 
 #define O2HB_REGION_TIMEOUT_MS		2000
 
+#define O2HB_MAX_REGION_NAME_LEN	32
+
 /* number of changes to be seen as live */
 #define O2HB_LIVE_THRESHOLD	   2
 /* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
 void o2hb_stop_all_regions(void);
+int o2hb_get_all_regions(char *region_uuids, u8 numregions);
+int o2hb_global_heartbeat_active(void);
 
 #endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a2fa5..ea2ed9f56c9 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,7 +119,8 @@
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
 #define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */
 #define ML_RESERVATIONS	0x0000000800000000ULL /* ocfs2 alloc reservations */
+#define ML_CLUSTER	0x0000001000000000ULL /* cluster stack */
 
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f367fe..bb240647ca5 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
 	config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
 	spin_lock_init(&node->nd_lock);
 
+	mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
+
 	return &node->nd_item;
 }
 
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
 	}
 	write_unlock(&cluster->cl_nodes_lock);
 
+	mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
+	     config_item_name(&node->nd_item));
+
 	config_item_put(item);
 }
 
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad57..49b594325be 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
 /* host name, group name, cluster name all 64 bytes */
 #define O2NM_MAX_NAME_LEN        64    // __NEW_UTS_LEN
 
+/*
+ * Maximum number of global heartbeat regions allowed.
+ * **CAUTION** Changing this number will break dlm compatibility.
+ */
+#define O2NM_MAX_REGIONS	32
+
 #endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa75ca3f78d..9aa426e4212 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 			   size_t caller_veclen, u8 target_node, int *status)
 {
-	int ret;
+	int ret = 0;
 	struct o2net_msg *msg = NULL;
 	size_t veclen, caller_bytes = 0;
 	struct kvec *vec = NULL;
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
 {
 	o2quo_hb_down(node_num);
 
+	if (!node)
+		return;
+
 	if (node_num != o2nm_this_node())
 		o2net_disconnect_node(node);
 
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 
 	o2quo_hb_up(node_num);
 
+	BUG_ON(!node);
+
 	/* ensure an immediate connect attempt */
 	nn->nn_last_connect_attempt = jiffies -
 		(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
@@ -1759,6 +1764,7 @@ static int o2net_accept_one(struct socket *sock)
 	struct sockaddr_in sin;
 	struct socket *new_sock = NULL;
 	struct o2nm_node *node = NULL;
+	struct o2nm_node *local_node = NULL;
 	struct o2net_sock_container *sc = NULL;
 	struct o2net_node *nn;
 
@@ -1796,11 +1802,15 @@ static int o2net_accept_one(struct socket *sock)
 		goto out;
 	}
 
-	if (o2nm_this_node() > node->nd_num) {
-		mlog(ML_NOTICE, "unexpected connect attempted from a lower "
-		     "numbered node '%s' at " "%pI4:%d with num %u\n",
-		     node->nd_name, &sin.sin_addr.s_addr,
-		     ntohs(sin.sin_port), node->nd_num);
+	if (o2nm_this_node() >= node->nd_num) {
+		local_node = o2nm_get_node_by_num(o2nm_this_node());
+		mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
+		     "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
+		     local_node->nd_name, local_node->nd_num,
+		     &(local_node->nd_ipv4_address),
+		     ntohs(local_node->nd_ipv4_port),
+		     node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
+		     ntohs(sin.sin_port));
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1857,6 +1867,8 @@ out:
 		sock_release(new_sock);
 	if (node)
 		o2nm_node_put(node);
+	if (local_node)
+		o2nm_node_put(local_node);
 	if (sc)
 		sc_put(sc);
 	return ret;
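
The accept-path change above tightens o2net's connect-direction rule: an incoming connection is now accepted only from a strictly higher-numbered node, so a peer claiming an equal number (presumably a misconfigured duplicate) is also rejected, and the log names both endpoints. The invariant, reduced to a sketch:

/* Sketch: the direction rule o2net_accept_one() enforces - only a
 * strictly higher-numbered peer may initiate the TCP connection. */
static bool sketch_accept_allowed(u8 local_num, u8 remote_num)
{
	return remote_num > local_num;
}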
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 96fa7ebc530..15fdbdf9eb4 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -129,7 +129,7 @@ struct o2net_node {
 
 struct o2net_sock_container {
 	struct kref		sc_kref;
-	/* the next two are vaild for the life time of the sc */
+	/* the next two are valid for the life time of the sc */
 	struct socket		*sc_sock;
 	struct o2nm_node	*sc_node;
 
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe..edaded48e7e 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,6 +40,14 @@
 #include "inode.h"
 #include "super.h"
 
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+	unsigned long gen =
+		OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+	BUG_ON(dentry->d_inode);
+	dentry->d_fsdata = (void *)gen;
+}
+
 
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
 				   struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 		   dentry->d_name.len, dentry->d_name.name);
 
-	/* Never trust a negative dentry - force a new lookup. */
+	/* For a negative dentry -
+	 * check the generation number of the parent and compare with the
+	 * one stored in the inode.
+	 */
 	if (inode == NULL) {
-		mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
-		     dentry->d_name.name);
-		goto bail;
+		unsigned long gen = (unsigned long) dentry->d_fsdata;
+		unsigned long pgen =
+			OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+		mlog(0, "negative dentry: %.*s parent gen: %lu "
+		     "dentry gen: %lu\n",
+		     dentry->d_name.len, dentry->d_name.name, pgen, gen);
+		if (gen != pgen)
+			goto bail;
+		goto valid;
 	}
 
 	BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 		goto bail;
 	}
 
+valid:
 	ret = 1;
 
 bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
 	if (!inode)
 		return 0;
 
+	if (!dentry->d_inode && dentry->d_fsdata) {
+		/* Converting a negative dentry to positive
+		   Clear dentry->d_fsdata */
+		dentry->d_fsdata = dl = NULL;
+	}
+
 	if (dl) {
 		mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
 				" \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
 
 out:
 	iput(inode);
+	ocfs2_dentry_attach_gen(dentry);
 }
 
 /*
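
Rather than unconditionally forcing a lookup for negative dentries, the dcache.c hunks stamp each negative dentry with the parent directory's lock generation in d_fsdata; revalidation stays cheap while the parent has not been modified cluster-wide. The comparison, reduced to a sketch:

/* Sketch: a negative dentry remains trustworthy only while the
 * parent's directory-lock generation matches the one stored in
 * d_fsdata when the dentry went (or was created) negative. */
static int sketch_negative_dentry_valid(struct dentry *dentry)
{
	unsigned long gen = (unsigned long)dentry->d_fsdata;
	unsigned long pgen =
		OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;

	return gen == pgen;	/* mismatch => force a fresh lookup */
}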
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf..b79eff70995 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
 		       struct inode *old_dir, struct inode *new_dir);
 
 extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
 
 #endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f04ebcfffc4..c49f6de0e7a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 		goto out_commit;
 	}
 
+	cpos = split_hash;
+	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+				       data_ac, meta_ac, new_dx_leaves,
+				       num_dx_leaves);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
 	for (i = 0; i < num_dx_leaves; i++) {
 		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
 					      orig_dx_leaves[i],
@@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 			mlog_errno(ret);
 			goto out_commit;
 		}
-	}
 
-	cpos = split_hash;
-	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
-				       data_ac, meta_ac, new_dx_leaves,
-				       num_dx_leaves);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
+		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+					      new_dx_leaves[i],
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
 	}
 
 	ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4b6ae2c13b4..b36d0bf77a5 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -445,7 +445,9 @@ enum {
 	DLM_LOCK_REQUEST_MSG,	 /* 515 */
 	DLM_RECO_DATA_DONE_MSG,	 /* 516 */
 	DLM_BEGIN_RECO_MSG,	 /* 517 */
-	DLM_FINALIZE_RECO_MSG	 /* 518 */
+	DLM_FINALIZE_RECO_MSG,	 /* 518 */
+	DLM_QUERY_REGION,	 /* 519 */
+	DLM_QUERY_NODEINFO,	 /* 520 */
 };
 
 struct dlm_reco_node_data
@@ -727,6 +729,31 @@ struct dlm_cancel_join
 	u8 domain[O2NM_MAX_NAME_LEN];
 };
 
+struct dlm_query_region {
+	u8 qr_node;
+	u8 qr_numregions;
+	u8 qr_namelen;
+	u8 pad1;
+	u8 qr_domain[O2NM_MAX_NAME_LEN];
+	u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
+};
+
+struct dlm_node_info {
+	u8 ni_nodenum;
+	u8 pad1;
+	u16 ni_ipv4_port;
+	u32 ni_ipv4_address;
+};
+
+struct dlm_query_nodeinfo {
+	u8 qn_nodenum;
+	u8 qn_numnodes;
+	u8 qn_namelen;
+	u8 pad1;
+	u8 qn_domain[O2NM_MAX_NAME_LEN];
+	struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
+};
+
 struct dlm_exit_domain
 {
 	u8 node_idx;
@@ -1030,6 +1057,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
 			 struct dlm_lock_resource *res);
 void dlm_clean_master_list(struct dlm_ctxt *dlm,
 			   u8 dead_node);
+void dlm_force_free_mles(struct dlm_ctxt *dlm);
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
 int __dlm_lockres_unused(struct dlm_lock_resource *res);
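
dlm_query_region and dlm_query_nodeinfo are wire formats, which appears to be why the fields carry explicit pad bytes: every node must compute the same struct layout, and qr_regions is sized by O2NM_MAX_REGIONS (hence the compatibility warning attached to that constant). A sketch of the padding discipline, with hypothetical field names:

/* Sketch: explicit padding keeps a wire struct's layout identical on
 * every node, with the u16/u32 fields naturally aligned. */
struct sketch_wire_msg {
	u8	node;
	u8	count;
	u8	namelen;
	u8	pad1;		/* align the following u16 */
	u16	port;		/* big-endian on the wire */
	u32	ipv4_address;	/* big-endian on the wire */
};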
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 0cd24cf5439..272ec8631a5 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -419,7 +419,7 @@ static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
419 419
420static int debug_buffer_release(struct inode *inode, struct file *file) 420static int debug_buffer_release(struct inode *inode, struct file *file)
421{ 421{
422 struct debug_buffer *db = (struct debug_buffer *)file->private_data; 422 struct debug_buffer *db = file->private_data;
423 423
424 if (db) 424 if (db)
425 kfree(db->buf); 425 kfree(db->buf);
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
493 struct hlist_head *bucket; 493 struct hlist_head *bucket;
494 struct hlist_node *list; 494 struct hlist_node *list;
495 int i, out = 0; 495 int i, out = 0;
496 unsigned long total = 0, longest = 0, bktcnt; 496 unsigned long total = 0, longest = 0, bucket_count = 0;
497 497
498 out += snprintf(db->buf + out, db->len - out, 498 out += snprintf(db->buf + out, db->len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 499 "Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
505 mle = hlist_entry(list, struct dlm_master_list_entry, 505 mle = hlist_entry(list, struct dlm_master_list_entry,
506 master_hash_node); 506 master_hash_node);
507 ++total; 507 ++total;
508 ++bktcnt; 508 ++bucket_count;
509 if (db->len - out < 200) 509 if (db->len - out < 200)
510 continue; 510 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 511 out += dump_mle(mle, db->buf + out, db->len - out);
512 } 512 }
513 longest = max(longest, bktcnt); 513 longest = max(longest, bucket_count);
514 bktcnt = 0; 514 bucket_count = 0;
515 } 515 }
516 spin_unlock(&dlm->master_lock); 516 spin_unlock(&dlm->master_lock);
517 517
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
636 spin_lock(&dlm->track_lock); 636 spin_lock(&dlm->track_lock);
637 if (oldres) 637 if (oldres)
638 track_list = &oldres->tracking; 638 track_list = &oldres->tracking;
639 else 639 else {
640 track_list = &dlm->tracking_list; 640 track_list = &dlm->tracking_list;
641 if (list_empty(track_list)) {
642 dl = NULL;
643 spin_unlock(&dlm->track_lock);
644 goto bail;
645 }
646 }
641 647
642 list_for_each_entry(res, track_list, tracking) { 648 list_for_each_entry(res, track_list, tracking) {
643 if (&res->tracking == &dlm->tracking_list) 649 if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
660 } else 666 } else
661 dl = NULL; 667 dl = NULL;
662 668
669bail:
663 /* passed to seq_show */ 670 /* passed to seq_show */
664 return dl; 671 return dl;
665} 672}
@@ -715,7 +722,7 @@ static int debug_lockres_open(struct inode *inode, struct file *file)
715 goto bail; 722 goto bail;
716 } 723 }
717 724
718 seq = (struct seq_file *) file->private_data; 725 seq = file->private_data;
719 seq->private = dl; 726 seq->private = dl;
720 727
721 dlm_grab(dlm); 728 dlm_grab(dlm);
@@ -731,7 +738,7 @@ bail:
731 738
732static int debug_lockres_release(struct inode *inode, struct file *file) 739static int debug_lockres_release(struct inode *inode, struct file *file)
733{ 740{
734 struct seq_file *seq = (struct seq_file *)file->private_data; 741 struct seq_file *seq = file->private_data;
735 struct debug_lockres *dl = (struct debug_lockres *)seq->private; 742 struct debug_lockres *dl = (struct debug_lockres *)seq->private;
736 743
737 if (dl->dl_res) 744 if (dl->dl_res)
@@ -775,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
775 782
776 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
777 out += snprintf(db->buf + out, db->len - out, 784 out += snprintf(db->buf + out, db->len - out,
778 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); 785 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
786 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
787 dlm->dlm_locking_proto.pv_minor);
779 788
780 /* Thread Pid: xxx Node: xxx State: xxxxx */ 789 /* Thread Pid: xxx Node: xxx State: xxxxx */
781 out += snprintf(db->buf + out, db->len - out, 790 out += snprintf(db->buf + out, db->len - out,
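
The debugfs dumpers above all build their output with the
"out += snprintf(buf + out, len - out, ...)" accumulation idiom, skipping
per-entry detail once headroom drops below a threshold (200 bytes in the
kernel code). A self-contained sketch of the same idiom with a smaller
buffer and threshold:

#include <stdio.h>

int main(void)
{
	char buf[64];
	int out = 0, i;

	out += snprintf(buf + out, sizeof(buf) - out, "Dumping MLEs\n");
	for (i = 0; i < 10; i++) {
		if (sizeof(buf) - out < 20)	/* headroom guard */
			continue;	/* keep counting, stop printing */
		out += snprintf(buf + out, sizeof(buf) - out, "mle %d\n", i);
	}
	fputs(buf, stdout);
	return 0;
}
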
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 153abb5abef..58a93b95373 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
128 * will have a negotiated version with the same major number and a minor 128 * will have a negotiated version with the same major number and a minor
129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should 129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
130 * be used to determine what a running domain is actually using. 130 * be used to determine what a running domain is actually using.
131 *
132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
131 */ 135 */
132static const struct dlm_protocol_version dlm_protocol = { 136static const struct dlm_protocol_version dlm_protocol = {
133 .pv_major = 1, 137 .pv_major = 1,
134 .pv_minor = 0, 138 .pv_minor = 1,
135}; 139};
136 140
137#define DLM_DOMAIN_BACKOFF_MS 200 141#define DLM_DOMAIN_BACKOFF_MS 200
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
142 void **ret_data); 146 void **ret_data);
143static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 147static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
144 void **ret_data); 148 void **ret_data);
149static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
150 void *data, void **ret_data);
145static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 151static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
146 void **ret_data); 152 void **ret_data);
147static int dlm_protocol_compare(struct dlm_protocol_version *existing, 153static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -693,6 +699,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
693 699
694 dlm_mark_domain_leaving(dlm); 700 dlm_mark_domain_leaving(dlm);
695 dlm_leave_domain(dlm); 701 dlm_leave_domain(dlm);
702 dlm_force_free_mles(dlm);
696 dlm_complete_dlm_shutdown(dlm); 703 dlm_complete_dlm_shutdown(dlm);
697 } 704 }
698 dlm_put(dlm); 705 dlm_put(dlm);
@@ -920,6 +927,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
920 return 0; 927 return 0;
921} 928}
922 929
930static int dlm_match_regions(struct dlm_ctxt *dlm,
931 struct dlm_query_region *qr)
932{
933 char *local = NULL, *remote = qr->qr_regions;
934 char *l, *r;
935 int localnr, i, j, foundit;
936 int status = 0;
937
938 if (!o2hb_global_heartbeat_active()) {
939 if (qr->qr_numregions) {
940 mlog(ML_ERROR, "Domain %s: Joining node %d has global "
941 "heartbeat enabled but local node %d does not\n",
942 qr->qr_domain, qr->qr_node, dlm->node_num);
943 status = -EINVAL;
944 }
945 goto bail;
946 }
947
948 if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
949 mlog(ML_ERROR, "Domain %s: Local node %d has global "
950 "heartbeat enabled but joining node %d does not\n",
951 qr->qr_domain, dlm->node_num, qr->qr_node);
952 status = -EINVAL;
953 goto bail;
954 }
955
956 r = remote;
957 for (i = 0; i < qr->qr_numregions; ++i) {
958 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
959 r += O2HB_MAX_REGION_NAME_LEN;
960 }
961
962 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
963 if (!local) {
964 status = -ENOMEM;
965 goto bail;
966 }
967
968 localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
969
970 /* compare local regions with remote */
971 l = local;
972 for (i = 0; i < localnr; ++i) {
973 foundit = 0;
974 r = remote;
 975 for (j = 0; j < qr->qr_numregions; ++j) {
976 if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
977 foundit = 1;
978 break;
979 }
980 r += O2HB_MAX_REGION_NAME_LEN;
981 }
982 if (!foundit) {
983 status = -EINVAL;
984 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
985 "in local node %d but not in joining node %d\n",
986 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
987 dlm->node_num, qr->qr_node);
988 goto bail;
989 }
990 l += O2HB_MAX_REGION_NAME_LEN;
991 }
992
993 /* compare remote with local regions */
994 r = remote;
995 for (i = 0; i < qr->qr_numregions; ++i) {
996 foundit = 0;
997 l = local;
998 for (j = 0; j < localnr; ++j) {
999 if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
1000 foundit = 1;
1001 break;
1002 }
1003 l += O2HB_MAX_REGION_NAME_LEN;
1004 }
1005 if (!foundit) {
1006 status = -EINVAL;
1007 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
1008 "in joining node %d but not in local node %d\n",
1009 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
1010 qr->qr_node, dlm->node_num);
1011 goto bail;
1012 }
1013 r += O2HB_MAX_REGION_NAME_LEN;
1014 }
1015
1016bail:
1017 kfree(local);
1018
1019 return status;
1020}
1021
1022static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
1023{
1024 struct dlm_query_region *qr = NULL;
1025 int status, ret = 0, i;
1026 char *p;
1027
1028 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1029 goto bail;
1030
1031 qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
1032 if (!qr) {
1033 ret = -ENOMEM;
1034 mlog_errno(ret);
1035 goto bail;
1036 }
1037
1038 qr->qr_node = dlm->node_num;
1039 qr->qr_namelen = strlen(dlm->name);
1040 memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
1041 /* if local hb, the numregions will be zero */
1042 if (o2hb_global_heartbeat_active())
1043 qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
1044 O2NM_MAX_REGIONS);
1045
1046 p = qr->qr_regions;
1047 for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
1048 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
1049
1050 i = -1;
1051 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1052 i + 1)) < O2NM_MAX_NODES) {
1053 if (i == dlm->node_num)
1054 continue;
1055
1056 mlog(0, "Sending regions to node %d\n", i);
1057
1058 ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
1059 sizeof(struct dlm_query_region),
1060 i, &status);
1061 if (ret >= 0)
1062 ret = status;
1063 if (ret) {
1064 mlog(ML_ERROR, "Region mismatch %d, node %d\n",
1065 ret, i);
1066 break;
1067 }
1068 }
1069
1070bail:
1071 kfree(qr);
1072 return ret;
1073}
1074
1075static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1076 void *data, void **ret_data)
1077{
1078 struct dlm_query_region *qr;
1079 struct dlm_ctxt *dlm = NULL;
1080 int status = 0;
1081 int locked = 0;
1082
1083 qr = (struct dlm_query_region *) msg->buf;
1084
1085 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
1086 qr->qr_domain);
1087
1088 status = -EINVAL;
1089
1090 spin_lock(&dlm_domain_lock);
1091 dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
1092 if (!dlm) {
1093 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1094 "before join domain\n", qr->qr_node, qr->qr_domain);
1095 goto bail;
1096 }
1097
1098 spin_lock(&dlm->spinlock);
1099 locked = 1;
1100 if (dlm->joining_node != qr->qr_node) {
1101 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1102 "but joining node is %d\n", qr->qr_node, qr->qr_domain,
1103 dlm->joining_node);
1104 goto bail;
1105 }
1106
1107 /* Support for global heartbeat was added in 1.1 */
1108 if (dlm->dlm_locking_proto.pv_major == 1 &&
1109 dlm->dlm_locking_proto.pv_minor == 0) {
1110 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1111 "but active dlm protocol is %d.%d\n", qr->qr_node,
1112 qr->qr_domain, dlm->dlm_locking_proto.pv_major,
1113 dlm->dlm_locking_proto.pv_minor);
1114 goto bail;
1115 }
1116
1117 status = dlm_match_regions(dlm, qr);
1118
1119bail:
1120 if (locked)
1121 spin_unlock(&dlm->spinlock);
1122 spin_unlock(&dlm_domain_lock);
1123
1124 return status;
1125}
1126
1127static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
1128{
1129 struct o2nm_node *local;
1130 struct dlm_node_info *remote;
1131 int i, j;
1132 int status = 0;
1133
1134 for (j = 0; j < qn->qn_numnodes; ++j)
1135 mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
1136 &(qn->qn_nodes[j].ni_ipv4_address),
1137 ntohs(qn->qn_nodes[j].ni_ipv4_port));
1138
1139 for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
1140 local = o2nm_get_node_by_num(i);
1141 remote = NULL;
1142 for (j = 0; j < qn->qn_numnodes; ++j) {
1143 if (qn->qn_nodes[j].ni_nodenum == i) {
1144 remote = &(qn->qn_nodes[j]);
1145 break;
1146 }
1147 }
1148
1149 if (!local && !remote)
1150 continue;
1151
1152 if ((local && !remote) || (!local && remote))
1153 status = -EINVAL;
1154
1155 if (!status &&
1156 ((remote->ni_nodenum != local->nd_num) ||
1157 (remote->ni_ipv4_port != local->nd_ipv4_port) ||
1158 (remote->ni_ipv4_address != local->nd_ipv4_address)))
1159 status = -EINVAL;
1160
1161 if (status) {
1162 if (remote && !local)
1163 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1164 "registered in joining node %d but not in "
1165 "local node %d\n", qn->qn_domain,
1166 remote->ni_nodenum,
1167 &(remote->ni_ipv4_address),
1168 ntohs(remote->ni_ipv4_port),
1169 qn->qn_nodenum, dlm->node_num);
1170 if (local && !remote)
1171 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1172 "registered in local node %d but not in "
1173 "joining node %d\n", qn->qn_domain,
1174 local->nd_num, &(local->nd_ipv4_address),
1175 ntohs(local->nd_ipv4_port),
1176 dlm->node_num, qn->qn_nodenum);
1177 BUG_ON((!local && !remote));
1178 }
1179
1180 if (local)
1181 o2nm_node_put(local);
1182 }
1183
1184 return status;
1185}
1186
1187static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
1188{
1189 struct dlm_query_nodeinfo *qn = NULL;
1190 struct o2nm_node *node;
1191 int ret = 0, status, count, i;
1192
1193 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1194 goto bail;
1195
1196 qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
1197 if (!qn) {
1198 ret = -ENOMEM;
1199 mlog_errno(ret);
1200 goto bail;
1201 }
1202
1203 for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
1204 node = o2nm_get_node_by_num(i);
1205 if (!node)
1206 continue;
1207 qn->qn_nodes[count].ni_nodenum = node->nd_num;
1208 qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
1209 qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
1210 mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
1211 &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
1212 ++count;
1213 o2nm_node_put(node);
1214 }
1215
1216 qn->qn_nodenum = dlm->node_num;
1217 qn->qn_numnodes = count;
1218 qn->qn_namelen = strlen(dlm->name);
1219 memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
1220
1221 i = -1;
1222 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1223 i + 1)) < O2NM_MAX_NODES) {
1224 if (i == dlm->node_num)
1225 continue;
1226
1227 mlog(0, "Sending nodeinfo to node %d\n", i);
1228
1229 ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
1230 qn, sizeof(struct dlm_query_nodeinfo),
1231 i, &status);
1232 if (ret >= 0)
1233 ret = status;
1234 if (ret) {
1235 mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
1236 break;
1237 }
1238 }
1239
1240bail:
1241 kfree(qn);
1242 return ret;
1243}
1244
1245static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
1246 void *data, void **ret_data)
1247{
1248 struct dlm_query_nodeinfo *qn;
1249 struct dlm_ctxt *dlm = NULL;
1250 int locked = 0, status = -EINVAL;
1251
1252 qn = (struct dlm_query_nodeinfo *) msg->buf;
1253
1254 mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
1255 qn->qn_domain);
1256
1257 spin_lock(&dlm_domain_lock);
1258 dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
1259 if (!dlm) {
1260 mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
1261 "join domain\n", qn->qn_nodenum, qn->qn_domain);
1262 goto bail;
1263 }
1264
1265 spin_lock(&dlm->spinlock);
1266 locked = 1;
1267 if (dlm->joining_node != qn->qn_nodenum) {
1268 mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
1269 "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
1270 dlm->joining_node);
1271 goto bail;
1272 }
1273
1274 /* Support for node query was added in 1.1 */
1275 if (dlm->dlm_locking_proto.pv_major == 1 &&
1276 dlm->dlm_locking_proto.pv_minor == 0) {
1277 mlog(ML_ERROR, "Node %d queried nodes on domain %s "
1278 "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
1279 qn->qn_domain, dlm->dlm_locking_proto.pv_major,
1280 dlm->dlm_locking_proto.pv_minor);
1281 goto bail;
1282 }
1283
1284 status = dlm_match_nodes(dlm, qn);
1285
1286bail:
1287 if (locked)
1288 spin_unlock(&dlm->spinlock);
1289 spin_unlock(&dlm_domain_lock);
1290
1291 return status;
1292}
1293
923static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 1294static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
924 void **ret_data) 1295 void **ret_data)
925{ 1296{
@@ -1240,6 +1611,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1240 set_bit(dlm->node_num, dlm->domain_map); 1611 set_bit(dlm->node_num, dlm->domain_map);
1241 spin_unlock(&dlm->spinlock); 1612 spin_unlock(&dlm->spinlock);
1242 1613
1614 /* Support for global heartbeat and node info was added in 1.1 */
1615 if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
1616 status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
1617 if (status) {
1618 mlog_errno(status);
1619 goto bail;
1620 }
1621 status = dlm_send_regions(dlm, ctxt->yes_resp_map);
1622 if (status) {
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626 }
1627
1243 dlm_send_join_asserts(dlm, ctxt->yes_resp_map); 1628 dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
1244 1629
1245 /* Joined state *must* be set before the joining node 1630 /* Joined state *must* be set before the joining node
@@ -1806,7 +2191,21 @@ static int dlm_register_net_handlers(void)
1806 sizeof(struct dlm_cancel_join), 2191 sizeof(struct dlm_cancel_join),
1807 dlm_cancel_join_handler, 2192 dlm_cancel_join_handler,
1808 NULL, NULL, &dlm_join_handlers); 2193 NULL, NULL, &dlm_join_handlers);
2194 if (status)
2195 goto bail;
2196
2197 status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
2198 sizeof(struct dlm_query_region),
2199 dlm_query_region_handler,
2200 NULL, NULL, &dlm_join_handlers);
1809 2201
2202 if (status)
2203 goto bail;
2204
2205 status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
2206 sizeof(struct dlm_query_nodeinfo),
2207 dlm_query_nodeinfo_handler,
2208 NULL, NULL, &dlm_join_handlers);
1810bail: 2209bail:
1811 if (status < 0) 2210 if (status < 0)
1812 dlm_unregister_net_handlers(); 2211 dlm_unregister_net_handlers();
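
Both new messages are gated on the negotiated protocol: the joining node
only sends DLM_QUERY_NODEINFO and DLM_QUERY_REGION when its own protocol is
newer than 1.0, and the handlers reject the queries when the running domain
still negotiated 1.0. A sketch of that version gate (the struct mirrors
dlm_protocol_version from the diff; everything else is demo scaffolding):

#include <stdio.h>

struct dlm_protocol_version { int pv_major, pv_minor; };

/* mirrors the "pv_major > 1 || pv_minor > 0" test in the join path */
static int has_query_msgs(struct dlm_protocol_version v)
{
	return v.pv_major > 1 || v.pv_minor > 0;
}

int main(void)
{
	struct dlm_protocol_version v10 = { 1, 0 }, v11 = { 1, 1 };

	printf("1.0 sends QUERY msgs: %d\n", has_query_msgs(v10)); /* 0 */
	printf("1.1 sends QUERY msgs: %d\n", has_query_msgs(v11)); /* 1 */
	return 0;
}
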
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 94b97fc6a88..f564b0e5f80 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
511 511
512 atomic_dec(&dlm->res_cur_count); 512 atomic_dec(&dlm->res_cur_count);
513 513
514 dlm_put(dlm);
515
516 if (!hlist_unhashed(&res->hash_node) || 514 if (!hlist_unhashed(&res->hash_node) ||
517 !list_empty(&res->granted) || 515 !list_empty(&res->granted) ||
518 !list_empty(&res->converting) || 516 !list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
585 res->migration_pending = 0; 583 res->migration_pending = 0;
586 res->inflight_locks = 0; 584 res->inflight_locks = 0;
587 585
588 /* put in dlm_lockres_release */
589 dlm_grab(dlm);
590 res->dlm = dlm; 586 res->dlm = dlm;
591 587
592 kref_init(&res->refs); 588 kref_init(&res->refs);
@@ -3050,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3050 /* check for pre-existing lock */ 3046 /* check for pre-existing lock */
3051 spin_lock(&dlm->spinlock); 3047 spin_lock(&dlm->spinlock);
3052 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 3048 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
3053 spin_lock(&dlm->master_lock);
3054
3055 if (res) { 3049 if (res) {
3056 spin_lock(&res->spinlock); 3050 spin_lock(&res->spinlock);
3057 if (res->state & DLM_LOCK_RES_RECOVERING) { 3051 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3069,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3069 spin_unlock(&res->spinlock); 3063 spin_unlock(&res->spinlock);
3070 } 3064 }
3071 3065
3066 spin_lock(&dlm->master_lock);
3072 /* ignore status. only nonzero status would BUG. */ 3067 /* ignore status. only nonzero status would BUG. */
3073 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, 3068 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3074 name, namelen, 3069 name, namelen,
3075 migrate->new_master, 3070 migrate->new_master,
3076 migrate->master); 3071 migrate->master);
3077 3072
3078unlock:
3079 spin_unlock(&dlm->master_lock); 3073 spin_unlock(&dlm->master_lock);
3074unlock:
3080 spin_unlock(&dlm->spinlock); 3075 spin_unlock(&dlm->spinlock);
3081 3076
3082 if (oldmle) { 3077 if (oldmle) {
@@ -3438,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3438 wake_up(&res->wq); 3433 wake_up(&res->wq);
3439 wake_up(&dlm->migration_wq); 3434 wake_up(&dlm->migration_wq);
3440} 3435}
3436
3437void dlm_force_free_mles(struct dlm_ctxt *dlm)
3438{
3439 int i;
3440 struct hlist_head *bucket;
3441 struct dlm_master_list_entry *mle;
3442 struct hlist_node *tmp, *list;
3443
3444 /*
3445 * We notified all other nodes that we are exiting the domain and
3446 * set the dlm state to DLM_CTXT_LEAVING. If any mles are still
3447 * around, we force-free them and wake any processes that are waiting
3448 * on the mles.
3449 */
3450 spin_lock(&dlm->spinlock);
3451 spin_lock(&dlm->master_lock);
3452
3453 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3454 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3455
3456 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3457 bucket = dlm_master_hash(dlm, i);
3458 hlist_for_each_safe(list, tmp, bucket) {
3459 mle = hlist_entry(list, struct dlm_master_list_entry,
3460 master_hash_node);
3461 if (mle->type != DLM_MLE_BLOCK) {
3462 mlog(ML_ERROR, "bad mle: %p\n", mle);
3463 dlm_print_one_mle(mle);
3464 }
3465 atomic_set(&mle->woken, 1);
3466 wake_up(&mle->wq);
3467
3468 __dlm_unlink_mle(dlm, mle);
3469 __dlm_mle_detach_hb_events(dlm, mle);
3470 __dlm_put_mle(mle);
3471 }
3472 }
3473 spin_unlock(&dlm->master_lock);
3474 spin_unlock(&dlm->spinlock);
3475}
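
dlm_force_free_mles() unlinks entries while walking the hash buckets, which
is only safe with the "_safe" iterator that caches the next pointer before
the current node is freed. The same pattern on a plain userspace singly
linked list (the kernel uses hlist_for_each_safe()):

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

int main(void)
{
	struct node *head = NULL, *n, *tmp;
	int i;

	for (i = 0; i < 3; i++) {	/* build 2 -> 1 -> 0 */
		n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->id = i;
		n->next = head;
		head = n;
	}

	for (n = head; n; n = tmp) {	/* save next before freeing */
		tmp = n->next;
		printf("freeing node %d\n", n->id);
		free(n);
	}
	return 0;
}
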
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9dfaac73b36..aaaffbcbe91 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1997 struct list_head *queue; 1997 struct list_head *queue;
1998 struct dlm_lock *lock, *next; 1998 struct dlm_lock *lock, *next;
1999 1999
2000 assert_spin_locked(&dlm->spinlock);
2001 assert_spin_locked(&res->spinlock);
2000 res->state |= DLM_LOCK_RES_RECOVERING; 2002 res->state |= DLM_LOCK_RES_RECOVERING;
2001 if (!list_empty(&res->recovering)) { 2003 if (!list_empty(&res->recovering)) {
2002 mlog(0, 2004 mlog(0,
@@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2326 /* zero the lvb if necessary */ 2328 /* zero the lvb if necessary */
2327 dlm_revalidate_lvb(dlm, res, dead_node); 2329 dlm_revalidate_lvb(dlm, res, dead_node);
2328 if (res->owner == dead_node) { 2330 if (res->owner == dead_node) {
2329 if (res->state & DLM_LOCK_RES_DROPPING_REF) 2331 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2330 mlog(0, "%s:%.*s: owned by " 2332 mlog(ML_NOTICE, "Ignore %.*s for "
2331 "dead node %u, this node was " 2333 "recovery as it is being freed\n",
2332 "dropping its ref when it died. " 2334 res->lockname.len,
2333 "continue, dropping the flag.\n", 2335 res->lockname.name);
2334 dlm->name, res->lockname.len, 2336 } else
2335 res->lockname.name, dead_node); 2337 dlm_move_lockres_to_recovery_list(dlm,
2336 2338 res);
2337 /* the wake_up for this will happen when the
2338 * RECOVERING flag is dropped later */
2339 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2340 2339
2341 dlm_move_lockres_to_recovery_list(dlm, res);
2342 } else if (res->owner == dlm->node_num) { 2340 } else if (res->owner == dlm->node_num) {
2343 dlm_free_dead_locks(dlm, res, dead_node); 2341 dlm_free_dead_locks(dlm, res, dead_node);
2344 __dlm_lockres_calc_usage(dlm, res); 2342 __dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d4f73ca68fe..2211acf33d9 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
92 * truly ready to be freed. */ 92 * truly ready to be freed. */
93int __dlm_lockres_unused(struct dlm_lock_resource *res) 93int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 if (!__dlm_lockres_has_locks(res) && 95 int bit;
96 (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { 96
97 /* try not to scan the bitmap unless the first two 97 if (__dlm_lockres_has_locks(res))
98 * conditions are already true */ 98 return 0;
99 int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 99
100 if (bit >= O2NM_MAX_NODES) { 100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 /* since the bit for dlm->node_num is not 101 return 0;
102 * set, inflight_locks better be zero */ 102
103 BUG_ON(res->inflight_locks != 0); 103 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 1; 104 return 0;
105 } 105
106 } 106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 return 0; 107 if (bit < O2NM_MAX_NODES)
108 return 0;
109
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1;
108} 116}
109 117
110 118
@@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
152 spin_unlock(&dlm->spinlock); 160 spin_unlock(&dlm->spinlock);
153} 161}
154 162
155static int dlm_purge_lockres(struct dlm_ctxt *dlm, 163static void dlm_purge_lockres(struct dlm_ctxt *dlm,
156 struct dlm_lock_resource *res) 164 struct dlm_lock_resource *res)
157{ 165{
158 int master; 166 int master;
159 int ret = 0; 167 int ret = 0;
160 168
161 spin_lock(&res->spinlock); 169 assert_spin_locked(&dlm->spinlock);
162 if (!__dlm_lockres_unused(res)) { 170 assert_spin_locked(&res->spinlock);
163 mlog(0, "%s:%.*s: tried to purge but not unused\n",
164 dlm->name, res->lockname.len, res->lockname.name);
165 __dlm_print_one_lock_resource(res);
166 spin_unlock(&res->spinlock);
167 BUG();
168 }
169
170 if (res->state & DLM_LOCK_RES_MIGRATING) {
171 mlog(0, "%s:%.*s: Delay dropref as this lockres is "
172 "being remastered\n", dlm->name, res->lockname.len,
173 res->lockname.name);
174 /* Re-add the lockres to the end of the purge list */
175 if (!list_empty(&res->purge)) {
176 list_del_init(&res->purge);
177 list_add_tail(&res->purge, &dlm->purge_list);
178 }
179 spin_unlock(&res->spinlock);
180 return 0;
181 }
182 171
183 master = (res->owner == dlm->node_num); 172 master = (res->owner == dlm->node_num);
184 173
185 if (!master)
186 res->state |= DLM_LOCK_RES_DROPPING_REF;
187 spin_unlock(&res->spinlock);
188 174
189 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
190 res->lockname.name, master); 176 res->lockname.name, master);
191 177
192 if (!master) { 178 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF;
193 /* drop spinlock... retake below */ 180 /* drop spinlock... retake below */
181 spin_unlock(&res->spinlock);
194 spin_unlock(&dlm->spinlock); 182 spin_unlock(&dlm->spinlock);
195 183
196 spin_lock(&res->spinlock); 184 spin_lock(&res->spinlock);
@@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
208 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", 196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
209 dlm->name, res->lockname.len, res->lockname.name, ret); 197 dlm->name, res->lockname.len, res->lockname.name, ret);
210 spin_lock(&dlm->spinlock); 198 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock);
211 } 200 }
212 201
213 spin_lock(&res->spinlock);
214 if (!list_empty(&res->purge)) { 202 if (!list_empty(&res->purge)) {
215 mlog(0, "removing lockres %.*s:%p from purgelist, " 203 mlog(0, "removing lockres %.*s:%p from purgelist, "
216 "master = %d\n", res->lockname.len, res->lockname.name, 204 "master = %d\n", res->lockname.len, res->lockname.name,
217 res, master); 205 res, master);
218 list_del_init(&res->purge); 206 list_del_init(&res->purge);
219 spin_unlock(&res->spinlock);
220 dlm_lockres_put(res); 207 dlm_lockres_put(res);
221 dlm->purge_count--; 208 dlm->purge_count--;
222 } else 209 }
223 spin_unlock(&res->spinlock); 210
211 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res);
215 BUG();
216 }
224 217
225 __dlm_unhash_lockres(res); 218 __dlm_unhash_lockres(res);
226 219
227 /* lockres is not in the hash now. drop the flag and wake up 220 /* lockres is not in the hash now. drop the flag and wake up
228 * any processes waiting in dlm_get_lock_resource. */ 221 * any processes waiting in dlm_get_lock_resource. */
229 if (!master) { 222 if (!master) {
230 spin_lock(&res->spinlock);
231 res->state &= ~DLM_LOCK_RES_DROPPING_REF; 223 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
232 spin_unlock(&res->spinlock); 224 spin_unlock(&res->spinlock);
233 wake_up(&res->wq); 225 wake_up(&res->wq);
234 } 226 } else
235 return 0; 227 spin_unlock(&res->spinlock);
236} 228}
237 229
238static void dlm_run_purge_list(struct dlm_ctxt *dlm, 230static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
251 lockres = list_entry(dlm->purge_list.next, 243 lockres = list_entry(dlm->purge_list.next,
252 struct dlm_lock_resource, purge); 244 struct dlm_lock_resource, purge);
253 245
254 /* Status of the lockres *might* change so double
255 * check. If the lockres is unused, holding the dlm
256 * spinlock will prevent people from getting and more
257 * refs on it -- there's no need to keep the lockres
258 * spinlock. */
259 spin_lock(&lockres->spinlock); 246 spin_lock(&lockres->spinlock);
260 unused = __dlm_lockres_unused(lockres);
261 spin_unlock(&lockres->spinlock);
262
263 if (!unused)
264 continue;
265 247
266 purge_jiffies = lockres->last_used + 248 purge_jiffies = lockres->last_used +
267 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); 249 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
273 * in tail order, we can stop at the first 255 * in tail order, we can stop at the first
274 * unpurgable resource -- anyone added after 256 * unpurgable resource -- anyone added after
275 * him will have a greater last_used value */ 257 * him will have a greater last_used value */
258 spin_unlock(&lockres->spinlock);
276 break; 259 break;
277 } 260 }
278 261
262 /* Status of the lockres *might* change so double
263 * check. If the lockres is unused, holding the dlm
 264 * spinlock will prevent people from getting any more
265 * refs on it. */
266 unused = __dlm_lockres_unused(lockres);
267 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or "
270 "being remastered, used %d, state %d\n",
271 dlm->name, lockres->lockname.len,
272 lockres->lockname.name, !unused, lockres->state);
 273 list_move_tail(&lockres->purge, &dlm->purge_list);
274 spin_unlock(&lockres->spinlock);
275 continue;
276 }
277
279 dlm_lockres_get(lockres); 278 dlm_lockres_get(lockres);
280 279
281 /* This may drop and reacquire the dlm spinlock if it 280 dlm_purge_lockres(dlm, lockres);
282 * has to do migration. */
283 if (dlm_purge_lockres(dlm, lockres))
284 BUG();
285 281
286 dlm_lockres_put(lockres); 282 dlm_lockres_put(lockres);
287 283
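
The reworked dlm_run_purge_list() above exploits the fact that the purge
list is kept in last-used order: the scan stops at the first resource
younger than the purge interval, and resources found busy or migrating are
requeued at the tail instead of being purged. A sketch of that policy with
plain integer timestamps (PURGE_INTERVAL stands in for
DLM_PURGE_INTERVAL_MS):

#include <stdio.h>

#define PURGE_INTERVAL 10	/* stands in for DLM_PURGE_INTERVAL_MS */

struct res { int last_used; int busy; };

int main(void)
{
	/* list is kept in last-used order, oldest first */
	struct res list[] = { { 0, 0 }, { 2, 1 }, { 5, 0 }, { 95, 0 } };
	int i, now = 100;

	for (i = 0; i < 4; i++) {
		if (list[i].last_used + PURGE_INTERVAL > now) {
			printf("res %d still young, stop scan\n", i);
			break;	/* everything after is younger still */
		}
		if (list[i].busy) {
			printf("res %d in use, requeue at tail\n", i);
			continue;
		}
		printf("purging res %d\n", i);
	}
	return 0;
}
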
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b83d6107a1f..b2df490a19e 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -182,8 +182,7 @@ static int dlmfs_file_release(struct inode *inode,
182{ 182{
183 int level, status; 183 int level, status;
184 struct dlmfs_inode_private *ip = DLMFS_I(inode); 184 struct dlmfs_inode_private *ip = DLMFS_I(inode);
185 struct dlmfs_filp_private *fp = 185 struct dlmfs_filp_private *fp = file->private_data;
186 (struct dlmfs_filp_private *) file->private_data;
187 186
188 if (S_ISDIR(inode->i_mode)) 187 if (S_ISDIR(inode->i_mode))
189 BUG(); 188 BUG();
@@ -214,10 +213,12 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
214 213
215 attr->ia_valid &= ~ATTR_SIZE; 214 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr); 215 error = inode_change_ok(inode, attr);
217 if (!error) 216 if (error)
218 error = inode_setattr(inode, attr); 217 return error;
219 218
220 return error; 219 setattr_copy(inode, attr);
220 mark_inode_dirty(inode);
221 return 0;
221} 222}
222 223
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) 224static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
@@ -355,13 +356,12 @@ static void dlmfs_destroy_inode(struct inode *inode)
355 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); 356 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
356} 357}
357 358
358static void dlmfs_clear_inode(struct inode *inode) 359static void dlmfs_evict_inode(struct inode *inode)
359{ 360{
360 int status; 361 int status;
361 struct dlmfs_inode_private *ip; 362 struct dlmfs_inode_private *ip;
362 363
363 if (!inode) 364 end_writeback(inode);
364 return;
365 365
366 mlog(0, "inode %lu\n", inode->i_ino); 366 mlog(0, "inode %lu\n", inode->i_ino);
367 367
@@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
400 if (inode) { 400 if (inode) {
401 ip = DLMFS_I(inode); 401 ip = DLMFS_I(inode);
402 402
403 inode->i_ino = get_next_ino();
403 inode->i_mode = mode; 404 inode->i_mode = mode;
404 inode->i_uid = current_fsuid(); 405 inode->i_uid = current_fsuid();
405 inode->i_gid = current_fsgid(); 406 inode->i_gid = current_fsgid();
@@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
425 if (!inode) 426 if (!inode)
426 return NULL; 427 return NULL;
427 428
429 inode->i_ino = get_next_ino();
428 inode->i_mode = mode; 430 inode->i_mode = mode;
429 inode->i_uid = current_fsuid(); 431 inode->i_uid = current_fsuid();
430 inode->i_gid = current_fsgid(); 432 inode->i_gid = current_fsgid();
@@ -612,6 +614,7 @@ static const struct file_operations dlmfs_file_operations = {
612 .poll = dlmfs_file_poll, 614 .poll = dlmfs_file_poll,
613 .read = dlmfs_file_read, 615 .read = dlmfs_file_read,
614 .write = dlmfs_file_write, 616 .write = dlmfs_file_write,
617 .llseek = default_llseek,
615}; 618};
616 619
617static const struct inode_operations dlmfs_dir_inode_operations = { 620static const struct inode_operations dlmfs_dir_inode_operations = {
@@ -631,7 +634,7 @@ static const struct super_operations dlmfs_ops = {
631 .statfs = simple_statfs, 634 .statfs = simple_statfs,
632 .alloc_inode = dlmfs_alloc_inode, 635 .alloc_inode = dlmfs_alloc_inode,
633 .destroy_inode = dlmfs_destroy_inode, 636 .destroy_inode = dlmfs_destroy_inode,
634 .clear_inode = dlmfs_clear_inode, 637 .evict_inode = dlmfs_evict_inode,
635 .drop_inode = generic_delete_inode, 638 .drop_inode = generic_delete_inode,
636}; 639};
637 640
@@ -640,16 +643,16 @@ static const struct inode_operations dlmfs_file_inode_operations = {
640 .setattr = dlmfs_file_setattr, 643 .setattr = dlmfs_file_setattr,
641}; 644};
642 645
643static int dlmfs_get_sb(struct file_system_type *fs_type, 646static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
644 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 647 int flags, const char *dev_name, void *data)
645{ 648{
646 return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); 649 return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
647} 650}
648 651
649static struct file_system_type dlmfs_fs_type = { 652static struct file_system_type dlmfs_fs_type = {
650 .owner = THIS_MODULE, 653 .owner = THIS_MODULE,
651 .name = "ocfs2_dlmfs", 654 .name = "ocfs2_dlmfs",
652 .get_sb = dlmfs_get_sb, 655 .mount = dlmfs_mount,
653 .kill_sb = kill_litter_super, 656 .kill_sb = kill_litter_super,
654}; 657};
655 658
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 39eb16ac5f9..e8d94d722ec 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2966,7 +2966,7 @@ static const struct seq_operations ocfs2_dlm_seq_ops = {
2966 2966
2967static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) 2967static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
2968{ 2968{
2969 struct seq_file *seq = (struct seq_file *) file->private_data; 2969 struct seq_file *seq = file->private_data;
2970 struct ocfs2_dlm_seq_priv *priv = seq->private; 2970 struct ocfs2_dlm_seq_priv *priv = seq->private;
2971 struct ocfs2_lock_res *res = &priv->p_iter_res; 2971 struct ocfs2_lock_res *res = &priv->p_iter_res;
2972 2972
@@ -3000,7 +3000,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
3000 goto out; 3000 goto out;
3001 } 3001 }
3002 3002
3003 seq = (struct seq_file *) file->private_data; 3003 seq = file->private_data;
3004 seq->private = priv; 3004 seq->private = priv;
3005 3005
3006 ocfs2_add_lockres_tracking(&priv->p_iter_res, 3006 ocfs2_add_lockres_tracking(&priv->p_iter_res,
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
3635{ 3635{
3636 struct inode *inode; 3636 struct inode *inode;
3637 struct address_space *mapping; 3637 struct address_space *mapping;
3638 struct ocfs2_inode_info *oi;
3638 3639
3639 inode = ocfs2_lock_res_inode(lockres); 3640 inode = ocfs2_lock_res_inode(lockres);
3640 mapping = inode->i_mapping; 3641 mapping = inode->i_mapping;
3641 3642
3643 if (S_ISDIR(inode->i_mode)) {
3644 oi = OCFS2_I(inode);
3645 oi->ip_dir_lock_gen++;
3646 mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
3647 goto out;
3648 }
3649
3642 if (!S_ISREG(inode->i_mode)) 3650 if (!S_ISREG(inode->i_mode))
3643 goto out; 3651 goto out;
3644 3652
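
For directories the downconvert worker above no longer touches the page
cache; it just bumps ip_dir_lock_gen, and cached directory state is treated
as valid only while the generation it sampled is still current. A
single-threaded sketch of that generation-counter invalidation idea (the
locking that would surround it in the kernel is omitted):

#include <stdio.h>

static unsigned int ip_dir_lock_gen = 1;

/* what the downconvert worker does for directories above */
static void downconvert_dir(void)
{
	ip_dir_lock_gen++;
}

int main(void)
{
	unsigned int gen = ip_dir_lock_gen;	/* sampled under the lock */

	downconvert_dir();	/* another node modified the directory */

	if (gen != ip_dir_lock_gen)
		printf("cached dir state stale, redo the lookup\n");
	return 0;
}
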
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d..1d596d8c4a4 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
84 OI_LS_PARENT, 84 OI_LS_PARENT,
85 OI_LS_RENAME1, 85 OI_LS_RENAME1,
86 OI_LS_RENAME2, 86 OI_LS_RENAME2,
87 OI_LS_REFLINK_TARGET,
87}; 88};
88 89
89int ocfs2_dlm_init(struct ocfs2_super *osb); 90int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2b10b36d157..77b4c04a280 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,6 +36,7 @@
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h> 37#include <linux/falloc.h>
38#include <linux/quotaops.h> 38#include <linux/quotaops.h>
39#include <linux/blkdev.h>
39 40
40#define MLOG_MASK_PREFIX ML_INODE 41#define MLOG_MASK_PREFIX ML_INODE
41#include <cluster/masklog.h> 42#include <cluster/masklog.h>
@@ -63,12 +64,6 @@
63 64
64#include "buffer_head_io.h" 65#include "buffer_head_io.h"
65 66
66static int ocfs2_sync_inode(struct inode *inode)
67{
68 filemap_fdatawrite(inode->i_mapping);
69 return sync_mapping_buffers(inode->i_mapping);
70}
71
72static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
73{ 68{
74 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
@@ -179,19 +174,22 @@ static int ocfs2_sync_file(struct file *file, int datasync)
179{ 174{
180 int err = 0; 175 int err = 0;
181 journal_t *journal; 176 journal_t *journal;
182 struct dentry *dentry = file->f_path.dentry;
183 struct inode *inode = file->f_mapping->host; 177 struct inode *inode = file->f_mapping->host;
184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
185 179
186 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 180 mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
187 dentry->d_name.len, dentry->d_name.name); 181 file->f_path.dentry, file->f_path.dentry->d_name.len,
188 182 file->f_path.dentry->d_name.name);
189 err = ocfs2_sync_inode(dentry->d_inode);
190 if (err)
191 goto bail;
192 183
193 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 184 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
185 /*
 186 * We still have to flush the drive's caches to get data to the
187 * platter
188 */
189 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
190 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
194 goto bail; 191 goto bail;
192 }
195 193
196 journal = osb->journal->j_journal; 194 journal = osb->journal->j_journal;
197 err = jbd2_journal_force_commit(journal); 195 err = jbd2_journal_force_commit(journal);
@@ -361,7 +359,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
361 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 359 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
362 goto out; 360 goto out;
363 361
364 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); 362 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
365 363
366out: 364out:
367 return status; 365 return status;
@@ -774,7 +772,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
774 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); 772 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
775 BUG_ON(abs_from & (inode->i_blkbits - 1)); 773 BUG_ON(abs_from & (inode->i_blkbits - 1));
776 774
777 page = grab_cache_page(mapping, index); 775 page = find_or_create_page(mapping, index, GFP_NOFS);
778 if (!page) { 776 if (!page) {
779 ret = -ENOMEM; 777 ret = -ENOMEM;
780 mlog_errno(ret); 778 mlog_errno(ret);
@@ -798,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
798 block_end = block_start + (1 << inode->i_blkbits); 796 block_end = block_start + (1 << inode->i_blkbits);
799 797
800 /* 798 /*
801 * block_start is block-aligned. Bump it by one to 799 * block_start is block-aligned. Bump it by one to force
802 * force ocfs2_{prepare,commit}_write() to zero the 800 * __block_write_begin and block_commit_write to zero the
803 * whole block. 801 * whole block.
804 */ 802 */
805 ret = ocfs2_prepare_write_nolock(inode, page, 803 ret = __block_write_begin(page, block_start + 1, 0,
806 block_start + 1, 804 ocfs2_get_block);
807 block_start + 1);
808 if (ret < 0) { 805 if (ret < 0) {
809 mlog_errno(ret); 806 mlog_errno(ret);
810 goto out_unlock; 807 goto out_unlock;
@@ -904,8 +901,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
904 zero_clusters = last_cpos - zero_cpos; 901 zero_clusters = last_cpos - zero_cpos;
905 902
906 if (needs_cow) { 903 if (needs_cow) {
907 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, 904 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
908 UINT_MAX); 905 zero_clusters, UINT_MAX);
909 if (rc) { 906 if (rc) {
910 mlog_errno(rc); 907 mlog_errno(rc);
911 goto out; 908 goto out;
@@ -1233,18 +1230,26 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1233 } 1230 }
1234 1231
1235 /* 1232 /*
1236 * This will intentionally not wind up calling simple_setsize(), 1233 * This will intentionally not wind up calling truncate_setsize(),
1237 * since all the work for a size change has been done above. 1234 * since all the work for a size change has been done above.
1238 * Otherwise, we could get into problems with truncate as 1235 * Otherwise, we could get into problems with truncate as
1239 * ip_alloc_sem is used there to protect against i_size 1236 * ip_alloc_sem is used there to protect against i_size
1240 * changes. 1237 * changes.
1238 *
1239 * XXX: this means the conditional below can probably be removed.
1241 */ 1240 */
1242 status = inode_setattr(inode, attr); 1241 if ((attr->ia_valid & ATTR_SIZE) &&
1243 if (status < 0) { 1242 attr->ia_size != i_size_read(inode)) {
1244 mlog_errno(status); 1243 status = vmtruncate(inode, attr->ia_size);
1245 goto bail_commit; 1244 if (status) {
1245 mlog_errno(status);
1246 goto bail_commit;
1247 }
1246 } 1248 }
1247 1249
1250 setattr_copy(inode, attr);
1251 mark_inode_dirty(inode);
1252
1248 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1253 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1249 if (status < 0) 1254 if (status < 0)
1250 mlog_errno(status); 1255 mlog_errno(status);
@@ -2045,6 +2050,7 @@ out:
2045} 2050}
2046 2051
2047static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2052static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2053 struct file *file,
2048 loff_t pos, size_t count, 2054 loff_t pos, size_t count,
2049 int *meta_level) 2055 int *meta_level)
2050{ 2056{
@@ -2062,7 +2068,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2062 2068
2063 *meta_level = 1; 2069 *meta_level = 1;
2064 2070
2065 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 2071 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2066 if (ret) 2072 if (ret)
2067 mlog_errno(ret); 2073 mlog_errno(ret);
2068out: 2074out:
@@ -2070,7 +2076,7 @@ out:
2070 return ret; 2076 return ret;
2071} 2077}
2072 2078
2073static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 2079static int ocfs2_prepare_inode_for_write(struct file *file,
2074 loff_t *ppos, 2080 loff_t *ppos,
2075 size_t count, 2081 size_t count,
2076 int appending, 2082 int appending,
@@ -2078,6 +2084,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2078 int *has_refcount) 2084 int *has_refcount)
2079{ 2085{
2080 int ret = 0, meta_level = 0; 2086 int ret = 0, meta_level = 0;
2087 struct dentry *dentry = file->f_path.dentry;
2081 struct inode *inode = dentry->d_inode; 2088 struct inode *inode = dentry->d_inode;
2082 loff_t saved_pos, end; 2089 loff_t saved_pos, end;
2083 2090
@@ -2133,6 +2140,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2133 meta_level = -1; 2140 meta_level = -1;
2134 2141
2135 ret = ocfs2_prepare_inode_for_refcount(inode, 2142 ret = ocfs2_prepare_inode_for_refcount(inode,
2143 file,
2136 saved_pos, 2144 saved_pos,
2137 count, 2145 count,
2138 &meta_level); 2146 &meta_level);
@@ -2215,6 +2223,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2215 struct file *file = iocb->ki_filp; 2223 struct file *file = iocb->ki_filp;
2216 struct inode *inode = file->f_path.dentry->d_inode; 2224 struct inode *inode = file->f_path.dentry->d_inode;
2217 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2225 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2226 int full_coherency = !(osb->s_mount_opt &
2227 OCFS2_MOUNT_COHERENCY_BUFFERED);
2218 2228
2219 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2229 mlog_entry("(0x%p, %u, '%.*s')\n", file,
2220 (unsigned int)nr_segs, 2230 (unsigned int)nr_segs,
@@ -2238,16 +2248,39 @@ relock:
2238 have_alloc_sem = 1; 2248 have_alloc_sem = 1;
2239 } 2249 }
2240 2250
2241 /* concurrent O_DIRECT writes are allowed */ 2251 /*
2242 rw_level = !direct_io; 2252 * Concurrent O_DIRECT writes are allowed with
2253 * mount_option "coherency=buffered".
2254 */
2255 rw_level = (!direct_io || full_coherency);
2256
2243 ret = ocfs2_rw_lock(inode, rw_level); 2257 ret = ocfs2_rw_lock(inode, rw_level);
2244 if (ret < 0) { 2258 if (ret < 0) {
2245 mlog_errno(ret); 2259 mlog_errno(ret);
2246 goto out_sems; 2260 goto out_sems;
2247 } 2261 }
2248 2262
2263 /*
2264 * O_DIRECT writes with "coherency=full" need to take EX cluster
2265 * inode_lock to guarantee coherency.
2266 */
2267 if (direct_io && full_coherency) {
2268 /*
2269 * We need to take and drop the inode lock to force
2270 * other nodes to drop their caches. Buffered I/O
2271 * already does this in write_begin().
2272 */
2273 ret = ocfs2_inode_lock(inode, NULL, 1);
2274 if (ret < 0) {
2275 mlog_errno(ret);
2276 goto out_sems;
2277 }
2278
2279 ocfs2_inode_unlock(inode, 1);
2280 }
2281
2249 can_do_direct = direct_io; 2282 can_do_direct = direct_io;
2250 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 2283 ret = ocfs2_prepare_inode_for_write(file, ppos,
2251 iocb->ki_left, appending, 2284 iocb->ki_left, appending,
2252 &can_do_direct, &has_refcount); 2285 &can_do_direct, &has_refcount);
2253 if (ret < 0) { 2286 if (ret < 0) {
@@ -2295,17 +2328,6 @@ relock:
2295 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2328 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2296 ppos, count, ocount); 2329 ppos, count, ocount);
2297 if (written < 0) { 2330 if (written < 0) {
2298 /*
2299 * direct write may have instantiated a few
2300 * blocks outside i_size. Trim these off again.
2301 * Don't need i_size_read because we hold i_mutex.
2302 *
2303 * XXX(hch): this looks buggy because ocfs2 did not
2304 * actually implement ->truncate. Take a look at
2305 * the new truncate sequence and update this accordingly
2306 */
2307 if (*ppos + count > inode->i_size)
2308 simple_setsize(inode, inode->i_size);
2309 ret = written; 2331 ret = written;
2310 goto out_dio; 2332 goto out_dio;
2311 } 2333 }
@@ -2321,7 +2343,7 @@ out_dio:
2321 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2343 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2322 2344
2323 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2345 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2324 ((file->f_flags & O_DIRECT) && has_refcount)) { 2346 ((file->f_flags & O_DIRECT) && !direct_io)) {
2325 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2347 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2326 pos + count - 1); 2348 pos + count - 1);
2327 if (ret < 0) 2349 if (ret < 0)
@@ -2377,7 +2399,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2377{ 2399{
2378 int ret; 2400 int ret;
2379 2401
2380 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2402 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2381 sd->total_len, 0, NULL, NULL); 2403 sd->total_len, 0, NULL, NULL);
2382 if (ret < 0) { 2404 if (ret < 0) {
2383 mlog_errno(ret); 2405 mlog_errno(ret);
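
The write path above now takes the shared rw lock only for O_DIRECT writes
under coherency=buffered; buffered writes, and O_DIRECT writes under the
default coherency=full, take it exclusive. A sketch enumerating that
decision (1 = exclusive, 0 = shared; the function mirrors the rw_level
assignment in ocfs2_file_aio_write()):

#include <stdio.h>

/* 1 = exclusive (EX), 0 = shared, as in ocfs2_file_aio_write() above */
static int rw_level(int direct_io, int full_coherency)
{
	return !direct_io || full_coherency;
}

int main(void)
{
	printf("buffered write:               %d\n", rw_level(0, 1));
	printf("O_DIRECT, coherency=full:     %d\n", rw_level(1, 1));
	printf("O_DIRECT, coherency=buffered: %d\n", rw_level(1, 0));
	return 0;
}
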
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index abb0a95cc71..f935fd6600d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
335 else 335 else
336 inode->i_fop = &ocfs2_dops_no_plocks; 336 inode->i_fop = &ocfs2_dops_no_plocks;
337 i_size_write(inode, le64_to_cpu(fe->i_size)); 337 i_size_write(inode, le64_to_cpu(fe->i_size));
338 OCFS2_I(inode)->ip_dir_lock_gen = 1;
338 break; 339 break;
339 case S_IFLNK: 340 case S_IFLNK:
340 if (ocfs2_inode_is_fast_symlink(inode)) 341 if (ocfs2_inode_is_fast_symlink(inode))
@@ -488,7 +489,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
488 OCFS2_BH_IGNORE_CACHE); 489 OCFS2_BH_IGNORE_CACHE);
489 } else { 490 } else {
490 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 491 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
491 if (!status) 492 /*
493 * If buffer is in jbd, then its checksum may not have been
494 * computed as yet.
495 */
496 if (!status && !buffer_jbd(bh))
492 status = ocfs2_validate_inode_block(osb->sb, bh); 497 status = ocfs2_validate_inode_block(osb->sb, bh);
493 } 498 }
494 if (status < 0) { 499 if (status < 0) {
@@ -969,7 +974,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
969 truncate_inode_pages(&inode->i_data, 0); 974 truncate_inode_pages(&inode->i_data, 0);
970} 975}
971 976
972void ocfs2_delete_inode(struct inode *inode) 977static void ocfs2_delete_inode(struct inode *inode)
973{ 978{
974 int wipe, status; 979 int wipe, status;
975 sigset_t oldset; 980 sigset_t oldset;
@@ -1075,20 +1080,17 @@ bail_unlock_nfs_sync:
1075bail_unblock: 1080bail_unblock:
1076 ocfs2_unblock_signals(&oldset); 1081 ocfs2_unblock_signals(&oldset);
1077bail: 1082bail:
1078 clear_inode(inode);
1079 mlog_exit_void(); 1083 mlog_exit_void();
1080} 1084}
1081 1085
1082void ocfs2_clear_inode(struct inode *inode) 1086static void ocfs2_clear_inode(struct inode *inode)
1083{ 1087{
1084 int status; 1088 int status;
1085 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1089 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1086 1090
1087 mlog_entry_void(); 1091 mlog_entry_void();
1088 1092
1089 if (!inode) 1093 end_writeback(inode);
1090 goto bail;
1091
1092 mlog(0, "Clearing inode: %llu, nlink = %u\n", 1094 mlog(0, "Clearing inode: %llu, nlink = %u\n",
1093 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink); 1095 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
1094 1096
@@ -1180,16 +1182,27 @@ void ocfs2_clear_inode(struct inode *inode)
1180 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, 1182 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
1181 &oi->ip_jinode); 1183 &oi->ip_jinode);
1182 1184
1183bail:
1184 mlog_exit_void(); 1185 mlog_exit_void();
1185} 1186}
1186 1187
1188void ocfs2_evict_inode(struct inode *inode)
1189{
1190 if (!inode->i_nlink ||
1191 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
1192 ocfs2_delete_inode(inode);
1193 } else {
1194 truncate_inode_pages(&inode->i_data, 0);
1195 }
1196 ocfs2_clear_inode(inode);
1197}
1198
1187/* Called under inode_lock, with no more references on the 1199/* Called under inode_lock, with no more references on the
1188 * struct inode, so it's safe here to check the flags field 1200 * struct inode, so it's safe here to check the flags field
1189 * and to manipulate i_nlink without any other locks. */ 1201 * and to manipulate i_nlink without any other locks. */
1190void ocfs2_drop_inode(struct inode *inode) 1202int ocfs2_drop_inode(struct inode *inode)
1191{ 1203{
1192 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1204 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1205 int res;
1193 1206
1194 mlog_entry_void(); 1207 mlog_entry_void();
1195 1208
@@ -1197,11 +1210,12 @@ void ocfs2_drop_inode(struct inode *inode)
1197 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); 1210 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
1198 1211
1199 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1212 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1200 generic_delete_inode(inode); 1213 res = 1;
1201 else 1214 else
1202 generic_drop_inode(inode); 1215 res = generic_drop_inode(inode);
1203 1216
1204 mlog_exit_void(); 1217 mlog_exit_void();
1218 return res;
1205} 1219}
1206 1220
1207/* 1221/*
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9f5f5fcadc4..1c508b149b3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
46 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 47 spinlock_t ip_lock;
48 u32 ip_open_count; 48 u32 ip_open_count;
49 u32 ip_clusters;
50 struct list_head ip_io_markers; 49 struct list_head ip_io_markers;
50 u32 ip_clusters;
51 51
52 u16 ip_dyn_features;
52 struct mutex ip_io_mutex; 53 struct mutex ip_io_mutex;
53
54 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
55 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
56 u16 ip_dyn_features;
57 56
58 /* protected by recovery_lock. */ 57 /* protected by recovery_lock. */
59 struct inode *ip_next_orphan; 58 struct inode *ip_next_orphan;
60 59
61 u32 ip_dir_start_lookup;
62
63 struct ocfs2_caching_info ip_metadata_cache; 60 struct ocfs2_caching_info ip_metadata_cache;
64
65 struct ocfs2_extent_map ip_extent_map; 61 struct ocfs2_extent_map ip_extent_map;
66
67 struct inode vfs_inode; 62 struct inode vfs_inode;
68 struct jbd2_inode ip_jinode; 63 struct jbd2_inode ip_jinode;
69 64
65 u32 ip_dir_start_lookup;
66
70 /* Only valid if the inode is the dir. */ 67 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 68 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 69 u64 ip_last_used_group;
70 u32 ip_dir_lock_gen;
73 71
74 struct ocfs2_alloc_reservation ip_la_data_resv; 72 struct ocfs2_alloc_reservation ip_la_data_resv;
75}; 73};
@@ -123,9 +121,8 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
123 return &OCFS2_I(inode)->ip_metadata_cache; 121 return &OCFS2_I(inode)->ip_metadata_cache;
124} 122}
125 123
126void ocfs2_clear_inode(struct inode *inode); 124void ocfs2_evict_inode(struct inode *inode);
127void ocfs2_delete_inode(struct inode *inode); 125int ocfs2_drop_inode(struct inode *inode);
128void ocfs2_drop_inode(struct inode *inode);
129 126
130/* Flags for ocfs2_iget() */ 127/* Flags for ocfs2_iget() */
131#define OCFS2_FI_FLAG_SYSFILE 0x1 128#define OCFS2_FI_FLAG_SYSFILE 0x1
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132ce..7a486819615 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
26 26
27#include <linux/ext2_fs.h> 27#include <linux/ext2_fs.h>
28 28
29#define o2info_from_user(a, b) \
30 copy_from_user(&(a), (b), sizeof(a))
31#define o2info_to_user(a, b) \
32 copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
33
34/*
35 * This call is void because we are already reporting an error that may
36 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
37 * just a best-effort to tell userspace that this request caused the error.
38 */
39static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
40 struct ocfs2_info_request __user *req)
41{
42 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
43 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
44}
45
46#define o2info_set_request_error(a, b) \
47 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
48
29static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 49static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
30{ 50{
31 int status; 51 int status;
@@ -109,6 +129,328 @@ bail:
109 return status; 129 return status;
110} 130}
111 131
132int ocfs2_info_handle_blocksize(struct inode *inode,
133 struct ocfs2_info_request __user *req)
134{
135 int status = -EFAULT;
136 struct ocfs2_info_blocksize oib;
137
138 if (o2info_from_user(oib, req))
139 goto bail;
140
141 oib.ib_blocksize = inode->i_sb->s_blocksize;
142 oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
143
144 if (o2info_to_user(oib, req))
145 goto bail;
146
147 status = 0;
148bail:
149 if (status)
150 o2info_set_request_error(oib, req);
151
152 return status;
153}
154
155int ocfs2_info_handle_clustersize(struct inode *inode,
156 struct ocfs2_info_request __user *req)
157{
158 int status = -EFAULT;
159 struct ocfs2_info_clustersize oic;
160 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
161
162 if (o2info_from_user(oic, req))
163 goto bail;
164
165 oic.ic_clustersize = osb->s_clustersize;
166 oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
167
168 if (o2info_to_user(oic, req))
169 goto bail;
170
171 status = 0;
172bail:
173 if (status)
174 o2info_set_request_error(oic, req);
175
176 return status;
177}
178
179int ocfs2_info_handle_maxslots(struct inode *inode,
180 struct ocfs2_info_request __user *req)
181{
182 int status = -EFAULT;
183 struct ocfs2_info_maxslots oim;
184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
185
186 if (o2info_from_user(oim, req))
187 goto bail;
188
189 oim.im_max_slots = osb->max_slots;
190 oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
191
192 if (o2info_to_user(oim, req))
193 goto bail;
194
195 status = 0;
196bail:
197 if (status)
198 o2info_set_request_error(oim, req);
199
200 return status;
201}
202
203int ocfs2_info_handle_label(struct inode *inode,
204 struct ocfs2_info_request __user *req)
205{
206 int status = -EFAULT;
207 struct ocfs2_info_label oil;
208 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
209
210 if (o2info_from_user(oil, req))
211 goto bail;
212
213 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
214 oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
215
216 if (o2info_to_user(oil, req))
217 goto bail;
218
219 status = 0;
220bail:
221 if (status)
222 o2info_set_request_error(oil, req);
223
224 return status;
225}
226
227int ocfs2_info_handle_uuid(struct inode *inode,
228 struct ocfs2_info_request __user *req)
229{
230 int status = -EFAULT;
231 struct ocfs2_info_uuid oiu;
232 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
233
234 if (o2info_from_user(oiu, req))
235 goto bail;
236
237 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
238 oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
239
240 if (o2info_to_user(oiu, req))
241 goto bail;
242
243 status = 0;
244bail:
245 if (status)
246 o2info_set_request_error(oiu, req);
247
248 return status;
249}
250
251int ocfs2_info_handle_fs_features(struct inode *inode,
252 struct ocfs2_info_request __user *req)
253{
254 int status = -EFAULT;
255 struct ocfs2_info_fs_features oif;
256 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
257
258 if (o2info_from_user(oif, req))
259 goto bail;
260
261 oif.if_compat_features = osb->s_feature_compat;
262 oif.if_incompat_features = osb->s_feature_incompat;
263 oif.if_ro_compat_features = osb->s_feature_ro_compat;
264 oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
265
266 if (o2info_to_user(oif, req))
267 goto bail;
268
269 status = 0;
270bail:
271 if (status)
272 o2info_set_request_error(oif, req);
273
274 return status;
275}
276
277int ocfs2_info_handle_journal_size(struct inode *inode,
278 struct ocfs2_info_request __user *req)
279{
280 int status = -EFAULT;
281 struct ocfs2_info_journal_size oij;
282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
283
284 if (o2info_from_user(oij, req))
285 goto bail;
286
287 oij.ij_journal_size = osb->journal->j_inode->i_size;
288
289 oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
290
291 if (o2info_to_user(oij, req))
292 goto bail;
293
294 status = 0;
295bail:
296 if (status)
297 o2info_set_request_error(oij, req);
298
299 return status;
300}
301
302int ocfs2_info_handle_unknown(struct inode *inode,
303 struct ocfs2_info_request __user *req)
304{
305 int status = -EFAULT;
306 struct ocfs2_info_request oir;
307
308 if (o2info_from_user(oir, req))
309 goto bail;
310
311 oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
312
313 if (o2info_to_user(oir, req))
314 goto bail;
315
316 status = 0;
317bail:
318 if (status)
319 o2info_set_request_error(oir, req);
320
321 return status;
322}
323
324/*
325 * Validate and distinguish OCFS2_IOC_INFO requests.
326 *
327 * - validate the magic number.
328 * - distinguish different requests.
329 * - validate size of different requests.
330 */
331int ocfs2_info_handle_request(struct inode *inode,
332 struct ocfs2_info_request __user *req)
333{
334 int status = -EFAULT;
335 struct ocfs2_info_request oir;
336
337 if (o2info_from_user(oir, req))
338 goto bail;
339
340 status = -EINVAL;
341 if (oir.ir_magic != OCFS2_INFO_MAGIC)
342 goto bail;
343
344 switch (oir.ir_code) {
345 case OCFS2_INFO_BLOCKSIZE:
346 if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
347 status = ocfs2_info_handle_blocksize(inode, req);
348 break;
349 case OCFS2_INFO_CLUSTERSIZE:
350 if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
351 status = ocfs2_info_handle_clustersize(inode, req);
352 break;
353 case OCFS2_INFO_MAXSLOTS:
354 if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
355 status = ocfs2_info_handle_maxslots(inode, req);
356 break;
357 case OCFS2_INFO_LABEL:
358 if (oir.ir_size == sizeof(struct ocfs2_info_label))
359 status = ocfs2_info_handle_label(inode, req);
360 break;
361 case OCFS2_INFO_UUID:
362 if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
363 status = ocfs2_info_handle_uuid(inode, req);
364 break;
365 case OCFS2_INFO_FS_FEATURES:
366 if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
367 status = ocfs2_info_handle_fs_features(inode, req);
368 break;
369 case OCFS2_INFO_JOURNAL_SIZE:
370 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
371 status = ocfs2_info_handle_journal_size(inode, req);
372 break;
373 default:
374 status = ocfs2_info_handle_unknown(inode, req);
375 break;
376 }
377
378bail:
379 return status;
380}
381
382int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
383 u64 *req_addr, int compat_flag)
384{
385 int status = -EFAULT;
386 u64 __user *bp = NULL;
387
388 if (compat_flag) {
389#ifdef CONFIG_COMPAT
390 /*
 391 * The pointer bp stores the base address of an array of pointers,
 392 * which collects the addresses of the separate requests.
393 */
394 bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
395#else
396 BUG();
397#endif
398 } else
399 bp = (u64 __user *)(unsigned long)(info->oi_requests);
400
401 if (o2info_from_user(*req_addr, bp + idx))
402 goto bail;
403
404 status = 0;
405bail:
406 return status;
407}
408
409/*
410 * OCFS2_IOC_INFO handles an array of requests passed from userspace.
411 *
 412 * ocfs2_info_handle() receives a large info aggregation, grabs and
 413 * validates the request count from the header, then breaks it into
 414 * small pieces so that the specific handlers can handle them one by one.
415 *
 416 * The idea here is to make each separate request small enough to
 417 * ensure better backward and forward compatibility, since a small
 418 * piece of the request is less likely to break if the disk layout changes.
419 */
420int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
421 int compat_flag)
422{
423 int i, status = 0;
424 u64 req_addr;
425 struct ocfs2_info_request __user *reqp;
426
427 if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
428 (!info->oi_requests)) {
429 status = -EINVAL;
430 goto bail;
431 }
432
433 for (i = 0; i < info->oi_count; i++) {
434
435 status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
436 if (status)
437 break;
438
439 reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
440 if (!reqp) {
441 status = -EINVAL;
442 goto bail;
443 }
444
445 status = ocfs2_info_handle_request(inode, reqp);
446 if (status)
447 break;
448 }
449
450bail:
451 return status;
452}
453
112long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 454long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
113{ 455{
114 struct inode *inode = filp->f_path.dentry->d_inode; 456 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
120 struct reflink_arguments args; 462 struct reflink_arguments args;
121 const char *old_path, *new_path; 463 const char *old_path, *new_path;
122 bool preserve; 464 bool preserve;
465 struct ocfs2_info info;
123 466
124 switch (cmd) { 467 switch (cmd) {
125 case OCFS2_IOC_GETFLAGS: 468 case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
174 preserve = (args.preserve != 0); 517 preserve = (args.preserve != 0);
175 518
176 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); 519 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
520 case OCFS2_IOC_INFO:
521 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
522 sizeof(struct ocfs2_info)))
523 return -EFAULT;
524
525 return ocfs2_info_handle(inode, &info, 0);
177 default: 526 default:
178 return -ENOTTY; 527 return -ENOTTY;
179 } 528 }
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
185 bool preserve; 534 bool preserve;
186 struct reflink_arguments args; 535 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode; 536 struct inode *inode = file->f_path.dentry->d_inode;
537 struct ocfs2_info info;
188 538
189 switch (cmd) { 539 switch (cmd) {
190 case OCFS2_IOC32_GETFLAGS: 540 case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
209 559
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), 560 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve); 561 compat_ptr(args.new_path), preserve);
562 case OCFS2_IOC_INFO:
563 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
564 sizeof(struct ocfs2_info)))
565 return -EFAULT;
566
567 return ocfs2_info_handle(inode, &info, 1);
212 default: 568 default:
213 return -ENOIOCTLCMD; 569 return -ENOIOCTLCMD;
214 } 570 }
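
For orientation, here is a minimal userspace sketch of driving the OCFS2_IOC_INFO path added above. It assumes the struct and flag definitions from fs/ocfs2/ocfs2_ioctl.h are visible to the program and trims error handling; it is an illustration, not part of the patch.

#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
/* assumes struct ocfs2_info* and the OCFS2_INFO_* constants are in scope */

static int query_blocksize(int fd)
{
	struct ocfs2_info_blocksize bs = {
		.ib_req = {
			.ir_magic = OCFS2_INFO_MAGIC,
			.ir_code  = OCFS2_INFO_BLOCKSIZE,
			.ir_size  = sizeof(bs),	/* must match exactly, see the switch above */
		},
	};
	uint64_t reqs[1] = { (uint64_t)(unsigned long)&bs };
	struct ocfs2_info info = {
		.oi_requests = (uint64_t)(unsigned long)reqs,
		.oi_count    = 1,
	};

	if (ioctl(fd, OCFS2_IOC_INFO, &info) < 0)
		return -1;
	/* the kernel sets FL_FILLED only when it understood the request */
	if (bs.ib_req.ir_flags & OCFS2_INFO_FL_FILLED)
		printf("blocksize: %u\n", bs.ib_blocksize);
	return 0;
}
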
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 625de9d7088..faa2303dbf0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
301{ 301{
302 int status = 0; 302 int status = 0;
303 unsigned int flushed; 303 unsigned int flushed;
304 unsigned long old_id;
305 struct ocfs2_journal *journal = NULL; 304 struct ocfs2_journal *journal = NULL;
306 305
307 mlog_entry_void(); 306 mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
326 goto finally; 325 goto finally;
327 } 326 }
328 327
329 old_id = ocfs2_inc_trans_id(journal); 328 ocfs2_inc_trans_id(journal);
330 329
331 flushed = atomic_read(&journal->j_num_trans); 330 flushed = atomic_read(&journal->j_num_trans);
332 atomic_set(&journal->j_num_trans, 0); 331 atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
342 return status; 341 return status;
343} 342}
344 343
345/* pass it NULL and it will allocate a new handle object for you. If
346 * you pass it a handle however, it may still return error, in which
347 * case it has free'd the passed handle for you. */
348handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) 344handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
349{ 345{
350 journal_t *journal = osb->journal->j_journal; 346 journal_t *journal = osb->journal->j_journal;
@@ -760,13 +756,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
760 if (osb->osb_commit_interval) 756 if (osb->osb_commit_interval)
761 commit_interval = osb->osb_commit_interval; 757 commit_interval = osb->osb_commit_interval;
762 758
763 spin_lock(&journal->j_state_lock); 759 write_lock(&journal->j_state_lock);
764 journal->j_commit_interval = commit_interval; 760 journal->j_commit_interval = commit_interval;
765 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 761 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
766 journal->j_flags |= JBD2_BARRIER; 762 journal->j_flags |= JBD2_BARRIER;
767 else 763 else
768 journal->j_flags &= ~JBD2_BARRIER; 764 journal->j_flags &= ~JBD2_BARRIER;
769 spin_unlock(&journal->j_state_lock); 765 write_unlock(&journal->j_state_lock);
770} 766}
771 767
772int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) 768int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1888 1884
1889 os = &osb->osb_orphan_scan; 1885 os = &osb->osb_orphan_scan;
1890 1886
1887 mlog(0, "Begin orphan scan\n");
1888
1891 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 1889 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1892 goto out; 1890 goto out;
1893 1891
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1920unlock: 1918unlock:
1921 ocfs2_orphan_scan_unlock(osb, seqno); 1919 ocfs2_orphan_scan_unlock(osb, seqno);
1922out: 1920out:
1921 mlog(0, "Orphan scan completed\n");
1923 return; 1922 return;
1924} 1923}
1925 1924
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710..43e56b97f9c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
67 struct buffer_head *j_bh; /* Journal disk inode block */ 67 struct buffer_head *j_bh; /* Journal disk inode block */
68 atomic_t j_num_trans; /* Number of transactions 68 atomic_t j_num_trans; /* Number of transactions
69 * currently in the system. */ 69 * currently in the system. */
70 spinlock_t j_lock;
70 unsigned long j_trans_id; 71 unsigned long j_trans_id;
71 struct rw_semaphore j_trans_barrier; 72 struct rw_semaphore j_trans_barrier;
72 wait_queue_head_t j_checkpointed; 73 wait_queue_head_t j_checkpointed;
73 74
 74 spinlock_t j_lock; 75 /* both fields protected by j_lock */
75 struct list_head j_la_cleanups; 76 struct list_head j_la_cleanups;
76 struct work_struct j_recovery_work; 77 struct work_struct j_recovery_work;
77}; 78};
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af2b8fe1f13..7e32db9c2c9 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
59 return ret; 59 return ret;
60} 60}
61 61
62static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, 62static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
63 struct page *page) 63 struct page *page)
64{ 64{
65 int ret; 65 int ret;
66 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 67 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 68 loff_t pos = page_offset(page);
68 unsigned int len = PAGE_CACHE_SIZE; 69 unsigned int len = PAGE_CACHE_SIZE;
@@ -74,9 +75,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
74 /* 75 /*
75 * Another node might have truncated while we were waiting on 76 * Another node might have truncated while we were waiting on
76 * cluster locks. 77 * cluster locks.
78 * We don't check size == 0 before the shift. This is borrowed
79 * from do_generic_file_read.
77 */ 80 */
78 last_index = size >> PAGE_CACHE_SHIFT; 81 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
79 if (page->index > last_index) { 82 if (unlikely(!size || page->index > last_index)) {
80 ret = -EINVAL; 83 ret = -EINVAL;
81 goto out; 84 goto out;
82 } 85 }
@@ -107,9 +110,9 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
107 * because the "write" would invalidate their data. 110 * because the "write" would invalidate their data.
108 */ 111 */
109 if (page->index == last_index) 112 if (page->index == last_index)
110 len = size & ~PAGE_CACHE_MASK; 113 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
111 114
112 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, 115 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
113 &fsdata, di_bh, page); 116 &fsdata, di_bh, page);
114 if (ret) { 117 if (ret) {
115 if (ret != -ENOSPC) 118 if (ret != -ENOSPC)
@@ -157,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
157 */ 160 */
158 down_write(&OCFS2_I(inode)->ip_alloc_sem); 161 down_write(&OCFS2_I(inode)->ip_alloc_sem);
159 162
160 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 163 ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
161 164
162 up_write(&OCFS2_I(inode)->ip_alloc_sem); 165 up_write(&OCFS2_I(inode)->ip_alloc_sem);
163 166
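
The last_index/len rewrite in __ocfs2_page_mkwrite() above fixes an off-by-one when i_size is exactly page-aligned. A worked example of the boundary math, with illustrative values (PAGE_CACHE_SHIFT == 12, 4 KiB pages):

#include <assert.h>

static void boundary_math_example(void)
{
	long long size = 4096;	/* i_size == exactly one page */

	/* old code: last_index = size >> 12 == 1, so a fault on the
	 * nonexistent page index 1 passed the range check and then got
	 * len = size & (4096 - 1) == 0 -- a zero-length write_begin. */
	assert((size >> 12) == 1);

	/* new code: only page index 0 is valid, and it maps a full page. */
	assert(((size - 1) >> 12) == 0);
	assert((((size - 1) & (4096 - 1)) + 1) == 4096);
}
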
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f171b51a74f..ff5744e1e36 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -171,7 +171,8 @@ bail_add:
171 ret = ERR_PTR(status); 171 ret = ERR_PTR(status);
172 goto bail_unlock; 172 goto bail_unlock;
173 } 173 }
174 } 174 } else
175 ocfs2_dentry_attach_gen(dentry);
175 176
176bail_unlock: 177bail_unlock:
177 /* Don't drop the cluster lock until *after* the d_add -- 178 /* Don't drop the cluster lock until *after* the d_add --
@@ -472,32 +473,23 @@ leave:
472 return status; 473 return status;
473} 474}
474 475
475static int ocfs2_mknod_locked(struct ocfs2_super *osb, 476static int __ocfs2_mknod_locked(struct inode *dir,
476 struct inode *dir, 477 struct inode *inode,
477 struct inode *inode, 478 dev_t dev,
478 dev_t dev, 479 struct buffer_head **new_fe_bh,
479 struct buffer_head **new_fe_bh, 480 struct buffer_head *parent_fe_bh,
480 struct buffer_head *parent_fe_bh, 481 handle_t *handle,
481 handle_t *handle, 482 struct ocfs2_alloc_context *inode_ac,
482 struct ocfs2_alloc_context *inode_ac) 483 u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
483{ 484{
484 int status = 0; 485 int status = 0;
486 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
485 struct ocfs2_dinode *fe = NULL; 487 struct ocfs2_dinode *fe = NULL;
486 struct ocfs2_extent_list *fel; 488 struct ocfs2_extent_list *fel;
487 u64 suballoc_loc, fe_blkno = 0;
488 u16 suballoc_bit;
489 u16 feat; 489 u16 feat;
490 490
491 *new_fe_bh = NULL; 491 *new_fe_bh = NULL;
492 492
493 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
494 inode_ac, &suballoc_loc,
495 &suballoc_bit, &fe_blkno);
496 if (status < 0) {
497 mlog_errno(status);
498 goto leave;
499 }
500
501 /* populate as many fields early on as possible - many of 493 /* populate as many fields early on as possible - many of
502 * these are used by the support functions here and in 494 * these are used by the support functions here and in
503 * callers. */ 495 * callers. */
@@ -591,6 +583,34 @@ leave:
591 return status; 583 return status;
592} 584}
593 585
586static int ocfs2_mknod_locked(struct ocfs2_super *osb,
587 struct inode *dir,
588 struct inode *inode,
589 dev_t dev,
590 struct buffer_head **new_fe_bh,
591 struct buffer_head *parent_fe_bh,
592 handle_t *handle,
593 struct ocfs2_alloc_context *inode_ac)
594{
595 int status = 0;
596 u64 suballoc_loc, fe_blkno = 0;
597 u16 suballoc_bit;
598
599 *new_fe_bh = NULL;
600
601 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
602 inode_ac, &suballoc_loc,
603 &suballoc_bit, &fe_blkno);
604 if (status < 0) {
605 mlog_errno(status);
606 return status;
607 }
608
609 return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
610 parent_fe_bh, handle, inode_ac,
611 fe_blkno, suballoc_loc, suballoc_bit);
612}
613
594static int ocfs2_mkdir(struct inode *dir, 614static int ocfs2_mkdir(struct inode *dir,
595 struct dentry *dentry, 615 struct dentry *dentry,
596 int mode) 616 int mode)
@@ -722,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry,
722 goto out_commit; 742 goto out_commit;
723 } 743 }
724 744
725 atomic_inc(&inode->i_count); 745 ihold(inode);
726 dentry->d_op = &ocfs2_dentry_ops; 746 dentry->d_op = &ocfs2_dentry_ops;
727 d_instantiate(dentry, inode); 747 d_instantiate(dentry, inode);
728 748
@@ -1852,61 +1872,117 @@ bail:
1852 return status; 1872 return status;
1853} 1873}
1854 1874
1855static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 1875static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
1856 struct inode **ret_orphan_dir, 1876 struct inode **ret_orphan_dir,
1857 u64 blkno, 1877 struct buffer_head **ret_orphan_dir_bh)
1858 char *name,
1859 struct ocfs2_dir_lookup_result *lookup)
1860{ 1878{
1861 struct inode *orphan_dir_inode; 1879 struct inode *orphan_dir_inode;
1862 struct buffer_head *orphan_dir_bh = NULL; 1880 struct buffer_head *orphan_dir_bh = NULL;
1863 int status = 0; 1881 int ret = 0;
1864
1865 status = ocfs2_blkno_stringify(blkno, name);
1866 if (status < 0) {
1867 mlog_errno(status);
1868 return status;
1869 }
1870 1882
1871 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1883 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1872 ORPHAN_DIR_SYSTEM_INODE, 1884 ORPHAN_DIR_SYSTEM_INODE,
1873 osb->slot_num); 1885 osb->slot_num);
1874 if (!orphan_dir_inode) { 1886 if (!orphan_dir_inode) {
1875 status = -ENOENT; 1887 ret = -ENOENT;
1876 mlog_errno(status); 1888 mlog_errno(ret);
1877 return status; 1889 return ret;
1878 } 1890 }
1879 1891
1880 mutex_lock(&orphan_dir_inode->i_mutex); 1892 mutex_lock(&orphan_dir_inode->i_mutex);
1881 1893
1882 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1894 ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1883 if (status < 0) { 1895 if (ret < 0) {
1884 mlog_errno(status); 1896 mutex_unlock(&orphan_dir_inode->i_mutex);
1885 goto leave; 1897 iput(orphan_dir_inode);
1898
1899 mlog_errno(ret);
1900 return ret;
1886 } 1901 }
1887 1902
1888 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1903 *ret_orphan_dir = orphan_dir_inode;
1889 orphan_dir_bh, name, 1904 *ret_orphan_dir_bh = orphan_dir_bh;
1890 OCFS2_ORPHAN_NAMELEN, lookup);
1891 if (status < 0) {
1892 ocfs2_inode_unlock(orphan_dir_inode, 1);
1893 1905
1894 mlog_errno(status); 1906 return 0;
1895 goto leave; 1907}
1908
1909static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
1910 struct buffer_head *orphan_dir_bh,
1911 u64 blkno,
1912 char *name,
1913 struct ocfs2_dir_lookup_result *lookup)
1914{
1915 int ret;
1916 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
1917
1918 ret = ocfs2_blkno_stringify(blkno, name);
1919 if (ret < 0) {
1920 mlog_errno(ret);
1921 return ret;
1922 }
1923
1924 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1925 orphan_dir_bh, name,
1926 OCFS2_ORPHAN_NAMELEN, lookup);
1927 if (ret < 0) {
1928 mlog_errno(ret);
1929 return ret;
1930 }
1931
1932 return 0;
1933}
1934
1935/**
1936 * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
1937 * insertion of an orphan.
1938 * @osb: ocfs2 file system
1939 * @ret_orphan_dir: Orphan dir inode - returned locked!
1940 * @blkno: Actual block number of the inode to be inserted into orphan dir.
1941 * @lookup: dir lookup result, to be passed back into functions like
1942 * ocfs2_orphan_add
1943 *
 1944 * Returns zero on success; the ret_orphan_dir, name and lookup
 1945 * arguments will be populated.
1946 *
1947 * Returns non-zero on failure.
1948 */
1949static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1950 struct inode **ret_orphan_dir,
1951 u64 blkno,
1952 char *name,
1953 struct ocfs2_dir_lookup_result *lookup)
1954{
1955 struct inode *orphan_dir_inode = NULL;
1956 struct buffer_head *orphan_dir_bh = NULL;
1957 int ret = 0;
1958
1959 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
1960 &orphan_dir_bh);
1961 if (ret < 0) {
1962 mlog_errno(ret);
1963 return ret;
1964 }
1965
1966 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
1967 blkno, name, lookup);
1968 if (ret < 0) {
1969 mlog_errno(ret);
1970 goto out;
1896 } 1971 }
1897 1972
1898 *ret_orphan_dir = orphan_dir_inode; 1973 *ret_orphan_dir = orphan_dir_inode;
1899 1974
1900leave: 1975out:
1901 if (status) { 1976 brelse(orphan_dir_bh);
1977
1978 if (ret) {
1979 ocfs2_inode_unlock(orphan_dir_inode, 1);
1902 mutex_unlock(&orphan_dir_inode->i_mutex); 1980 mutex_unlock(&orphan_dir_inode->i_mutex);
1903 iput(orphan_dir_inode); 1981 iput(orphan_dir_inode);
1904 } 1982 }
1905 1983
1906 brelse(orphan_dir_bh); 1984 mlog_exit(ret);
1907 1985 return ret;
1908 mlog_exit(status);
1909 return status;
1910} 1986}
1911 1987
1912static int ocfs2_orphan_add(struct ocfs2_super *osb, 1988static int ocfs2_orphan_add(struct ocfs2_super *osb,
@@ -2053,6 +2129,99 @@ leave:
2053 return status; 2129 return status;
2054} 2130}
2055 2131
2132/**
 2133 * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
 2134 * allocated file. This is different from the typical 'add to orphan dir'
 2135 * operation in that the inode does not yet exist. This is a problem because
 2136 * the orphan dir stringifies the inode block number to come up with its
 2137 * dirent. Obviously, if the inode does not yet exist, we have a chicken-and-egg
 2138 * problem. This function works around it by calling deeper into the orphan
 2139 * and suballoc code than other callers. Use it only when necessary.
 2140 * @dir: The directory which this inode will ultimately wind up under - not the
 2141 * orphan dir!
 2142 * @dir_bh: buffer_head of the @dir inode block
 2143 * @orphan_name: string of length (OCFS2_ORPHAN_NAMELEN + 1). Will be filled
2144 * with the string to be used for orphan dirent. Pass back to the orphan dir
2145 * code.
2146 * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
2147 * dir code.
2148 * @ret_di_blkno: block number where the new inode will be allocated.
2149 * @orphan_insert: Dir insert context to be passed back into orphan dir code.
2150 * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
2151 *
 2152 * Returns zero on success; the ret_orphan_dir, name and lookup
 2153 * arguments will be populated.
2154 *
2155 * Returns non-zero on failure.
2156 */
2157static int ocfs2_prep_new_orphaned_file(struct inode *dir,
2158 struct buffer_head *dir_bh,
2159 char *orphan_name,
2160 struct inode **ret_orphan_dir,
2161 u64 *ret_di_blkno,
2162 struct ocfs2_dir_lookup_result *orphan_insert,
2163 struct ocfs2_alloc_context **ret_inode_ac)
2164{
2165 int ret;
2166 u64 di_blkno;
2167 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2168 struct inode *orphan_dir = NULL;
2169 struct buffer_head *orphan_dir_bh = NULL;
2170 struct ocfs2_alloc_context *inode_ac = NULL;
2171
2172 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
2173 if (ret < 0) {
2174 mlog_errno(ret);
2175 return ret;
2176 }
2177
2178 /* reserve an inode spot */
2179 ret = ocfs2_reserve_new_inode(osb, &inode_ac);
2180 if (ret < 0) {
2181 if (ret != -ENOSPC)
2182 mlog_errno(ret);
2183 goto out;
2184 }
2185
2186 ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
2187 &di_blkno);
2188 if (ret) {
2189 mlog_errno(ret);
2190 goto out;
2191 }
2192
2193 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
2194 di_blkno, orphan_name, orphan_insert);
2195 if (ret < 0) {
2196 mlog_errno(ret);
2197 goto out;
2198 }
2199
2200out:
2201 if (ret == 0) {
2202 *ret_orphan_dir = orphan_dir;
2203 *ret_di_blkno = di_blkno;
2204 *ret_inode_ac = inode_ac;
2205 /*
2206 * orphan_name and orphan_insert are already up to
2207 * date via prepare_orphan_dir
2208 */
2209 } else {
2210 /* Unroll reserve_new_inode* */
2211 if (inode_ac)
2212 ocfs2_free_alloc_context(inode_ac);
2213
2214 /* Unroll orphan dir locking */
2215 mutex_unlock(&orphan_dir->i_mutex);
2216 ocfs2_inode_unlock(orphan_dir, 1);
2217 iput(orphan_dir);
2218 }
2219
2220 brelse(orphan_dir_bh);
2221
 2222 return ret;
2223}
2224
2056int ocfs2_create_inode_in_orphan(struct inode *dir, 2225int ocfs2_create_inode_in_orphan(struct inode *dir,
2057 int mode, 2226 int mode,
2058 struct inode **new_inode) 2227 struct inode **new_inode)
@@ -2068,6 +2237,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2068 struct buffer_head *new_di_bh = NULL; 2237 struct buffer_head *new_di_bh = NULL;
2069 struct ocfs2_alloc_context *inode_ac = NULL; 2238 struct ocfs2_alloc_context *inode_ac = NULL;
2070 struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; 2239 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2240 u64 uninitialized_var(di_blkno), suballoc_loc;
2241 u16 suballoc_bit;
2071 2242
2072 status = ocfs2_inode_lock(dir, &parent_di_bh, 1); 2243 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2073 if (status < 0) { 2244 if (status < 0) {
@@ -2076,20 +2247,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2076 return status; 2247 return status;
2077 } 2248 }
2078 2249
2079 /* 2250 status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
2080 * We give the orphan dir the root blkno to fake an orphan name, 2251 orphan_name, &orphan_dir,
2081 * and allocate enough space for our insertion. 2252 &di_blkno, &orphan_insert, &inode_ac);
2082 */
2083 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
2084 osb->root_blkno,
2085 orphan_name, &orphan_insert);
2086 if (status < 0) {
2087 mlog_errno(status);
2088 goto leave;
2089 }
2090
2091 /* reserve an inode spot */
2092 status = ocfs2_reserve_new_inode(osb, &inode_ac);
2093 if (status < 0) { 2253 if (status < 0) {
2094 if (status != -ENOSPC) 2254 if (status != -ENOSPC)
2095 mlog_errno(status); 2255 mlog_errno(status);
@@ -2116,17 +2276,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2116 goto leave; 2276 goto leave;
2117 did_quota_inode = 1; 2277 did_quota_inode = 1;
2118 2278
2119 inode->i_nlink = 0; 2279 status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
2120 /* do the real work now. */ 2280 &suballoc_loc,
2121 status = ocfs2_mknod_locked(osb, dir, inode, 2281 &suballoc_bit, di_blkno);
2122 0, &new_di_bh, parent_di_bh, handle,
2123 inode_ac);
2124 if (status < 0) { 2282 if (status < 0) {
2125 mlog_errno(status); 2283 mlog_errno(status);
2126 goto leave; 2284 goto leave;
2127 } 2285 }
2128 2286
2129 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); 2287 inode->i_nlink = 0;
2288 /* do the real work now. */
2289 status = __ocfs2_mknod_locked(dir, inode,
2290 0, &new_di_bh, parent_di_bh, handle,
2291 inode_ac, di_blkno, suballoc_loc,
2292 suballoc_bit);
2130 if (status < 0) { 2293 if (status < 0) {
2131 mlog_errno(status); 2294 mlog_errno(status);
2132 goto leave; 2295 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a..d8408217e3b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
150struct ocfs2_lock_res { 150struct ocfs2_lock_res {
151 void *l_priv; 151 void *l_priv;
152 struct ocfs2_lock_res_ops *l_ops; 152 struct ocfs2_lock_res_ops *l_ops;
153 spinlock_t l_lock; 153
154 154
155 struct list_head l_blocked_list; 155 struct list_head l_blocked_list;
156 struct list_head l_mask_waiters; 156 struct list_head l_mask_waiters;
157 157
158 enum ocfs2_lock_type l_type;
159 unsigned long l_flags; 158 unsigned long l_flags;
160 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
161 int l_level;
162 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
163 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
164 struct ocfs2_dlm_lksb l_lksb; 162 unsigned char l_level;
163
164 /* Data packed - type enum ocfs2_lock_type */
165 unsigned char l_type;
165 166
166 /* used from AST/BAST funcs. */ 167 /* used from AST/BAST funcs. */
167 enum ocfs2_ast_action l_action; 168 /* Data packed - enum type ocfs2_ast_action */
168 enum ocfs2_unlock_action l_unlock_action; 169 unsigned char l_action;
169 int l_requested; 170 /* Data packed - enum type ocfs2_unlock_action */
170 int l_blocking; 171 unsigned char l_unlock_action;
172 unsigned char l_requested;
173 unsigned char l_blocking;
171 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
172 175
176 spinlock_t l_lock;
177
178 struct ocfs2_dlm_lksb l_lksb;
179
173 wait_queue_head_t l_event; 180 wait_queue_head_t l_event;
174 181
175 struct list_head l_debug_list; 182 struct list_head l_debug_list;
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state
243 250
244enum ocfs2_mount_options 251enum ocfs2_mount_options
245{ 252{
246 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ 253 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
247 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ 254 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
248 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 255 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
249 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 256 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +263,10 @@ enum ocfs2_mount_options
256 control lists */ 263 control lists */
257 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
258 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
266 OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
267 writes */
268 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
269 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
259}; 270};
260 271
261#define OCFS2_OSB_SOFT_RO 0x0001 272#define OCFS2_OSB_SOFT_RO 0x0001
@@ -277,7 +288,8 @@ struct ocfs2_super
277 struct super_block *sb; 288 struct super_block *sb;
278 struct inode *root_inode; 289 struct inode *root_inode;
279 struct inode *sys_root_inode; 290 struct inode *sys_root_inode;
280 struct inode *system_inodes[NUM_SYSTEM_INODES]; 291 struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
292 struct inode **local_system_inodes;
281 293
282 struct ocfs2_slot_info *slot_info; 294 struct ocfs2_slot_info *slot_info;
283 295
@@ -368,6 +380,8 @@ struct ocfs2_super
368 struct ocfs2_alloc_stats alloc_stats; 380 struct ocfs2_alloc_stats alloc_stats;
369 char dev_str[20]; /* "major,minor" of the device */ 381 char dev_str[20]; /* "major,minor" of the device */
370 382
383 u8 osb_stackflags;
384
371 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 385 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
372 struct ocfs2_cluster_connection *cconn; 386 struct ocfs2_cluster_connection *cconn;
373 struct ocfs2_lock_res osb_super_lockres; 387 struct ocfs2_lock_res osb_super_lockres;
@@ -601,10 +615,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
601 return ret; 615 return ret;
602} 616}
603 617
604static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) 618static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
605{ 619{
606 return (osb->s_feature_incompat & 620 return (osb->s_feature_incompat &
607 OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); 621 (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
622 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
623}
624
625static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
626{
627 if (ocfs2_clusterinfo_valid(osb) &&
628 memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
629 OCFS2_STACK_LABEL_LEN))
630 return 1;
631 return 0;
632}
633
634static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
635{
636 if (ocfs2_clusterinfo_valid(osb) &&
637 !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
638 OCFS2_STACK_LABEL_LEN))
639 return 1;
640 return 0;
641}
642
643static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
644{
645 return ocfs2_o2cb_stack(osb) &&
646 (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
608} 647}
609 648
610static inline int ocfs2_mount_local(struct ocfs2_super *osb) 649static inline int ocfs2_mount_local(struct ocfs2_super *osb)
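
The predicates added above split the old "userspace stack?" test into clusterinfo-aware variants. The following is a hypothetical composition showing how they might back mount-option validation; the policy shown is an assumption for illustration, not taken from this patch.

/* Hypothetical sketch: reject contradictory stack/heartbeat combinations. */
static int sketch_validate_stack(struct ocfs2_super *osb)
{
	/* heartbeat=global presumes the o2cb stack with the global flag */
	if ((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
	    !ocfs2_cluster_o2cb_global_heartbeat(osb))
		return -EINVAL;

	/* a userspace stack manages membership itself; no o2cb heartbeat */
	if (ocfs2_userspace_stack(osb) &&
	    (osb->s_mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)))
		return -EINVAL;

	return 0;
}
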
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 33f1c9a8258..c2e4f8222e2 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) 104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -170,6 +171,13 @@
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 171#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171 172
172/* 173/*
 174 * Incompat bit to indicate usable clusterinfo with stackflags for all
 175 * cluster stacks (userspace and o2cb). If this bit is set,
176 * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
177 */
178#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
179
180/*
173 * backup superblock flag is used to indicate that this volume 181 * backup superblock flag is used to indicate that this volume
174 * has backup superblocks. 182 * has backup superblocks.
175 */ 183 */
@@ -235,18 +243,31 @@
235#define OCFS2_HAS_REFCOUNT_FL (0x0010) 243#define OCFS2_HAS_REFCOUNT_FL (0x0010)
236 244
237/* Inode attributes, keep in sync with EXT2 */ 245/* Inode attributes, keep in sync with EXT2 */
238#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 246#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
239#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ 247#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
240#define OCFS2_COMPR_FL (0x00000004) /* Compress file */ 248#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
241#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */ 249#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
242#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */ 250#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
243#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */ 251#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
244#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */ 252#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
245#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */ 253#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
246#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */ 254/* Reserved for compression usage... */
247 255#define OCFS2_DIRTY_FL FS_DIRTY_FL
248#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ 256#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
249#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 257#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
258#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
259/* End compression flags --- maybe not all used */
260#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
261#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
262#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
263#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
264#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
265#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
266#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
267#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
268
269#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
270#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
250 271
251/* 272/*
252 * Extent record flags (e_node.leaf.flags) 273 * Extent record flags (e_node.leaf.flags)
@@ -279,10 +300,13 @@
279#define OCFS2_VOL_UUID_LEN 16 300#define OCFS2_VOL_UUID_LEN 16
280#define OCFS2_MAX_VOL_LABEL_LEN 64 301#define OCFS2_MAX_VOL_LABEL_LEN 64
281 302
282/* The alternate, userspace stack fields */ 303/* The cluster stack fields */
283#define OCFS2_STACK_LABEL_LEN 4 304#define OCFS2_STACK_LABEL_LEN 4
284#define OCFS2_CLUSTER_NAME_LEN 16 305#define OCFS2_CLUSTER_NAME_LEN 16
285 306
307/* Classic (historically speaking) cluster stack */
308#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
309
286/* Journal limits (in bytes) */ 310/* Journal limits (in bytes) */
287#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 311#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
288 312
@@ -292,6 +316,11 @@
292 */ 316 */
293#define OCFS2_MIN_XATTR_INLINE_SIZE 256 317#define OCFS2_MIN_XATTR_INLINE_SIZE 256
294 318
319/*
320 * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
321 */
322#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
323
295struct ocfs2_system_inode_info { 324struct ocfs2_system_inode_info {
296 char *si_name; 325 char *si_name;
297 int si_iflags; 326 int si_iflags;
@@ -309,6 +338,7 @@ enum {
309 USER_QUOTA_SYSTEM_INODE, 338 USER_QUOTA_SYSTEM_INODE,
310 GROUP_QUOTA_SYSTEM_INODE, 339 GROUP_QUOTA_SYSTEM_INODE,
311#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE 340#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
341#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
312 ORPHAN_DIR_SYSTEM_INODE, 342 ORPHAN_DIR_SYSTEM_INODE,
313 EXTENT_ALLOC_SYSTEM_INODE, 343 EXTENT_ALLOC_SYSTEM_INODE,
314 INODE_ALLOC_SYSTEM_INODE, 344 INODE_ALLOC_SYSTEM_INODE,
@@ -317,8 +347,12 @@ enum {
317 TRUNCATE_LOG_SYSTEM_INODE, 347 TRUNCATE_LOG_SYSTEM_INODE,
318 LOCAL_USER_QUOTA_SYSTEM_INODE, 348 LOCAL_USER_QUOTA_SYSTEM_INODE,
319 LOCAL_GROUP_QUOTA_SYSTEM_INODE, 349 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
320 NUM_SYSTEM_INODES 351 NUM_SYSTEM_INODES
321}; 352};
353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
354#define NUM_LOCAL_SYSTEM_INODES \
355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
322 356
323static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { 357static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
324 /* Global system inodes (single copy) */ 358 /* Global system inodes (single copy) */
@@ -347,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
347/* Parameter passed from mount.ocfs2 to module */ 381/* Parameter passed from mount.ocfs2 to module */
348#define OCFS2_HB_NONE "heartbeat=none" 382#define OCFS2_HB_NONE "heartbeat=none"
349#define OCFS2_HB_LOCAL "heartbeat=local" 383#define OCFS2_HB_LOCAL "heartbeat=local"
384#define OCFS2_HB_GLOBAL "heartbeat=global"
350 385
351/* 386/*
352 * OCFS2 directory file types. Only the low 3 bits are used. The 387 * OCFS2 directory file types. Only the low 3 bits are used. The
@@ -553,9 +588,21 @@ struct ocfs2_slot_map_extended {
553 */ 588 */
554}; 589};
555 590
591/*
592 * ci_stackflags is only valid if the incompat bit
593 * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
594 */
556struct ocfs2_cluster_info { 595struct ocfs2_cluster_info {
557/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; 596/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
558 __le32 ci_reserved; 597 union {
598 __le32 ci_reserved;
599 struct {
600 __u8 ci_stackflags;
601 __u8 ci_reserved1;
602 __u8 ci_reserved2;
603 __u8 ci_reserved3;
604 };
605 };
559/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; 606/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
560/*18*/ 607/*18*/
561}; 608};
@@ -592,9 +639,9 @@ struct ocfs2_super_block {
592 * group header */ 639 * group header */
593/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 640/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
594/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ 641/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
595/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace 642/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
596 stack. Only valid 643 userspace or clusterinfo
597 with INCOMPAT flag. */ 644 INCOMPAT flag set. */
598/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 645/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
599 for this fs*/ 646 for this fs*/
600 __le16 s_reserved0; 647 __le16 s_reserved0;
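
One detail of the ocfs2_cluster_info change above is easy to miss: the anonymous union lets ci_stackflags occupy the first byte of the old ci_reserved word, so the on-disk layout does not move. A small layout check, written as a sketch against the definitions above:

#include <assert.h>
#include <string.h>
/* assumes struct ocfs2_cluster_info and the O2CB flag from ocfs2_fs.h */

static void stackflags_layout_check(void)
{
	struct ocfs2_cluster_info ci;

	memset(&ci, 0, sizeof(ci));
	ci.ci_stackflags = OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT;

	/* the flags byte sits right after the 4-byte stack label, i.e. in
	 * the first byte of what used to be ci_reserved */
	assert(((unsigned char *)&ci)[OCFS2_STACK_LABEL_LEN] == 0x01);
}
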
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a8..b46f39bf743 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
23/* 23/*
24 * ioctl commands 24 * ioctl commands
25 */ 25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 26#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) 27#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) 28#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 29#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
30 30
31/* 31/*
32 * Space reservation / allocation / free ioctls and argument structure 32 * Space reservation / allocation / free ioctls and argument structure
@@ -76,4 +76,99 @@ struct reflink_arguments {
76}; 76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) 77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78 78
79/* Following definitions dedicated for ocfs2_info_request ioctls. */
80#define OCFS2_INFO_MAX_REQUEST (50)
81#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
82
83/* Magic number of all requests */
84#define OCFS2_INFO_MAGIC (0x4F32494E)
85
86/*
 87 * Always try to separate an info request into small pieces to
 88 * guarantee backward and forward compatibility.
89 */
90struct ocfs2_info {
91 __u64 oi_requests; /* Array of __u64 pointers to requests */
92 __u32 oi_count; /* Number of requests in info_requests */
93 __u32 oi_pad;
94};
95
96struct ocfs2_info_request {
97/*00*/ __u32 ir_magic; /* Magic number */
98 __u32 ir_code; /* Info request code */
99 __u32 ir_size; /* Size of request */
100 __u32 ir_flags; /* Request flags */
101/*10*/ /* Request specific fields */
102};
103
104struct ocfs2_info_clustersize {
105 struct ocfs2_info_request ic_req;
106 __u32 ic_clustersize;
107 __u32 ic_pad;
108};
109
110struct ocfs2_info_blocksize {
111 struct ocfs2_info_request ib_req;
112 __u32 ib_blocksize;
113 __u32 ib_pad;
114};
115
116struct ocfs2_info_maxslots {
117 struct ocfs2_info_request im_req;
118 __u32 im_max_slots;
119 __u32 im_pad;
120};
121
122struct ocfs2_info_label {
123 struct ocfs2_info_request il_req;
124 __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
125} __attribute__ ((packed));
126
127struct ocfs2_info_uuid {
128 struct ocfs2_info_request iu_req;
129 __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
130} __attribute__ ((packed));
131
132struct ocfs2_info_fs_features {
133 struct ocfs2_info_request if_req;
134 __u32 if_compat_features;
135 __u32 if_incompat_features;
136 __u32 if_ro_compat_features;
137 __u32 if_pad;
138};
139
140struct ocfs2_info_journal_size {
141 struct ocfs2_info_request ij_req;
142 __u64 ij_journal_size;
143};
144
145/* Codes for ocfs2_info_request */
146enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1,
148 OCFS2_INFO_BLOCKSIZE,
149 OCFS2_INFO_MAXSLOTS,
150 OCFS2_INFO_LABEL,
151 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE,
154 OCFS2_INFO_NUM_TYPES
155};
156
157/* Flags for struct ocfs2_info_request */
158/* Filled by the caller */
159#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
160 required. This is a hint.
161 It is up to ocfs2 whether
162 the request can be fulfilled
163 without locking. */
164/* Filled by ocfs2 */
165#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
166 this request and
167 filled in the answer */
168
169#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
170 request handling. */
171
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173
79#endif /* OCFS2_IOCTL_H */ 174#endif /* OCFS2_IOCTL_H */
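
Because ocfs2_info_handle_unknown() answers unrecognized codes by clearing OCFS2_INFO_FL_FILLED instead of failing, userspace can probe whether a running kernel understands a given request code. A hedged sketch follows; the probing pattern is an inference from the handler above, not documented API.

#include <stdint.h>
#include <sys/ioctl.h>
/* assumes the definitions from ocfs2_ioctl.h above */

static int kernel_understands(int fd, uint32_t code, uint32_t size)
{
	/* pad the buffer: known handlers copy back their full struct,
	 * not just the header */
	union {
		struct ocfs2_info_request hdr;
		unsigned char pad[256];
	} u = {
		.hdr = {
			.ir_magic = OCFS2_INFO_MAGIC,
			.ir_code  = code,	/* e.g. a code >= OCFS2_INFO_NUM_TYPES */
			.ir_size  = size,	/* known codes require the exact size */
		},
	};
	uint64_t reqs[1] = { (uint64_t)(unsigned long)&u };
	struct ocfs2_info info = {
		.oi_requests = (uint64_t)(unsigned long)reqs,
		.oi_count    = 1,
	};

	if (ioctl(fd, OCFS2_IOC_INFO, &info) < 0)
		return 0;
	/* unknown codes come back successfully but with FL_FILLED cleared */
	return !!(u.hdr.ir_flags & OCFS2_INFO_FL_FILLED);
}
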
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3ac5aa733e9..b5f9160e93e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
49 49
50struct ocfs2_cow_context { 50struct ocfs2_cow_context {
51 struct inode *inode; 51 struct inode *inode;
52 struct file *file;
52 u32 cow_start; 53 u32 cow_start;
53 u32 cow_len; 54 u32 cow_len;
54 struct ocfs2_extent_tree data_et; 55 struct ocfs2_extent_tree data_et;
@@ -2436,16 +2437,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2436 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2437 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2437 le32_to_cpu(rec.r_clusters)) - cpos; 2438 le32_to_cpu(rec.r_clusters)) - cpos;
2438 /* 2439 /*
2439 * If the refcount rec already exist, cool. We just need
2440 * to check whether there is a split. Otherwise we just need
2441 * to increase the refcount.
2442 * If we will insert one, increases recs_add.
2443 *
2444 * We record all the records which will be inserted to the 2440 * We record all the records which will be inserted to the
2445 * same refcount block, so that we can tell exactly whether 2441 * same refcount block, so that we can tell exactly whether
2446 * we need a new refcount block or not. 2442 * we need a new refcount block or not.
2443 *
 2444 * If we insert a new one, this is easy: it only happens while
 2445 * adding the refcounted flag to the extent, so there is no
 2446 * chance of splitting. We just need one record.
 2447 *
 2448 * If the refcount rec already exists, things are a little more
 2449 * complicated. We may have to:
 2450 * 1) split at the beginning if the start pos isn't aligned.
 2451 * We need 1 more record in this case.
 2452 * 2) split at the end if the end pos isn't aligned.
 2453 * We need 1 more record in this case.
 2454 * 3) split in the middle because of file system fragmentation.
 2455 * We need 2 more records in this case (we can't detect this
 2456 * beforehand, so always assume the worst case).
2447 */ 2457 */
2448 if (rec.r_refcount) { 2458 if (rec.r_refcount) {
2459 recs_add += 2;
2449 /* Check whether we need a split at the beginning. */ 2460 /* Check whether we need a split at the beginning. */
2450 if (cpos == start_cpos && 2461 if (cpos == start_cpos &&
2451 cpos != le64_to_cpu(rec.r_cpos)) 2462 cpos != le64_to_cpu(rec.r_cpos))
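
To make case 3 of the credit-estimate comment above concrete, a worked instance with illustrative numbers:

/*
 * Illustration of case 3 (middle split):
 *
 *   existing rec:  [0 .............................. 100)  r_refcount > 0
 *   CoW range:            [10 ...... 20)
 *
 * Bumping the refcount of [10, 20) alone forces the record apart into
 * [0, 10) + [10, 20) + [20, 100): two records more than before, which
 * is what the unconditional recs_add += 2 reserves before the begin/end
 * checks refine the estimate.
 */
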
@@ -2922,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2922 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2923 struct page *page; 2934 struct page *page;
2924 pgoff_t page_index; 2935 pgoff_t page_index;
2925 unsigned int from, to; 2936 unsigned int from, to, readahead_pages;
2926 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2927 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = context->inode->i_mapping;
2928 2939
2929 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2930 new_cluster, new_len, cpos); 2941 new_cluster, new_len, cpos);
2931 2942
2943 readahead_pages =
2944 (ocfs2_cow_contig_clusters(sb) <<
2945 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2932 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2946 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2933 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2947 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2934 /* 2948 /*
@@ -2950,7 +2964,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2950 if (map_end & (PAGE_CACHE_SIZE - 1)) 2964 if (map_end & (PAGE_CACHE_SIZE - 1))
2951 to = map_end & (PAGE_CACHE_SIZE - 1); 2965 to = map_end & (PAGE_CACHE_SIZE - 1);
2952 2966
2953 page = grab_cache_page(mapping, page_index); 2967 page = find_or_create_page(mapping, page_index, GFP_NOFS);
2954 2968
2955 /* 2969 /*
2956 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page 2970 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@ -2959,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2959 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2960 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2961 2975
2976 if (PageReadahead(page) && context->file) {
2977 page_cache_async_readahead(mapping,
2978 &context->file->f_ra,
2979 context->file,
2980 page, page_index,
2981 readahead_pages);
2982 }
2983
2962 if (!PageUptodate(page)) { 2984 if (!PageUptodate(page)) {
2963 ret = block_read_full_page(page, ocfs2_get_block); 2985 ret = block_read_full_page(page, ocfs2_get_block);
2964 if (ret) { 2986 if (ret) {
@@ -3169,7 +3191,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3169 if (map_end > end) 3191 if (map_end > end)
3170 map_end = end; 3192 map_end = end;
3171 3193
3172 page = grab_cache_page(context->inode->i_mapping, page_index); 3194 page = find_or_create_page(context->inode->i_mapping,
3195 page_index, GFP_NOFS);
3173 BUG_ON(!page); 3196 BUG_ON(!page);
3174 3197
3175 wait_on_page_writeback(page); 3198 wait_on_page_writeback(page);
@@ -3398,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3398 return ret; 3421 return ret;
3399} 3422}
3400 3423
3424static void ocfs2_readahead_for_cow(struct inode *inode,
3425 struct file *file,
3426 u32 start, u32 len)
3427{
3428 struct address_space *mapping;
3429 pgoff_t index;
3430 unsigned long num_pages;
3431 int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3432
3433 if (!file)
3434 return;
3435
3436 mapping = file->f_mapping;
3437 num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3438 if (!num_pages)
3439 num_pages = 1;
3440
3441 index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3442 page_cache_sync_readahead(mapping, &file->f_ra, file,
3443 index, num_pages);
3444}
3445
3401/* 3446/*
3402 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3447 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3403 * past max_cpos. This will stop when it runs into a hole or an 3448 * past max_cpos. This will stop when it runs into a hole or an
3404 * unrefcounted extent. 3449 * unrefcounted extent.
3405 */ 3450 */
3406static int ocfs2_refcount_cow_hunk(struct inode *inode, 3451static int ocfs2_refcount_cow_hunk(struct inode *inode,
3452 struct file *file,
3407 struct buffer_head *di_bh, 3453 struct buffer_head *di_bh,
3408 u32 cpos, u32 write_len, u32 max_cpos) 3454 u32 cpos, u32 write_len, u32 max_cpos)
3409{ 3455{
@@ -3432,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3432 3478
3433 BUG_ON(cow_len == 0); 3479 BUG_ON(cow_len == 0);
3434 3480
3481 ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3482
3435 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3483 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3436 if (!context) { 3484 if (!context) {
3437 ret = -ENOMEM; 3485 ret = -ENOMEM;
@@ -3453,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3453 context->ref_root_bh = ref_root_bh; 3501 context->ref_root_bh = ref_root_bh;
3454 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3502 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3455 context->get_clusters = ocfs2_di_get_clusters; 3503 context->get_clusters = ocfs2_di_get_clusters;
3504 context->file = file;
3456 3505
3457 ocfs2_init_dinode_extent_tree(&context->data_et, 3506 ocfs2_init_dinode_extent_tree(&context->data_et,
3458 INODE_CACHE(inode), di_bh); 3507 INODE_CACHE(inode), di_bh);
@@ -3481,6 +3530,7 @@ out:
3481 * clusters between cpos and cpos+write_len are safe to modify. 3530 * clusters between cpos and cpos+write_len are safe to modify.
3482 */ 3531 */
3483int ocfs2_refcount_cow(struct inode *inode, 3532int ocfs2_refcount_cow(struct inode *inode,
3533 struct file *file,
3484 struct buffer_head *di_bh, 3534 struct buffer_head *di_bh,
3485 u32 cpos, u32 write_len, u32 max_cpos) 3535 u32 cpos, u32 write_len, u32 max_cpos)
3486{ 3536{
@@ -3500,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
3500 num_clusters = write_len; 3550 num_clusters = write_len;
3501 3551
3502 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3552 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3503 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, 3553 ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
3504 num_clusters, max_cpos); 3554 num_clusters, max_cpos);
3505 if (ret) { 3555 if (ret) {
3506 mlog_errno(ret); 3556 mlog_errno(ret);
@@ -4190,8 +4240,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4190 goto out; 4240 goto out;
4191 } 4241 }
4192 4242
4193 mutex_lock(&new_inode->i_mutex); 4243 mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
4194 ret = ocfs2_inode_lock(new_inode, &new_bh, 1); 4244 ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4245 OI_LS_REFLINK_TARGET);
4195 if (ret) { 4246 if (ret) {
4196 mlog_errno(ret); 4247 mlog_errno(ret);
4197 goto out_unlock; 4248 goto out_unlock;
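
Both locks in this hunk gain nesting annotations because the reflink target inode is locked while the source inode's locks are already held, and lockdep would otherwise flag two locks of the same class as a self-deadlock. A condensed sketch of the idiom (the locking of old_inode is done by the caller and is shown here only for illustration):

	mutex_lock(&old_inode->i_mutex);			/* class X, subclass 0 */
	mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);	/* class X, subclass 1 */
	/* ... the cluster locks get the same treatment via
	 * ocfs2_inode_lock_nested() and OI_LS_REFLINK_TARGET ... */
	mutex_unlock(&new_inode->i_mutex);
	mutex_unlock(&old_inode->i_mutex);
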
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e..c8ce46f7d8e 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
21 struct rb_node rf_node; 21 struct rb_node rf_node;
22 u64 rf_blkno; 22 u64 rf_blkno;
23 u32 rf_generation; 23 u32 rf_generation;
24 struct kref rf_getcnt;
24 struct rw_semaphore rf_sem; 25 struct rw_semaphore rf_sem;
25 struct ocfs2_lock_res rf_lockres; 26 struct ocfs2_lock_res rf_lockres;
26 struct kref rf_getcnt;
27 int rf_removed; 27 int rf_removed;
28 28
29 /* the following 4 fields are used by caching_info. */ 29 /* the following 4 fields are used by caching_info. */
30 struct ocfs2_caching_info rf_ci;
31 spinlock_t rf_lock; 30 spinlock_t rf_lock;
31 struct ocfs2_caching_info rf_ci;
32 struct mutex rf_io_mutex; 32 struct mutex rf_io_mutex;
33 struct super_block *rf_sb; 33 struct super_block *rf_sb;
34}; 34};
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 int *ref_blocks); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode,
56 struct file *filep, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 57 u32 cpos, u32 write_len, u32 max_cpos);
57 58
58typedef int (ocfs2_post_refcount_func)(struct inode *inode, 59typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index d8b6e4259b8..3e78db361bc 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
732 struct ocfs2_alloc_reservation *resv, 732 struct ocfs2_alloc_reservation *resv,
733 int *cstart, int *clen) 733 int *cstart, int *clen)
734{ 734{
735 unsigned int wanted = *clen;
736
737 if (resv == NULL || ocfs2_resmap_disabled(resmap)) 735 if (resv == NULL || ocfs2_resmap_disabled(resmap))
738 return -ENOSPC; 736 return -ENOSPC;
739 737
740 spin_lock(&resv_lock); 738 spin_lock(&resv_lock);
741 739
742 /*
743 * We don't want to over-allocate for temporary
744 * windows. Otherwise, we run the risk of fragmenting the
745 * allocation space.
746 */
747 wanted = ocfs2_resv_window_bits(resmap, resv);
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 if (ocfs2_resv_empty(resv)) { 740 if (ocfs2_resv_empty(resv)) {
752 mlog(0, "empty reservation, find new window\n"); 741 /*
742 * We don't want to over-allocate for temporary
743 * windows. Otherwise, we run the risk of fragmenting the
744 * allocation space.
745 */
746 unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
753 747
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 mlog(0, "empty reservation, find new window\n");
754 /* 752 /*
755 * Try to get a window here. If it works, we must fall 753 * Try to get a window here. If it works, we must fall
756 * through and test the bitmap. This avoids some 754 * through and test the bitmap. This avoids some
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949..ab4e0172cc1 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
357{ 357{
358 int status = 0; 358 int status = 0;
359 u64 blkno; 359 u64 blkno;
360 unsigned long long blocks, bytes; 360 unsigned long long blocks, bytes = 0;
361 unsigned int i; 361 unsigned int i;
362 struct buffer_head *bh; 362 struct buffer_head *bh;
363 363
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f696c..19965b00c43 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
283 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
284 * in the heartbeat universe */ 284 * in the heartbeat universe */
285 if (!o2hb_check_local_node_heartbeating()) { 285 if (!o2hb_check_local_node_heartbeating()) {
286 if (o2hb_global_heartbeat_active())
287 mlog(ML_ERROR, "Global heartbeat not started\n");
286 rc = -EINVAL; 288 rc = -EINVAL;
287 goto out; 289 goto out;
288 } 290 }
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca068..252e7c82f92 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/smp_lock.h>
26#include <linux/reboot.h> 25#include <linux/reboot.h>
27#include <asm/uaccess.h> 26#include <asm/uaccess.h>
28 27
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
612 return -ENOMEM; 611 return -ENOMEM;
613 p->op_this_node = -1; 612 p->op_this_node = -1;
614 613
615 lock_kernel();
616 mutex_lock(&ocfs2_control_lock); 614 mutex_lock(&ocfs2_control_lock);
617 file->private_data = p; 615 file->private_data = p;
618 list_add(&p->op_list, &ocfs2_control_private_list); 616 list_add(&p->op_list, &ocfs2_control_private_list);
619 mutex_unlock(&ocfs2_control_lock); 617 mutex_unlock(&ocfs2_control_lock);
620 unlock_kernel();
621 618
622 return 0; 619 return 0;
623} 620}
@@ -628,6 +625,7 @@ static const struct file_operations ocfs2_control_fops = {
628 .read = ocfs2_control_read, 625 .read = ocfs2_control_read,
629 .write = ocfs2_control_write, 626 .write = ocfs2_control_write,
630 .owner = THIS_MODULE, 627 .owner = THIS_MODULE,
628 .llseek = default_llseek,
631}; 629};
632 630
633static struct miscdevice ocfs2_control_device = { 631static struct miscdevice ocfs2_control_device = {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a8e6a95a353..5fed60de763 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -57,11 +57,28 @@ struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set 57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is 58 to 0 when a block group is
59 contiguous. */ 59 contiguous. */
60 u64 sr_bg_stable_blkno; /*
61 * Doesn't change, always
62 * set to target block
63 * group descriptor
64 * block.
65 */
60 u64 sr_blkno; /* The first allocated block */ 66 u64 sr_blkno; /* The first allocated block */
61 unsigned int sr_bit_offset; /* The bit in the bg */ 67 unsigned int sr_bit_offset; /* The bit in the bg */
62 unsigned int sr_bits; /* How many bits we claimed */ 68 unsigned int sr_bits; /* How many bits we claimed */
63}; 69};
64 70
71static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72{
73 if (res->sr_blkno == 0)
74 return 0;
75
76 if (res->sr_bg_blkno)
77 return res->sr_bg_blkno;
78
79 return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80}
81
65static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 82static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
66static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 83static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
67static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 84static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
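
ocfs2_group_from_res() recovers a usable allocation hint even when sr_bg_blkno was cleared for a contiguous group. The fallback relies on each suballocator bit covering one block, so the group descriptor sits bit_offset blocks below the allocated block. A user-space restatement (assuming that standard definition of ocfs2_which_suballoc_group(); the numbers are illustrative):

#include <stdio.h>
#include <stdint.h>

/* assumed equivalent of ocfs2_which_suballoc_group() */
static uint64_t which_suballoc_group(uint64_t block, unsigned int bit)
{
	return block - bit;
}

int main(void)
{
	/* a block allocated at blkno 5072, bit 72 of its group:
	 * the group descriptor lives at block 5000 */
	printf("group = %llu\n",
	       (unsigned long long)which_suballoc_group(5072, 72));
	return 0;
}
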
@@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
138 brelse(ac->ac_bh); 155 brelse(ac->ac_bh);
139 ac->ac_bh = NULL; 156 ac->ac_bh = NULL;
140 ac->ac_resv = NULL; 157 ac->ac_resv = NULL;
158 if (ac->ac_find_loc_priv) {
159 kfree(ac->ac_find_loc_priv);
160 ac->ac_find_loc_priv = NULL;
161 }
141} 162}
142 163
143void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 164void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -336,7 +357,7 @@ out:
336static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, 357static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337 struct ocfs2_group_desc *bg, 358 struct ocfs2_group_desc *bg,
338 struct ocfs2_chain_list *cl, 359 struct ocfs2_chain_list *cl,
339 u64 p_blkno, u32 clusters) 360 u64 p_blkno, unsigned int clusters)
340{ 361{
341 struct ocfs2_extent_list *el = &bg->bg_list; 362 struct ocfs2_extent_list *el = &bg->bg_list;
342 struct ocfs2_extent_rec *rec; 363 struct ocfs2_extent_rec *rec;
@@ -348,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
348 rec->e_blkno = cpu_to_le64(p_blkno); 369 rec->e_blkno = cpu_to_le64(p_blkno);
349 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / 370 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350 le16_to_cpu(cl->cl_bpc)); 371 le16_to_cpu(cl->cl_bpc));
351 rec->e_leaf_clusters = cpu_to_le32(clusters); 372 rec->e_leaf_clusters = cpu_to_le16(clusters);
352 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); 373 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353 le16_add_cpu(&bg->bg_free_bits_count, 374 le16_add_cpu(&bg->bg_free_bits_count,
354 clusters * le16_to_cpu(cl->cl_bpc)); 375 clusters * le16_to_cpu(cl->cl_bpc));
@@ -1359,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1359 } 1380 }
1360 1381
1361 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1383 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1384 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1385 " count %u but claims %u are freed. num_bits %d",
1386 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1387 le16_to_cpu(bg->bg_bits),
1388 le16_to_cpu(bg->bg_free_bits_count), num_bits);
1389 return -EROFS;
1390 }
1362 while(num_bits--) 1391 while(num_bits--)
1363 ocfs2_set_bit(bit_off++, bitmap); 1392 ocfs2_set_bit(bit_off++, bitmap);
1364 1393
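
The new check (mirrored in ocfs2_block_group_clear_bits() further down) enforces the basic group-descriptor invariant: the free count can never exceed the total bit count, and a violation means on-disk corruption, hence -EROFS. Reduced to its essentials:

#include <stdio.h>

#define EROFS 30

/* returns -EROFS when the descriptor is self-inconsistent */
static int check_group_counts(unsigned int bg_bits, unsigned int bg_free)
{
	if (bg_free > bg_bits)
		return -EROFS;
	return 0;
}

int main(void)
{
	printf("%d\n", check_group_counts(32256, 16128));	/* 0: sane    */
	printf("%d\n", check_group_counts(32256, 32257));	/* -30: EROFS */
	return 0;
}
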
@@ -1678,6 +1707,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1678 if (!ret) 1707 if (!ret)
1679 ocfs2_bg_discontig_fix_result(ac, gd, res); 1708 ocfs2_bg_discontig_fix_result(ac, gd, res);
1680 1709
1710 /*
1711 * sr_bg_blkno might have been changed by
1712 * ocfs2_bg_discontig_fix_result
1713 */
1714 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1715
1716 if (ac->ac_find_loc_only)
1717 goto out_loc_only;
1718
1681 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1719 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1682 res->sr_bits, 1720 res->sr_bits,
1683 le16_to_cpu(gd->bg_chain)); 1721 le16_to_cpu(gd->bg_chain));
@@ -1691,6 +1729,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1691 if (ret < 0) 1729 if (ret < 0)
1692 mlog_errno(ret); 1730 mlog_errno(ret);
1693 1731
1732out_loc_only:
1694 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1733 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1695 1734
1696out: 1735out:
@@ -1708,7 +1747,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1708{ 1747{
1709 int status; 1748 int status;
1710 u16 chain; 1749 u16 chain;
1711 u32 tmp_used;
1712 u64 next_group; 1750 u64 next_group;
1713 struct inode *alloc_inode = ac->ac_inode; 1751 struct inode *alloc_inode = ac->ac_inode;
1714 struct buffer_head *group_bh = NULL; 1752 struct buffer_head *group_bh = NULL;
@@ -1770,6 +1808,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1770 if (!status) 1808 if (!status)
1771 ocfs2_bg_discontig_fix_result(ac, bg, res); 1809 ocfs2_bg_discontig_fix_result(ac, bg, res);
1772 1810
1811 /*
1812 * sr_bg_blkno might have been changed by
1813 * ocfs2_bg_discontig_fix_result
1814 */
1815 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1773 1816
1774 /* 1817 /*
1775 * Keep track of previous block descriptor read. When 1818 * Keep track of previous block descriptor read. When
@@ -1796,22 +1839,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1796 } 1839 }
1797 } 1840 }
1798 1841
1799 /* Ok, claim our bits now: set the info on dinode, chainlist 1842 if (ac->ac_find_loc_only)
1800 * and then the group */ 1843 goto out_loc_only;
1801 status = ocfs2_journal_access_di(handle, 1844
1802 INODE_CACHE(alloc_inode), 1845 status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1803 ac->ac_bh, 1846 ac->ac_bh, res->sr_bits,
1804 OCFS2_JOURNAL_ACCESS_WRITE); 1847 chain);
1805 if (status < 0) { 1848 if (status) {
1806 mlog_errno(status); 1849 mlog_errno(status);
1807 goto bail; 1850 goto bail;
1808 } 1851 }
1809 1852
1810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1811 fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1812 le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1813 ocfs2_journal_dirty(handle, ac->ac_bh);
1814
1815 status = ocfs2_block_group_set_bits(handle, 1853 status = ocfs2_block_group_set_bits(handle,
1816 alloc_inode, 1854 alloc_inode,
1817 bg, 1855 bg,
@@ -1826,6 +1864,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1826 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, 1864 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1827 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1865 (unsigned long long)le64_to_cpu(fe->i_blkno));
1828 1866
1867out_loc_only:
1829 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1868 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1830bail: 1869bail:
1831 brelse(group_bh); 1870 brelse(group_bh);
@@ -1845,6 +1884,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1845 int status; 1884 int status;
1846 u16 victim, i; 1885 u16 victim, i;
1847 u16 bits_left = 0; 1886 u16 bits_left = 0;
1887 u64 hint = ac->ac_last_group;
1848 struct ocfs2_chain_list *cl; 1888 struct ocfs2_chain_list *cl;
1849 struct ocfs2_dinode *fe; 1889 struct ocfs2_dinode *fe;
1850 1890
@@ -1872,7 +1912,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1872 goto bail; 1912 goto bail;
1873 } 1913 }
1874 1914
1875 res->sr_bg_blkno = ac->ac_last_group; 1915 res->sr_bg_blkno = hint;
1876 if (res->sr_bg_blkno) { 1916 if (res->sr_bg_blkno) {
1877 /* Attempt to short-circuit the usual search mechanism 1917 /* Attempt to short-circuit the usual search mechanism
1878 * by jumping straight to the most recently used 1918 * by jumping straight to the most recently used
@@ -1896,8 +1936,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1896 1936
1897 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1937 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1898 res, &bits_left); 1938 res, &bits_left);
1899 if (!status) 1939 if (!status) {
1940 hint = ocfs2_group_from_res(res);
1900 goto set_hint; 1941 goto set_hint;
1942 }
1901 if (status < 0 && status != -ENOSPC) { 1943 if (status < 0 && status != -ENOSPC) {
1902 mlog_errno(status); 1944 mlog_errno(status);
1903 goto bail; 1945 goto bail;
@@ -1920,8 +1962,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1920 ac->ac_chain = i; 1962 ac->ac_chain = i;
1921 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1963 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1922 res, &bits_left); 1964 res, &bits_left);
1923 if (!status) 1965 if (!status) {
1966 hint = ocfs2_group_from_res(res);
1924 break; 1967 break;
1968 }
1925 if (status < 0 && status != -ENOSPC) { 1969 if (status < 0 && status != -ENOSPC) {
1926 mlog_errno(status); 1970 mlog_errno(status);
1927 goto bail; 1971 goto bail;
@@ -1936,7 +1980,7 @@ set_hint:
1936 if (bits_left < min_bits) 1980 if (bits_left < min_bits)
1937 ac->ac_last_group = 0; 1981 ac->ac_last_group = 0;
1938 else 1982 else
1939 ac->ac_last_group = res->sr_bg_blkno; 1983 ac->ac_last_group = hint;
1940 } 1984 }
1941 1985
1942bail: 1986bail:
@@ -2016,6 +2060,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2016 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2060 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2017} 2061}
2018 2062
2063int ocfs2_find_new_inode_loc(struct inode *dir,
2064 struct buffer_head *parent_fe_bh,
2065 struct ocfs2_alloc_context *ac,
2066 u64 *fe_blkno)
2067{
2068 int ret;
2069 handle_t *handle = NULL;
2070 struct ocfs2_suballoc_result *res;
2071
2072 BUG_ON(!ac);
2073 BUG_ON(ac->ac_bits_given != 0);
2074 BUG_ON(ac->ac_bits_wanted != 1);
2075 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2076
2077 res = kzalloc(sizeof(*res), GFP_NOFS);
2078 if (res == NULL) {
2079 ret = -ENOMEM;
2080 mlog_errno(ret);
2081 goto out;
2082 }
2083
2084 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2085
2086 /*
2087 * The handle started here is for chain relink. Alternatively,
2088 * we could just disable relink for these calls.
2089 */
2090 handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2091 if (IS_ERR(handle)) {
2092 ret = PTR_ERR(handle);
2093 handle = NULL;
2094 mlog_errno(ret);
2095 goto out;
2096 }
2097
2098 /*
2099 * This will instruct ocfs2_claim_suballoc_bits and
2100 * ocfs2_search_one_group to search but save actual allocation
2101 * for later.
2102 */
2103 ac->ac_find_loc_only = 1;
2104
2105 ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2106 if (ret < 0) {
2107 mlog_errno(ret);
2108 goto out;
2109 }
2110
2111 ac->ac_find_loc_priv = res;
2112 *fe_blkno = res->sr_blkno;
2113
2114out:
2115 if (handle)
2116 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2117
2118 if (ret)
2119 kfree(res);
2120
2121 return ret;
2122}
2123
2124int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2125 struct inode *dir,
2126 struct ocfs2_alloc_context *ac,
2127 u64 *suballoc_loc,
2128 u16 *suballoc_bit,
2129 u64 di_blkno)
2130{
2131 int ret;
2132 u16 chain;
2133 struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2134 struct buffer_head *bg_bh = NULL;
2135 struct ocfs2_group_desc *bg;
2136 struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2137
2138 /*
2139 * Since di_blkno is being passed back in, we check for any
2140 * inconsistencies which may have happened between
2141 * calls. These are code bugs as di_blkno is not expected to
2142 * change once returned from ocfs2_find_new_inode_loc()
2143 */
2144 BUG_ON(res->sr_blkno != di_blkno);
2145
2146 ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2147 res->sr_bg_stable_blkno, &bg_bh);
2148 if (ret) {
2149 mlog_errno(ret);
2150 goto out;
2151 }
2152
2153 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2154 chain = le16_to_cpu(bg->bg_chain);
2155
2156 ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2157 ac->ac_bh, res->sr_bits,
2158 chain);
2159 if (ret) {
2160 mlog_errno(ret);
2161 goto out;
2162 }
2163
2164 ret = ocfs2_block_group_set_bits(handle,
2165 ac->ac_inode,
2166 bg,
2167 bg_bh,
2168 res->sr_bit_offset,
2169 res->sr_bits);
2170 if (ret < 0) {
2171 mlog_errno(ret);
2172 goto out;
2173 }
2174
2175 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
2176 (unsigned long long)di_blkno);
2177
2178 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2179
2180 BUG_ON(res->sr_bits != 1);
2181
2182 *suballoc_loc = res->sr_bg_blkno;
2183 *suballoc_bit = res->sr_bit_offset;
2184 ac->ac_bits_given++;
2185 ocfs2_save_inode_ac_group(dir, ac);
2186
2187out:
2188 brelse(bg_bh);
2189
2190 return ret;
2191}
2192
2019int ocfs2_claim_new_inode(handle_t *handle, 2193int ocfs2_claim_new_inode(handle_t *handle,
2020 struct inode *dir, 2194 struct inode *dir,
2021 struct buffer_head *parent_fe_bh, 2195 struct buffer_head *parent_fe_bh,
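
The pair of functions added above splits inode allocation into a find phase and a claim phase, so ocfs2_create_inode_in_orphan() can learn the new inode's block number before committing to the real transaction. A sketch of the intended calling sequence (local variable names here are illustrative):

	u64 di_blkno;
	u64 suballoc_loc;
	u16 suballoc_bit;

	/* phase 1: search only; the result is cached in
	 * ac->ac_find_loc_priv and no bits are set on disk */
	ret = ocfs2_find_new_inode_loc(dir, parent_fe_bh, inode_ac,
				       &di_blkno);

	/* ... take cluster locks, start the real transaction ... */

	/* phase 2: replay the cached result and actually claim the bit;
	 * the BUG_ON fires if di_blkno changed between the two calls */
	ret = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
					   &suballoc_loc, &suballoc_bit,
					   di_blkno);
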
@@ -2253,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2253 (unsigned long *) undo_bg->bg_bitmap); 2427 (unsigned long *) undo_bg->bg_bitmap);
2254 } 2428 }
2255 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2429 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2430 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2431 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2432 " count %u but claims %u are freed. num_bits %d",
2433 (unsigned long long)le64_to_cpu(bg->bg_blkno),
2434 le16_to_cpu(bg->bg_bits),
2435 le16_to_cpu(bg->bg_free_bits_count), num_bits);
2436 return -EROFS;
2437 }
2256 2438
2257 if (undo_fn) 2439 if (undo_fn)
2258 jbd_unlock_bh_state(group_bh); 2440 jbd_unlock_bh_state(group_bh);
@@ -2567,7 +2749,8 @@ out:
2567 * suballoc_bit. 2749 * suballoc_bit.
2568 */ 2750 */
2569static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 2751static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2570 u16 *suballoc_slot, u16 *suballoc_bit) 2752 u16 *suballoc_slot, u64 *group_blkno,
2753 u16 *suballoc_bit)
2571{ 2754{
2572 int status; 2755 int status;
2573 struct buffer_head *inode_bh = NULL; 2756 struct buffer_head *inode_bh = NULL;
@@ -2604,6 +2787,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2604 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); 2787 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2605 if (suballoc_bit) 2788 if (suballoc_bit)
2606 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 2789 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2790 if (group_blkno)
2791 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2607 2792
2608bail: 2793bail:
2609 brelse(inode_bh); 2794 brelse(inode_bh);
@@ -2621,7 +2806,8 @@ bail:
2621 */ 2806 */
2622static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, 2807static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2623 struct inode *suballoc, 2808 struct inode *suballoc,
2624 struct buffer_head *alloc_bh, u64 blkno, 2809 struct buffer_head *alloc_bh,
2810 u64 group_blkno, u64 blkno,
2625 u16 bit, int *res) 2811 u16 bit, int *res)
2626{ 2812{
2627 struct ocfs2_dinode *alloc_di; 2813 struct ocfs2_dinode *alloc_di;
@@ -2642,10 +2828,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2642 goto bail; 2828 goto bail;
2643 } 2829 }
2644 2830
2645 if (alloc_di->i_suballoc_loc) 2831 bg_blkno = group_blkno ? group_blkno :
2646 bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); 2832 ocfs2_which_suballoc_group(blkno, bit);
2647 else
2648 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, 2833 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2650 &group_bh); 2834 &group_bh);
2651 if (status < 0) { 2835 if (status < 0) {
@@ -2680,6 +2864,7 @@ bail:
2680int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 2864int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2681{ 2865{
2682 int status; 2866 int status;
2867 u64 group_blkno = 0;
2683 u16 suballoc_bit = 0, suballoc_slot = 0; 2868 u16 suballoc_bit = 0, suballoc_slot = 0;
2684 struct inode *inode_alloc_inode; 2869 struct inode *inode_alloc_inode;
2685 struct buffer_head *alloc_bh = NULL; 2870 struct buffer_head *alloc_bh = NULL;
@@ -2687,7 +2872,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2687 mlog_entry("blkno: %llu", (unsigned long long)blkno); 2872 mlog_entry("blkno: %llu", (unsigned long long)blkno);
2688 2873
2689 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2874 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2690 &suballoc_bit); 2875 &group_blkno, &suballoc_bit);
2691 if (status < 0) { 2876 if (status < 0) {
2692 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 2877 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2693 goto bail; 2878 goto bail;
@@ -2715,7 +2900,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2715 } 2900 }
2716 2901
2717 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 2902 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2718 blkno, suballoc_bit, res); 2903 group_blkno, blkno, suballoc_bit, res);
2719 if (status < 0) 2904 if (status < 0)
2720 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 2905 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2721 2906
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a017dd3ee7d..b8afabfeede 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,9 @@ struct ocfs2_alloc_context {
56 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
57 is the same as ~0 - unlimited */ 57 is the same as ~0 - unlimited */
58 58
59 int ac_find_loc_only; /* hack for reflink operation ordering */
60 struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
61
59 struct ocfs2_alloc_reservation *ac_resv; 62 struct ocfs2_alloc_reservation *ac_resv;
60}; 63};
61 64
@@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
197 struct ocfs2_alloc_context **meta_ac); 200 struct ocfs2_alloc_context **meta_ac);
198 201
199int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); 202int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
203
204
205
206/*
207 * The following two interfaces are for ocfs2_create_inode_in_orphan().
208 */
209int ocfs2_find_new_inode_loc(struct inode *dir,
210 struct buffer_head *parent_fe_bh,
211 struct ocfs2_alloc_context *ac,
212 u64 *fe_blkno);
213
214int ocfs2_claim_new_inode_at_loc(handle_t *handle,
215 struct inode *dir,
216 struct ocfs2_alloc_context *ac,
217 u64 *suballoc_loc,
218 u16 *suballoc_bit,
219 u64 di_blkno);
220
200#endif /* _CHAINALLOC_H_ */ 221#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0eaa929a4db..f02c0ef3157 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -145,8 +145,7 @@ static const struct super_operations ocfs2_sops = {
145 .alloc_inode = ocfs2_alloc_inode, 145 .alloc_inode = ocfs2_alloc_inode,
146 .destroy_inode = ocfs2_destroy_inode, 146 .destroy_inode = ocfs2_destroy_inode,
147 .drop_inode = ocfs2_drop_inode, 147 .drop_inode = ocfs2_drop_inode,
148 .clear_inode = ocfs2_clear_inode, 148 .evict_inode = ocfs2_evict_inode,
149 .delete_inode = ocfs2_delete_inode,
150 .sync_fs = ocfs2_sync_fs, 149 .sync_fs = ocfs2_sync_fs,
151 .put_super = ocfs2_put_super, 150 .put_super = ocfs2_put_super,
152 .remount_fs = ocfs2_remount, 151 .remount_fs = ocfs2_remount,
@@ -163,6 +162,7 @@ enum {
163 Opt_nointr, 162 Opt_nointr,
164 Opt_hb_none, 163 Opt_hb_none,
165 Opt_hb_local, 164 Opt_hb_local,
165 Opt_hb_global,
166 Opt_data_ordered, 166 Opt_data_ordered,
167 Opt_data_writeback, 167 Opt_data_writeback,
168 Opt_atime_quantum, 168 Opt_atime_quantum,
@@ -178,6 +178,8 @@ enum {
178 Opt_noacl, 178 Opt_noacl,
179 Opt_usrquota, 179 Opt_usrquota,
180 Opt_grpquota, 180 Opt_grpquota,
181 Opt_coherency_buffered,
182 Opt_coherency_full,
181 Opt_resv_level, 183 Opt_resv_level,
182 Opt_dir_resv_level, 184 Opt_dir_resv_level,
183 Opt_err, 185 Opt_err,
@@ -191,6 +193,7 @@ static const match_table_t tokens = {
191 {Opt_nointr, "nointr"}, 193 {Opt_nointr, "nointr"},
192 {Opt_hb_none, OCFS2_HB_NONE}, 194 {Opt_hb_none, OCFS2_HB_NONE},
193 {Opt_hb_local, OCFS2_HB_LOCAL}, 195 {Opt_hb_local, OCFS2_HB_LOCAL},
196 {Opt_hb_global, OCFS2_HB_GLOBAL},
194 {Opt_data_ordered, "data=ordered"}, 197 {Opt_data_ordered, "data=ordered"},
195 {Opt_data_writeback, "data=writeback"}, 198 {Opt_data_writeback, "data=writeback"},
196 {Opt_atime_quantum, "atime_quantum=%u"}, 199 {Opt_atime_quantum, "atime_quantum=%u"},
@@ -206,6 +209,8 @@ static const match_table_t tokens = {
206 {Opt_noacl, "noacl"}, 209 {Opt_noacl, "noacl"},
207 {Opt_usrquota, "usrquota"}, 210 {Opt_usrquota, "usrquota"},
208 {Opt_grpquota, "grpquota"}, 211 {Opt_grpquota, "grpquota"},
212 {Opt_coherency_buffered, "coherency=buffered"},
213 {Opt_coherency_full, "coherency=full"},
209 {Opt_resv_level, "resv_level=%u"}, 214 {Opt_resv_level, "resv_level=%u"},
210 {Opt_dir_resv_level, "dir_resv_level=%u"}, 215 {Opt_dir_resv_level, "dir_resv_level=%u"},
211 {Opt_err, NULL} 216 {Opt_err, NULL}
@@ -515,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
515 520
516 mlog_entry_void(); 521 mlog_entry_void();
517 522
518 for (i = 0; i < NUM_SYSTEM_INODES; i++) { 523 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
519 inode = osb->system_inodes[i]; 524 inode = osb->global_system_inodes[i];
520 if (inode) { 525 if (inode) {
521 iput(inode); 526 iput(inode);
522 osb->system_inodes[i] = NULL; 527 osb->global_system_inodes[i] = NULL;
523 } 528 }
524 } 529 }
525 530
@@ -535,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
535 osb->root_inode = NULL; 540 osb->root_inode = NULL;
536 } 541 }
537 542
543 if (!osb->local_system_inodes)
544 goto out;
545
546 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
547 if (osb->local_system_inodes[i]) {
548 iput(osb->local_system_inodes[i]);
549 osb->local_system_inodes[i] = NULL;
550 }
551 }
552
553 kfree(osb->local_system_inodes);
554 osb->local_system_inodes = NULL;
555
556out:
538 mlog_exit(0); 557 mlog_exit(0);
539} 558}
540 559
@@ -609,8 +628,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
609 int ret = 0; 628 int ret = 0;
610 struct mount_options parsed_options; 629 struct mount_options parsed_options;
611 struct ocfs2_super *osb = OCFS2_SB(sb); 630 struct ocfs2_super *osb = OCFS2_SB(sb);
612 631 u32 tmp;
613 lock_kernel();
614 632
615 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 633 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
616 !ocfs2_check_set_options(sb, &parsed_options)) { 634 !ocfs2_check_set_options(sb, &parsed_options)) {
@@ -618,8 +636,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
618 goto out; 636 goto out;
619 } 637 }
620 638
621 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != 639 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
622 (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 640 OCFS2_MOUNT_HB_NONE;
641 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
623 ret = -EINVAL; 642 ret = -EINVAL;
624 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); 643 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
625 goto out; 644 goto out;
@@ -718,7 +737,6 @@ unlock_osb:
718 MS_POSIXACL : 0); 737 MS_POSIXACL : 0);
719 } 738 }
720out: 739out:
721 unlock_kernel();
722 return ret; 740 return ret;
723} 741}
724 742
@@ -810,23 +828,29 @@ bail:
810 828
811static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) 829static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
812{ 830{
813 if (ocfs2_mount_local(osb)) { 831 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
814 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 832
833 if (osb->s_mount_opt & hb_enabled) {
834 if (ocfs2_mount_local(osb)) {
815 mlog(ML_ERROR, "Cannot heartbeat on a locally " 835 mlog(ML_ERROR, "Cannot heartbeat on a locally "
816 "mounted device.\n"); 836 "mounted device.\n");
817 return -EINVAL; 837 return -EINVAL;
818 } 838 }
819 } 839 if (ocfs2_userspace_stack(osb)) {
820
821 if (ocfs2_userspace_stack(osb)) {
822 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
823 mlog(ML_ERROR, "Userspace stack expected, but " 840 mlog(ML_ERROR, "Userspace stack expected, but "
824 "o2cb heartbeat arguments passed to mount\n"); 841 "o2cb heartbeat arguments passed to mount\n");
825 return -EINVAL; 842 return -EINVAL;
826 } 843 }
844 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
845 !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
846 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
847 ocfs2_cluster_o2cb_global_heartbeat(osb))) {
848 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
849 return -EINVAL;
850 }
827 } 851 }
828 852
829 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 853 if (!(osb->s_mount_opt & hb_enabled)) {
830 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && 854 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
831 !ocfs2_userspace_stack(osb)) { 855 !ocfs2_userspace_stack(osb)) {
832 mlog(ML_ERROR, "Heartbeat has to be started to mount " 856 mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -1212,14 +1236,12 @@ read_super_error:
1212 return status; 1236 return status;
1213} 1237}
1214 1238
1215static int ocfs2_get_sb(struct file_system_type *fs_type, 1239static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1216 int flags, 1240 int flags,
1217 const char *dev_name, 1241 const char *dev_name,
1218 void *data, 1242 void *data)
1219 struct vfsmount *mnt)
1220{ 1243{
1221 return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super, 1244 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1222 mnt);
1223} 1245}
1224 1246
1225static void ocfs2_kill_sb(struct super_block *sb) 1247static void ocfs2_kill_sb(struct super_block *sb)
@@ -1243,8 +1265,7 @@ out:
1243static struct file_system_type ocfs2_fs_type = { 1265static struct file_system_type ocfs2_fs_type = {
1244 .owner = THIS_MODULE, 1266 .owner = THIS_MODULE,
1245 .name = "ocfs2", 1267 .name = "ocfs2",
1246 .get_sb = ocfs2_get_sb, /* is this called when we mount 1268 .mount = ocfs2_mount,
1247 * the fs? */
1248 .kill_sb = ocfs2_kill_sb, 1269 .kill_sb = ocfs2_kill_sb,
1249 1270
1250 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1271 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
@@ -1292,6 +1313,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1292{ 1313{
1293 int status; 1314 int status;
1294 char *p; 1315 char *p;
1316 u32 tmp;
1295 1317
1296 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 1318 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
1297 options ? options : "(none)"); 1319 options ? options : "(none)");
@@ -1323,7 +1345,10 @@ static int ocfs2_parse_options(struct super_block *sb,
1323 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; 1345 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
1324 break; 1346 break;
1325 case Opt_hb_none: 1347 case Opt_hb_none:
1326 mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; 1348 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
1349 break;
1350 case Opt_hb_global:
1351 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
1327 break; 1352 break;
1328 case Opt_barrier: 1353 case Opt_barrier:
1329 if (match_int(&args[0], &option)) { 1354 if (match_int(&args[0], &option)) {
@@ -1439,6 +1464,12 @@ static int ocfs2_parse_options(struct super_block *sb,
1439 case Opt_grpquota: 1464 case Opt_grpquota:
1440 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1465 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1441 break; 1466 break;
1467 case Opt_coherency_buffered:
1468 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
1469 break;
1470 case Opt_coherency_full:
1471 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
1472 break;
1442 case Opt_acl: 1473 case Opt_acl:
1443 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1474 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1444 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; 1475 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1478,6 +1509,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1478 } 1509 }
1479 } 1510 }
1480 1511
1512 /* Ensure only one heartbeat mode */
1513 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
1514 OCFS2_MOUNT_HB_NONE);
1515 if (hweight32(tmp) != 1) {
1516 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1517 status = 0;
1518 goto bail;
1519 }
1520
1481 status = 1; 1521 status = 1;
1482 1522
1483bail: 1523bail:
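
hweight32() is a population count, so the test above accepts a mount only when exactly one of the three heartbeat flags ends up set. Illustration with stand-in flag values (the real OCFS2_MOUNT_HB_* constants differ):

#include <stdio.h>

#define HB_LOCAL  0x1	/* illustrative values only */
#define HB_GLOBAL 0x2
#define HB_NONE   0x4

static unsigned int hweight32(unsigned int w)
{
	return (unsigned int)__builtin_popcount(w);	/* gcc builtin */
}

int main(void)
{
	printf("%u\n", hweight32(HB_LOCAL));		/* 1 -> valid    */
	printf("%u\n", hweight32(HB_LOCAL | HB_NONE));	/* 2 -> rejected */
	printf("%u\n", hweight32(0));			/* 0 -> rejected */
	return 0;
}
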
@@ -1491,10 +1531,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1491 unsigned long opts = osb->s_mount_opt; 1531 unsigned long opts = osb->s_mount_opt;
1492 unsigned int local_alloc_megs; 1532 unsigned int local_alloc_megs;
1493 1533
1494 if (opts & OCFS2_MOUNT_HB_LOCAL) 1534 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
1495 seq_printf(s, ",_netdev,heartbeat=local"); 1535 seq_printf(s, ",_netdev");
1496 else 1536 if (opts & OCFS2_MOUNT_HB_LOCAL)
1497 seq_printf(s, ",heartbeat=none"); 1537 seq_printf(s, ",%s", OCFS2_HB_LOCAL);
1538 else
1539 seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
1540 } else
1541 seq_printf(s, ",%s", OCFS2_HB_NONE);
1498 1542
1499 if (opts & OCFS2_MOUNT_NOINTR) 1543 if (opts & OCFS2_MOUNT_NOINTR)
1500 seq_printf(s, ",nointr"); 1544 seq_printf(s, ",nointr");
@@ -1537,6 +1581,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1537 if (opts & OCFS2_MOUNT_GRPQUOTA) 1581 if (opts & OCFS2_MOUNT_GRPQUOTA)
1538 seq_printf(s, ",grpquota"); 1582 seq_printf(s, ",grpquota");
1539 1583
1584 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
1585 seq_printf(s, ",coherency=buffered");
1586 else
1587 seq_printf(s, ",coherency=full");
1588
1540 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1589 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1541 seq_printf(s, ",nouser_xattr"); 1590 seq_printf(s, ",nouser_xattr");
1542 else 1591 else
@@ -1641,13 +1690,9 @@ static void ocfs2_put_super(struct super_block *sb)
1641{ 1690{
1642 mlog_entry("(0x%p)\n", sb); 1691 mlog_entry("(0x%p)\n", sb);
1643 1692
1644 lock_kernel();
1645
1646 ocfs2_sync_blockdev(sb); 1693 ocfs2_sync_blockdev(sb);
1647 ocfs2_dismount_volume(sb, 0); 1694 ocfs2_dismount_volume(sb, 0);
1648 1695
1649 unlock_kernel();
1650
1651 mlog_exit_void(); 1696 mlog_exit_void();
1652} 1697}
1653 1698
@@ -1991,6 +2036,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
1991 return 0; 2036 return 0;
1992} 2037}
1993 2038
2039/* Make sure entire volume is addressable by our journal. Requires
2040 osb_clusters_at_boot to be valid and for the journal to have been
2041 initialized by ocfs2_journal_init(). */
2042static int ocfs2_journal_addressable(struct ocfs2_super *osb)
2043{
2044 int status = 0;
2045 u64 max_block =
2046 ocfs2_clusters_to_blocks(osb->sb,
2047 osb->osb_clusters_at_boot) - 1;
2048
2049 /* 32-bit block number is always OK. */
2050 if (max_block <= (u32)~0ULL)
2051 goto out;
2052
2053 /* Volume is "huge", so see if our journal is new enough to
2054 support it. */
2055 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
2056 OCFS2_FEATURE_COMPAT_JBD2_SB) &&
2057 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
2058 JBD2_FEATURE_INCOMPAT_64BIT))) {
2059 mlog(ML_ERROR, "The journal cannot address the entire volume. "
2060 "Enable the 'block64' journal option with tunefs.ocfs2");
2061 status = -EFBIG;
2062 goto out;
2063 }
2064
2065 out:
2066 return status;
2067}
2068
1994static int ocfs2_initialize_super(struct super_block *sb, 2069static int ocfs2_initialize_super(struct super_block *sb,
1995 struct buffer_head *bh, 2070 struct buffer_head *bh,
1996 int sector_size, 2071 int sector_size,
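
The fast path in ocfs2_journal_addressable() above is the 32-bit test: any last-block number that fits in a u32 is safe for a JBD2 journal without the 64BIT incompat feature. A self-contained restatement of the boundary:

#include <stdio.h>
#include <stdint.h>

/* nonzero when the volume's last block needs JBD2_FEATURE_INCOMPAT_64BIT */
static int needs_64bit_journal(uint64_t max_block)
{
	return max_block > 0xffffffffULL;	/* (u32)~0 */
}

int main(void)
{
	printf("%d\n", needs_64bit_journal(0xffffffffULL));	/* 0 */
	printf("%d\n", needs_64bit_journal(0x100000000ULL));	/* 1 */
	return 0;
}
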
@@ -2003,6 +2078,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2003 struct ocfs2_journal *journal; 2078 struct ocfs2_journal *journal;
2004 __le32 uuid_net_key; 2079 __le32 uuid_net_key;
2005 struct ocfs2_super *osb; 2080 struct ocfs2_super *osb;
2081 u64 total_blocks;
2006 2082
2007 mlog_entry_void(); 2083 mlog_entry_void();
2008 2084
@@ -2061,6 +2137,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2061 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2137 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
2062 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2138 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
2063 2139
2140 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2141 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2142 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2143 osb->max_slots);
2144 status = -EINVAL;
2145 goto bail;
2146 }
2147 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2148
2064 ocfs2_orphan_scan_init(osb); 2149 ocfs2_orphan_scan_init(osb);
2065 2150
2066 status = ocfs2_recovery_init(osb); 2151 status = ocfs2_recovery_init(osb);
@@ -2099,15 +2184,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2099 goto bail; 2184 goto bail;
2100 } 2185 }
2101 2186
2102 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2103 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2104 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2105 osb->max_slots);
2106 status = -EINVAL;
2107 goto bail;
2108 }
2109 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2110
2111 osb->slot_recovery_generations = 2187 osb->slot_recovery_generations =
2112 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), 2188 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
2113 GFP_KERNEL); 2189 GFP_KERNEL);
@@ -2150,7 +2226,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2150 goto bail; 2226 goto bail;
2151 } 2227 }
2152 2228
2153 if (ocfs2_userspace_stack(osb)) { 2229 if (ocfs2_clusterinfo_valid(osb)) {
2230 osb->osb_stackflags =
2231 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2154 memcpy(osb->osb_cluster_stack, 2232 memcpy(osb->osb_cluster_stack,
2155 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2233 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2156 OCFS2_STACK_LABEL_LEN); 2234 OCFS2_STACK_LABEL_LEN);
@@ -2215,11 +2293,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2215 goto bail; 2293 goto bail;
2216 } 2294 }
2217 2295
2218 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) 2296 total_blocks = ocfs2_clusters_to_blocks(osb->sb,
2219 > (u32)~0UL) { 2297 le32_to_cpu(di->i_clusters));
2220 mlog(ML_ERROR, "Volume might try to write to blocks beyond " 2298
2221 "what jbd can address in 32 bits.\n"); 2299 status = generic_check_addressable(osb->sb->s_blocksize_bits,
2222 status = -EINVAL; 2300 total_blocks);
2301 if (status) {
2302 mlog(ML_ERROR, "Volume too large "
2303 "to mount safely on this system");
2304 status = -EFBIG;
2223 goto bail; 2305 goto bail;
2224 } 2306 }
2225 2307
@@ -2381,6 +2463,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2381 goto finally; 2463 goto finally;
2382 } 2464 }
2383 2465
2466 /* Now that journal has been initialized, check to make sure
2467 entire volume is addressable. */
2468 status = ocfs2_journal_addressable(osb);
2469 if (status)
2470 goto finally;
2471
2384 /* If the journal was unmounted cleanly then we don't want to 2472 /* If the journal was unmounted cleanly then we don't want to
2385 * recover anything. Otherwise, journal_load will do that 2473 * recover anything. Otherwise, journal_load will do that
2386 * dirty work for us :) */ 2474 * dirty work for us :) */
@@ -2472,7 +2560,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2472 kfree(osb->slot_recovery_generations); 2560 kfree(osb->slot_recovery_generations);
2473 /* FIXME 2561 /* FIXME
2474 * This belongs in journal shutdown, but because we have to 2562 * This belongs in journal shutdown, but because we have to
2475 * allocate osb->journal at the start of ocfs2_initalize_osb(), 2563 * allocate osb->journal at the start of ocfs2_initialize_osb(),
2476 * we free it here. 2564 * we free it here.
2477 */ 2565 */
2478 kfree(osb->journal); 2566 kfree(osb->journal);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc..9975457c981 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
128 } 128 }
129 129
130 /* Fast symlinks can't be large */ 130 /* Fast symlinks can't be large */
131 len = strlen(target); 131 len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
132 link = kzalloc(len + 1, GFP_NOFS); 132 link = kzalloc(len + 1, GFP_NOFS);
133 if (!link) { 133 if (!link) {
134 status = -ENOMEM; 134 status = -ENOMEM;
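
Bounding the scan with strnlen() means a fast-symlink target that lost its NUL terminator on disk can no longer run past the in-inode buffer. A user-space illustration of the difference:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* deliberately unterminated 8-byte "target" */
	char target[8] = { 'l', 'i', 'n', 'k', 't', 'g', 't', '!' };

	/* strlen(target) would read past the array; strnlen stops */
	printf("%zu\n", strnlen(target, sizeof(target)));	/* prints 8 */
	return 0;
}
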
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190cdbf..902efb23b6a 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
44 int type, 44 int type,
45 u32 slot); 45 u32 slot);
46 46
47static inline int is_global_system_inode(int type);
48static inline int is_in_system_inode_array(struct ocfs2_super *osb,
49 int type,
50 u32 slot);
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC 47#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; 48static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
54#endif 49#endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
59 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; 54 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
60} 55}
61 56
62static inline int is_in_system_inode_array(struct ocfs2_super *osb, 57static struct inode **get_local_system_inode(struct ocfs2_super *osb,
63 int type, 58 int type,
64 u32 slot) 59 u32 slot)
65{ 60{
66 return slot == osb->slot_num || is_global_system_inode(type); 61 int index;
62 struct inode **local_system_inodes, **free = NULL;
63
64 BUG_ON(slot == OCFS2_INVALID_SLOT);
65 BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
66 type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
67
68 spin_lock(&osb->osb_lock);
69 local_system_inodes = osb->local_system_inodes;
70 spin_unlock(&osb->osb_lock);
71
72 if (unlikely(!local_system_inodes)) {
73 local_system_inodes = kzalloc(sizeof(struct inode *) *
74 NUM_LOCAL_SYSTEM_INODES *
75 osb->max_slots,
76 GFP_NOFS);
77 if (!local_system_inodes) {
78 mlog_errno(-ENOMEM);
79 /*
80 * return NULL here so that ocfs2_get_system_file_inodes
81 * will try to create an inode and use it. We will try
82 * to initialize local_system_inodes next time.
83 */
84 return NULL;
85 }
86
87 spin_lock(&osb->osb_lock);
88 if (osb->local_system_inodes) {
89 /* Someone has initialized it for us. */
90 free = local_system_inodes;
91 local_system_inodes = osb->local_system_inodes;
92 } else
93 osb->local_system_inodes = local_system_inodes;
94 spin_unlock(&osb->osb_lock);
95 if (unlikely(free))
96 kfree(free);
97 }
98
99 index = (slot * NUM_LOCAL_SYSTEM_INODES) +
100 (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
101
102 return &local_system_inodes[index];
67} 103}
68 104
69struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, 105struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
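
get_local_system_inode() allocates its array with the classic optimistic pattern: build the candidate outside the spinlock, then recheck under the lock and discard the copy that lost the race. Condensed to the essentials (a sketch, not the verbatim code):

	new = kzalloc(size, GFP_NOFS);		/* may sleep: no lock held */
	spin_lock(&osb->osb_lock);
	if (osb->local_system_inodes)
		free = new;			/* someone beat us to it */
	else
		osb->local_system_inodes = new;
	spin_unlock(&osb->osb_lock);
	kfree(free);				/* kfree(NULL) is a no-op */
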
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
74 struct inode **arr = NULL; 110 struct inode **arr = NULL;
75 111
76 /* avoid the lookup if cached in local system file array */ 112 /* avoid the lookup if cached in local system file array */
77 if (is_in_system_inode_array(osb, type, slot)) 113 if (is_global_system_inode(type)) {
78 arr = &(osb->system_inodes[type]); 114 arr = &(osb->global_system_inodes[type]);
115 } else
116 arr = get_local_system_inode(osb, type, slot);
79 117
80 if (arr && ((inode = *arr) != NULL)) { 118 if (arr && ((inode = *arr) != NULL)) {
81 /* get a ref in addition to the array ref */ 119 /* get a ref in addition to the array ref */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03469f6180..67cd4391464 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
1286 xis.inode_bh = xbs.inode_bh = di_bh; 1286 xis.inode_bh = xbs.inode_bh = di_bh;
1287 di = (struct ocfs2_dinode *)di_bh->b_data; 1287 di = (struct ocfs2_dinode *)di_bh->b_data;
1288 1288
1289 down_read(&oi->ip_xattr_sem);
1290 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, 1289 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
1291 buffer_size, &xis); 1290 buffer_size, &xis);
1292 if (ret == -ENODATA && di->i_xattr_loc) 1291 if (ret == -ENODATA && di->i_xattr_loc)
1293 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, 1292 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
1294 buffer_size, &xbs); 1293 buffer_size, &xbs);
1295 up_read(&oi->ip_xattr_sem);
1296 1294
1297 return ret; 1295 return ret;
1298} 1296}
@@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode,
1316 mlog_errno(ret); 1314 mlog_errno(ret);
1317 return ret; 1315 return ret;
1318 } 1316 }
1317 down_read(&OCFS2_I(inode)->ip_xattr_sem);
1319 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, 1318 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
1320 name, buffer, buffer_size); 1319 name, buffer, buffer_size);
1320 up_read(&OCFS2_I(inode)->ip_xattr_sem);
1321 1321
1322 ocfs2_inode_unlock(inode, 0); 1322 ocfs2_inode_unlock(inode, 0);
1323 1323
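
With ip_xattr_sem hoisted out of ocfs2_xattr_get_nolock(), the function finally matches its name: callers that already hold the semaphore (or must take it under different ordering) can call it directly, while plain ocfs2_xattr_get() keeps the old behaviour by bracketing the call itself, as the hunk above shows:

	down_read(&OCFS2_I(inode)->ip_xattr_sem);
	ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
				     name, buffer, buffer_size);
	up_read(&OCFS2_I(inode)->ip_xattr_sem);
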
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
7081 goto out; 7081 goto out;
7082 } 7082 }
7083 7083
7084 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) 7084 if (!indexed)
7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); 7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
7086 else 7086 else
7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); 7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index b42d6241903..393f3f659da 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -25,11 +25,10 @@ static struct buffer_head *omfs_get_bucket(struct inode *dir,
25 const char *name, int namelen, int *ofs) 25 const char *name, int namelen, int *ofs)
26{ 26{
27 int nbuckets = (dir->i_size - OMFS_DIR_START)/8; 27 int nbuckets = (dir->i_size - OMFS_DIR_START)/8;
28 int block = clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino);
29 int bucket = omfs_hash(name, namelen, nbuckets); 28 int bucket = omfs_hash(name, namelen, nbuckets);
30 29
31 *ofs = OMFS_DIR_START + bucket * 8; 30 *ofs = OMFS_DIR_START + bucket * 8;
32 return sb_bread(dir->i_sb, block); 31 return omfs_bread(dir->i_sb, dir->i_ino);
33} 32}
34 33
35static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block, 34static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
@@ -42,8 +41,7 @@ static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
42 *prev_block = ~0; 41 *prev_block = ~0;
43 42
44 while (block != ~0) { 43 while (block != ~0) {
45 bh = sb_bread(dir->i_sb, 44 bh = omfs_bread(dir->i_sb, block);
46 clus_to_blk(OMFS_SB(dir->i_sb), block));
47 if (!bh) { 45 if (!bh) {
48 err = -EIO; 46 err = -EIO;
49 goto err; 47 goto err;
@@ -86,11 +84,10 @@ static struct buffer_head *omfs_find_entry(struct inode *dir,
86int omfs_make_empty(struct inode *inode, struct super_block *sb) 84int omfs_make_empty(struct inode *inode, struct super_block *sb)
87{ 85{
88 struct omfs_sb_info *sbi = OMFS_SB(sb); 86 struct omfs_sb_info *sbi = OMFS_SB(sb);
89 int block = clus_to_blk(sbi, inode->i_ino);
90 struct buffer_head *bh; 87 struct buffer_head *bh;
91 struct omfs_inode *oi; 88 struct omfs_inode *oi;
92 89
93 bh = sb_bread(sb, block); 90 bh = omfs_bread(sb, inode->i_ino);
94 if (!bh) 91 if (!bh)
95 return -ENOMEM; 92 return -ENOMEM;
96 93
@@ -134,7 +131,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode)
134 brelse(bh); 131 brelse(bh);
135 132
136 /* now set the sibling and parent pointers on the new inode */ 133 /* now set the sibling and parent pointers on the new inode */
137 bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), inode->i_ino)); 134 bh = omfs_bread(dir->i_sb, inode->i_ino);
138 if (!bh) 135 if (!bh)
139 goto out; 136 goto out;
140 137
@@ -190,8 +187,7 @@ static int omfs_delete_entry(struct dentry *dentry)
190 if (prev != ~0) { 187 if (prev != ~0) {
191 /* found in middle of list, get list ptr */ 188 /* found in middle of list, get list ptr */
192 brelse(bh); 189 brelse(bh);
193 bh = sb_bread(dir->i_sb, 190 bh = omfs_bread(dir->i_sb, prev);
194 clus_to_blk(OMFS_SB(dir->i_sb), prev));
195 if (!bh) 191 if (!bh)
196 goto out; 192 goto out;
197 193
@@ -224,8 +220,7 @@ static int omfs_dir_is_empty(struct inode *inode)
224 u64 *ptr; 220 u64 *ptr;
225 int i; 221 int i;
226 222
227 bh = sb_bread(inode->i_sb, clus_to_blk(OMFS_SB(inode->i_sb), 223 bh = omfs_bread(inode->i_sb, inode->i_ino);
228 inode->i_ino));
229 224
230 if (!bh) 225 if (!bh)
231 return 0; 226 return 0;
@@ -353,8 +348,7 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
353 348
354 /* follow chain in this bucket */ 349 /* follow chain in this bucket */
355 while (fsblock != ~0) { 350 while (fsblock != ~0) {
356 bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), 351 bh = omfs_bread(dir->i_sb, fsblock);
357 fsblock));
358 if (!bh) 352 if (!bh)
359 goto out; 353 goto out;
360 354
@@ -466,7 +460,7 @@ static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
466 hchain = (filp->f_pos >> 20) - 1; 460 hchain = (filp->f_pos >> 20) - 1;
467 hindex = filp->f_pos & 0xfffff; 461 hindex = filp->f_pos & 0xfffff;
468 462
469 bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino)); 463 bh = omfs_bread(dir->i_sb, dir->i_ino);
470 if (!bh) 464 if (!bh)
471 goto out; 465 goto out;
472 466
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 6e7a3291bbe..8a6d34fa668 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -50,7 +50,7 @@ int omfs_shrink_inode(struct inode *inode)
50 if (inode->i_size != 0) 50 if (inode->i_size != 0)
51 goto out; 51 goto out;
52 52
53 bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); 53 bh = omfs_bread(inode->i_sb, next);
54 if (!bh) 54 if (!bh)
55 goto out; 55 goto out;
56 56
@@ -90,7 +90,7 @@ int omfs_shrink_inode(struct inode *inode)
90 if (next == ~0) 90 if (next == ~0)
91 break; 91 break;
92 92
93 bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); 93 bh = omfs_bread(inode->i_sb, next);
94 if (!bh) 94 if (!bh)
95 goto out; 95 goto out;
96 oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); 96 oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
@@ -222,7 +222,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
222 struct buffer_head *bh; 222 struct buffer_head *bh;
223 sector_t next, offset; 223 sector_t next, offset;
224 int ret; 224 int ret;
225 u64 new_block; 225 u64 uninitialized_var(new_block);
226 u32 max_extents; 226 u32 max_extents;
227 int extent_count; 227 int extent_count;
228 struct omfs_extent *oe; 228 struct omfs_extent *oe;
@@ -232,7 +232,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
232 int remain; 232 int remain;
233 233
234 ret = -EIO; 234 ret = -EIO;
235 bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino)); 235 bh = omfs_bread(inode->i_sb, inode->i_ino);
236 if (!bh) 236 if (!bh)
237 goto out; 237 goto out;
238 238
@@ -265,7 +265,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
265 break; 265 break;
266 266
267 brelse(bh); 267 brelse(bh);
268 bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); 268 bh = omfs_bread(inode->i_sb, next);
269 if (!bh) 269 if (!bh)
270 goto out; 270 goto out;
271 oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); 271 oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
@@ -312,9 +312,17 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
312 loff_t pos, unsigned len, unsigned flags, 312 loff_t pos, unsigned len, unsigned flags,
313 struct page **pagep, void **fsdata) 313 struct page **pagep, void **fsdata)
314{ 314{
315 *pagep = NULL; 315 int ret;
316 return block_write_begin(file, mapping, pos, len, flags, 316
317 pagep, fsdata, omfs_get_block); 317 ret = block_write_begin(mapping, pos, len, flags, pagep,
318 omfs_get_block);
319 if (unlikely(ret)) {
320 loff_t isize = mapping->host->i_size;
321 if (pos + len > isize)
322 vmtruncate(mapping->host, isize);
323 }
324
325 return ret;
318} 326}
319 327
320static sector_t omfs_bmap(struct address_space *mapping, sector_t block) 328static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
@@ -333,7 +341,29 @@ const struct file_operations omfs_file_operations = {
333 .splice_read = generic_file_splice_read, 341 .splice_read = generic_file_splice_read,
334}; 342};
335 343
344static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
345{
346 struct inode *inode = dentry->d_inode;
347 int error;
348
349 error = inode_change_ok(inode, attr);
350 if (error)
351 return error;
352
353 if ((attr->ia_valid & ATTR_SIZE) &&
354 attr->ia_size != i_size_read(inode)) {
355 error = vmtruncate(inode, attr->ia_size);
356 if (error)
357 return error;
358 }
359
360 setattr_copy(inode, attr);
361 mark_inode_dirty(inode);
362 return 0;
363}
364
336const struct inode_operations omfs_file_inops = { 365const struct inode_operations omfs_file_inops = {
366 .setattr = omfs_setattr,
337 .truncate = omfs_truncate 367 .truncate = omfs_truncate
338}; 368};
339 369
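
A note on the write_begin hunk above: with the 2.6.36 block_write_begin() signature the caller owns error cleanup, so a failed write must trim back any blocks instantiated past i_size. A minimal sketch of that pattern, with myfs_get_block standing in for the filesystem's block mapper:

	static int myfs_write_begin(struct file *file,
			struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		int ret;

		ret = block_write_begin(mapping, pos, len, flags, pagep,
					myfs_get_block);
		if (unlikely(ret)) {
			/* trim blocks instantiated beyond EOF on failure */
			loff_t isize = mapping->host->i_size;

			if (pos + len > isize)
				vmtruncate(mapping->host, isize);
		}
		return ret;
	}
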
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 089839a6cc6..e043c4cb9a9 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -19,6 +19,15 @@ MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>");
19MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); 19MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux");
20MODULE_LICENSE("GPL"); 20MODULE_LICENSE("GPL");
21 21
22struct buffer_head *omfs_bread(struct super_block *sb, sector_t block)
23{
24 struct omfs_sb_info *sbi = OMFS_SB(sb);
25 if (block >= sbi->s_num_blocks)
26 return NULL;
27
28 return sb_bread(sb, clus_to_blk(sbi, block));
29}
30
22struct inode *omfs_new_inode(struct inode *dir, int mode) 31struct inode *omfs_new_inode(struct inode *dir, int mode)
23{ 32{
24 struct inode *inode; 33 struct inode *inode;
@@ -93,15 +102,13 @@ static int __omfs_write_inode(struct inode *inode, int wait)
93 struct omfs_inode *oi; 102 struct omfs_inode *oi;
94 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); 103 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
95 struct buffer_head *bh, *bh2; 104 struct buffer_head *bh, *bh2;
96 unsigned int block;
97 u64 ctime; 105 u64 ctime;
98 int i; 106 int i;
99 int ret = -EIO; 107 int ret = -EIO;
100 int sync_failed = 0; 108 int sync_failed = 0;
101 109
102 /* get current inode since we may have written sibling ptrs etc. */ 110 /* get current inode since we may have written sibling ptrs etc. */
103 block = clus_to_blk(sbi, inode->i_ino); 111 bh = omfs_bread(inode->i_sb, inode->i_ino);
104 bh = sb_bread(inode->i_sb, block);
105 if (!bh) 112 if (!bh)
106 goto out; 113 goto out;
107 114
@@ -140,8 +147,7 @@ static int __omfs_write_inode(struct inode *inode, int wait)
140 147
141 /* if mirroring writes, copy to next fsblock */ 148 /* if mirroring writes, copy to next fsblock */
142 for (i = 1; i < sbi->s_mirrors; i++) { 149 for (i = 1; i < sbi->s_mirrors; i++) {
143 bh2 = sb_bread(inode->i_sb, block + i * 150 bh2 = omfs_bread(inode->i_sb, inode->i_ino + i);
144 (sbi->s_blocksize / sbi->s_sys_blocksize));
145 if (!bh2) 151 if (!bh2)
146 goto out_brelse; 152 goto out_brelse;
147 153
@@ -175,9 +181,13 @@ int omfs_sync_inode(struct inode *inode)
175 * called when an entry is deleted, need to clear the bits in the 181 * called when an entry is deleted, need to clear the bits in the
176 * bitmaps. 182 * bitmaps.
177 */ 183 */
178static void omfs_delete_inode(struct inode *inode) 184static void omfs_evict_inode(struct inode *inode)
179{ 185{
180 truncate_inode_pages(&inode->i_data, 0); 186 truncate_inode_pages(&inode->i_data, 0);
187 end_writeback(inode);
188
189 if (inode->i_nlink)
190 return;
181 191
182 if (S_ISREG(inode->i_mode)) { 192 if (S_ISREG(inode->i_mode)) {
183 inode->i_size = 0; 193 inode->i_size = 0;
@@ -185,7 +195,6 @@ static void omfs_delete_inode(struct inode *inode)
185 } 195 }
186 196
187 omfs_clear_range(inode->i_sb, inode->i_ino, 2); 197 omfs_clear_range(inode->i_sb, inode->i_ino, 2);
188 clear_inode(inode);
189} 198}
190 199
191struct inode *omfs_iget(struct super_block *sb, ino_t ino) 200struct inode *omfs_iget(struct super_block *sb, ino_t ino)
@@ -193,7 +202,6 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
193 struct omfs_sb_info *sbi = OMFS_SB(sb); 202 struct omfs_sb_info *sbi = OMFS_SB(sb);
194 struct omfs_inode *oi; 203 struct omfs_inode *oi;
195 struct buffer_head *bh; 204 struct buffer_head *bh;
196 unsigned int block;
197 u64 ctime; 205 u64 ctime;
198 unsigned long nsecs; 206 unsigned long nsecs;
199 struct inode *inode; 207 struct inode *inode;
@@ -204,8 +212,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
204 if (!(inode->i_state & I_NEW)) 212 if (!(inode->i_state & I_NEW))
205 return inode; 213 return inode;
206 214
207 block = clus_to_blk(sbi, ino); 215 bh = omfs_bread(inode->i_sb, ino);
208 bh = sb_bread(inode->i_sb, block);
209 if (!bh) 216 if (!bh)
210 goto iget_failed; 217 goto iget_failed;
211 218
@@ -284,7 +291,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
284 291
285static const struct super_operations omfs_sops = { 292static const struct super_operations omfs_sops = {
286 .write_inode = omfs_write_inode, 293 .write_inode = omfs_write_inode,
287 .delete_inode = omfs_delete_inode, 294 .evict_inode = omfs_evict_inode,
288 .put_super = omfs_put_super, 295 .put_super = omfs_put_super,
289 .statfs = omfs_statfs, 296 .statfs = omfs_statfs,
290 .show_options = generic_show_options, 297 .show_options = generic_show_options,
@@ -319,6 +326,9 @@ static int omfs_get_imap(struct super_block *sb)
319 goto nomem; 326 goto nomem;
320 327
321 block = clus_to_blk(sbi, sbi->s_bitmap_ino); 328 block = clus_to_blk(sbi, sbi->s_bitmap_ino);
329 if (block >= sbi->s_num_blocks)
330 goto nomem;
331
322 ptr = sbi->s_imap; 332 ptr = sbi->s_imap;
323 for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { 333 for (count = bitmap_size; count > 0; count -= sb->s_blocksize) {
324 bh = sb_bread(sb, block++); 334 bh = sb_bread(sb, block++);
@@ -417,7 +427,6 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
417 struct omfs_root_block *omfs_rb; 427 struct omfs_root_block *omfs_rb;
418 struct omfs_sb_info *sbi; 428 struct omfs_sb_info *sbi;
419 struct inode *root; 429 struct inode *root;
420 sector_t start;
421 int ret = -EINVAL; 430 int ret = -EINVAL;
422 431
423 save_mount_options(sb, (char *) data); 432 save_mount_options(sb, (char *) data);
@@ -486,8 +495,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
486 sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - 495 sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) -
487 get_bitmask_order(sbi->s_sys_blocksize); 496 get_bitmask_order(sbi->s_sys_blocksize);
488 497
489 start = clus_to_blk(sbi, be64_to_cpu(omfs_sb->s_root_block)); 498 bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block));
490 bh2 = sb_bread(sb, start);
491 if (!bh2) 499 if (!bh2)
492 goto out_brelse_bh; 500 goto out_brelse_bh;
493 501
@@ -504,6 +512,21 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
504 goto out_brelse_bh2; 512 goto out_brelse_bh2;
505 } 513 }
506 514
515 if (sbi->s_bitmap_ino != ~0ULL &&
516 sbi->s_bitmap_ino > sbi->s_num_blocks) {
517 printk(KERN_ERR "omfs: free space bitmap location is corrupt "
518 "(%llx, total blocks %llx)\n",
519 (unsigned long long) sbi->s_bitmap_ino,
520 (unsigned long long) sbi->s_num_blocks);
521 goto out_brelse_bh2;
522 }
523 if (sbi->s_clustersize < 1 ||
524 sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) {
525 printk(KERN_ERR "omfs: cluster size out of range (%d)",
526 sbi->s_clustersize);
527 goto out_brelse_bh2;
528 }
529
507 ret = omfs_get_imap(sb); 530 ret = omfs_get_imap(sb);
508 if (ret) 531 if (ret)
509 goto out_brelse_bh2; 532 goto out_brelse_bh2;
@@ -529,20 +552,21 @@ out_brelse_bh2:
529out_brelse_bh: 552out_brelse_bh:
530 brelse(bh); 553 brelse(bh);
531end: 554end:
555 if (ret)
556 kfree(sbi);
532 return ret; 557 return ret;
533} 558}
534 559
535static int omfs_get_sb(struct file_system_type *fs_type, 560static struct dentry *omfs_mount(struct file_system_type *fs_type,
536 int flags, const char *dev_name, 561 int flags, const char *dev_name, void *data)
537 void *data, struct vfsmount *m)
538{ 562{
539 return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m); 563 return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
540} 564}
541 565
542static struct file_system_type omfs_fs_type = { 566static struct file_system_type omfs_fs_type = {
543 .owner = THIS_MODULE, 567 .owner = THIS_MODULE,
544 .name = "omfs", 568 .name = "omfs",
545 .get_sb = omfs_get_sb, 569 .mount = omfs_mount,
546 .kill_sb = kill_block_super, 570 .kill_sb = kill_block_super,
547 .fs_flags = FS_REQUIRES_DEV, 571 .fs_flags = FS_REQUIRES_DEV,
548}; 572};
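
The get_sb/mount switch in the last hunk recurs for openpromfs and pipefs below: the ->get_sb(..., struct vfsmount *) callback becomes a ->mount callback returning the root dentry, and get_sb_bdev() becomes mount_bdev(), with mount_single() and mount_pseudo() as the equivalents for the other two cases. A sketch of the converted boilerplate for a hypothetical block-device filesystem:

	static struct dentry *myfs_mount(struct file_system_type *fs_type,
			int flags, const char *dev_name, void *data)
	{
		return mount_bdev(fs_type, flags, dev_name, data,
				  myfs_fill_super);
	}

	static struct file_system_type myfs_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "myfs",
		.mount		= myfs_mount,
		.kill_sb	= kill_block_super,
		.fs_flags	= FS_REQUIRES_DEV,
	};
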
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index ebe2fdbe535..7d414fef501 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -58,6 +58,7 @@ extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
58extern int omfs_shrink_inode(struct inode *inode); 58extern int omfs_shrink_inode(struct inode *inode);
59 59
60/* inode.c */ 60/* inode.c */
61extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block);
61extern struct inode *omfs_iget(struct super_block *sb, ino_t inode); 62extern struct inode *omfs_iget(struct super_block *sb, ino_t inode);
62extern struct inode *omfs_new_inode(struct inode *dir, int mode); 63extern struct inode *omfs_new_inode(struct inode *dir, int mode);
63extern int omfs_reserve_block(struct super_block *sb, sector_t block); 64extern int omfs_reserve_block(struct super_block *sb, sector_t block);
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
index 12cca245d6e..ee5e4327de9 100644
--- a/fs/omfs/omfs_fs.h
+++ b/fs/omfs/omfs_fs.h
@@ -17,6 +17,7 @@
17#define OMFS_EXTENT_CONT 0x40 17#define OMFS_EXTENT_CONT 0x40
18#define OMFS_XOR_COUNT 19 18#define OMFS_XOR_COUNT 19
19#define OMFS_MAX_BLOCK_SIZE 8192 19#define OMFS_MAX_BLOCK_SIZE 8192
20#define OMFS_MAX_CLUSTER_SIZE 8
20 21
21struct omfs_super_block { 22struct omfs_super_block {
22 char s_fill1[256]; 23 char s_fill1[256];
diff --git a/fs/open.c b/fs/open.c
index 5463266db9e..4197b9ed023 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
29#include <linux/falloc.h> 29#include <linux/falloc.h>
30#include <linux/fs_struct.h> 30#include <linux/fs_struct.h>
31#include <linux/ima.h> 31#include <linux/ima.h>
32#include <linux/dnotify.h>
32 33
33#include "internal.h" 34#include "internal.h"
34 35
@@ -110,7 +111,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
110 111
111 error = locks_verify_truncate(inode, NULL, length); 112 error = locks_verify_truncate(inode, NULL, length);
112 if (!error) 113 if (!error)
113 error = security_path_truncate(&path, length, 0); 114 error = security_path_truncate(&path);
114 if (!error) 115 if (!error)
115 error = do_truncate(path.dentry, length, 0, NULL); 116 error = do_truncate(path.dentry, length, 0, NULL);
116 117
@@ -165,8 +166,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
165 166
166 error = locks_verify_truncate(inode, file, length); 167 error = locks_verify_truncate(inode, file, length);
167 if (!error) 168 if (!error)
168 error = security_path_truncate(&file->f_path, length, 169 error = security_path_truncate(&file->f_path);
169 ATTR_MTIME|ATTR_CTIME);
170 if (!error) 170 if (!error)
171 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); 171 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
172out_putf: 172out_putf:
@@ -367,7 +367,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
367 if (error) 367 if (error)
368 goto out; 368 goto out;
369 369
370 error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); 370 error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
371 if (error) 371 if (error)
372 goto dput_and_out; 372 goto dput_and_out;
373 373
@@ -396,7 +396,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
396 if (!S_ISDIR(inode->i_mode)) 396 if (!S_ISDIR(inode->i_mode))
397 goto out_putf; 397 goto out_putf;
398 398
399 error = inode_permission(inode, MAY_EXEC | MAY_ACCESS); 399 error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
400 if (!error) 400 if (!error)
401 set_fs_pwd(current->fs, &file->f_path); 401 set_fs_pwd(current->fs, &file->f_path);
402out_putf: 402out_putf:
@@ -414,7 +414,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
414 if (error) 414 if (error)
415 goto out; 415 goto out;
416 416
417 error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); 417 error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
418 if (error) 418 if (error)
419 goto dput_and_out; 419 goto dput_and_out;
420 420
@@ -675,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
675 f->f_path.mnt = mnt; 675 f->f_path.mnt = mnt;
676 f->f_pos = 0; 676 f->f_pos = 0;
677 f->f_op = fops_get(inode->i_fop); 677 f->f_op = fops_get(inode->i_fop);
678 file_move(f, &inode->i_sb->s_files); 678 file_sb_list_add(f, inode->i_sb);
679 679
680 error = security_dentry_open(f, cred); 680 error = security_dentry_open(f, cred);
681 if (error) 681 if (error)
@@ -721,7 +721,7 @@ cleanup_all:
721 mnt_drop_write(mnt); 721 mnt_drop_write(mnt);
722 } 722 }
723 } 723 }
724 file_kill(f); 724 file_sb_list_del(f);
725 f->f_path.dentry = NULL; 725 f->f_path.dentry = NULL;
726 f->f_path.mnt = NULL; 726 f->f_path.mnt = NULL;
727cleanup_file: 727cleanup_file:
@@ -786,11 +786,11 @@ struct file *nameidata_to_filp(struct nameidata *nd)
786 /* Pick up the filp from the open intent */ 786 /* Pick up the filp from the open intent */
787 filp = nd->intent.open.file; 787 filp = nd->intent.open.file;
788 /* Has the filesystem initialised the file for us? */ 788 /* Has the filesystem initialised the file for us? */
789 if (filp->f_path.dentry == NULL) 789 if (filp->f_path.dentry == NULL) {
790 path_get(&nd->path);
790 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, 791 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
791 NULL, cred); 792 NULL, cred);
792 else 793 }
793 path_put(&nd->path);
794 return filp; 794 return filp;
795} 795}
796 796
@@ -888,7 +888,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
888 put_unused_fd(fd); 888 put_unused_fd(fd);
889 fd = PTR_ERR(f); 889 fd = PTR_ERR(f);
890 } else { 890 } else {
891 fsnotify_open(f->f_path.dentry); 891 fsnotify_open(f);
892 fd_install(fd, f); 892 fd_install(fd, f);
893 } 893 }
894 } 894 }
@@ -1031,7 +1031,9 @@ EXPORT_SYMBOL(generic_file_open);
1031 1031
1032/* 1032/*
1033 * This is used by subsystems that don't want seekable 1033 * This is used by subsystems that don't want seekable
 1034 * file descriptors 1034 * file descriptors. The function is not supposed to ever fail; the only
 1035 * reason it returns an 'int' and not 'void' is so that it can be plugged
 1036 * directly into the file_operations structure.
1035 */ 1037 */
1036int nonseekable_open(struct inode *inode, struct file *filp) 1038int nonseekable_open(struct inode *inode, struct file *filp)
1037{ 1039{
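
As the comment above says, nonseekable_open() always succeeds, so a driver typically just tail-calls it from its ->open(). A minimal sketch for a hypothetical character device (mydev names are placeholders):

	static int mydev_open(struct inode *inode, struct file *filp)
	{
		/* mark the descriptor non-seekable; always returns 0 */
		return nonseekable_open(inode, filp);
	}

	static const struct file_operations mydev_fops = {
		.owner	= THIS_MODULE,
		.open	= mydev_open,
		.llseek	= no_llseek,
	};
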
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ffcd04f0012..ddb1f41376e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -415,16 +415,16 @@ out_no_root:
415 return ret; 415 return ret;
416} 416}
417 417
418static int openprom_get_sb(struct file_system_type *fs_type, 418static struct dentry *openprom_mount(struct file_system_type *fs_type,
419 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 419 int flags, const char *dev_name, void *data)
420{ 420{
 421 return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt); 421 return mount_single(fs_type, flags, data, openprom_fill_super);
422} 422}
423 423
424static struct file_system_type openprom_fs_type = { 424static struct file_system_type openprom_fs_type = {
425 .owner = THIS_MODULE, 425 .owner = THIS_MODULE,
426 .name = "openpromfs", 426 .name = "openpromfs",
427 .get_sb = openprom_get_sb, 427 .mount = openprom_mount,
428 .kill_sb = kill_anon_super, 428 .kill_sb = kill_anon_super,
429}; 429};
430 430
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 6921e7890be..fbeb697374d 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -45,8 +45,11 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
45 nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | 45 nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
46 (le32_to_cpu(dr->disc_size) >> 9); 46 (le32_to_cpu(dr->disc_size) >> 9);
47 47
48 if (name) 48 if (name) {
49 printk(" [%s]", name); 49 strlcat(state->pp_buf, " [", PAGE_SIZE);
50 strlcat(state->pp_buf, name, PAGE_SIZE);
51 strlcat(state->pp_buf, "]", PAGE_SIZE);
52 }
50 put_partition(state, slot, first_sector, nr_sects); 53 put_partition(state, slot, first_sector, nr_sects);
51 return dr; 54 return dr;
52} 55}
@@ -81,14 +84,14 @@ static int riscix_partition(struct parsed_partitions *state,
81 if (!rr) 84 if (!rr)
82 return -1; 85 return -1;
83 86
84 printk(" [RISCiX]"); 87 strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
85 88
86 89
87 if (rr->magic == RISCIX_MAGIC) { 90 if (rr->magic == RISCIX_MAGIC) {
88 unsigned long size = nr_sects > 2 ? 2 : nr_sects; 91 unsigned long size = nr_sects > 2 ? 2 : nr_sects;
89 int part; 92 int part;
90 93
91 printk(" <"); 94 strlcat(state->pp_buf, " <", PAGE_SIZE);
92 95
93 put_partition(state, slot++, first_sect, size); 96 put_partition(state, slot++, first_sect, size);
94 for (part = 0; part < 8; part++) { 97 for (part = 0; part < 8; part++) {
@@ -97,11 +100,13 @@ static int riscix_partition(struct parsed_partitions *state,
97 put_partition(state, slot++, 100 put_partition(state, slot++,
98 le32_to_cpu(rr->part[part].start), 101 le32_to_cpu(rr->part[part].start),
99 le32_to_cpu(rr->part[part].length)); 102 le32_to_cpu(rr->part[part].length));
100 printk("(%s)", rr->part[part].name); 103 strlcat(state->pp_buf, "(", PAGE_SIZE);
104 strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
105 strlcat(state->pp_buf, ")", PAGE_SIZE);
101 } 106 }
102 } 107 }
103 108
104 printk(" >\n"); 109 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
105 } else { 110 } else {
106 put_partition(state, slot++, first_sect, nr_sects); 111 put_partition(state, slot++, first_sect, nr_sects);
107 } 112 }
@@ -131,7 +136,7 @@ static int linux_partition(struct parsed_partitions *state,
131 struct linux_part *linuxp; 136 struct linux_part *linuxp;
132 unsigned long size = nr_sects > 2 ? 2 : nr_sects; 137 unsigned long size = nr_sects > 2 ? 2 : nr_sects;
133 138
134 printk(" [Linux]"); 139 strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
135 140
136 put_partition(state, slot++, first_sect, size); 141 put_partition(state, slot++, first_sect, size);
137 142
@@ -139,7 +144,7 @@ static int linux_partition(struct parsed_partitions *state,
139 if (!linuxp) 144 if (!linuxp)
140 return -1; 145 return -1;
141 146
142 printk(" <"); 147 strlcat(state->pp_buf, " <", PAGE_SIZE);
143 while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || 148 while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
144 linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { 149 linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
145 if (slot == state->limit) 150 if (slot == state->limit)
@@ -149,7 +154,7 @@ static int linux_partition(struct parsed_partitions *state,
149 le32_to_cpu(linuxp->nr_sects)); 154 le32_to_cpu(linuxp->nr_sects));
150 linuxp ++; 155 linuxp ++;
151 } 156 }
152 printk(" >"); 157 strlcat(state->pp_buf, " >", PAGE_SIZE);
153 158
154 put_dev_sector(sect); 159 put_dev_sector(sect);
155 return slot; 160 return slot;
@@ -294,7 +299,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state)
294 break; 299 break;
295 } 300 }
296 } 301 }
297 printk("\n"); 302 strlcat(state->pp_buf, "\n", PAGE_SIZE);
298 return 1; 303 return 1;
299} 304}
300#endif 305#endif
@@ -367,7 +372,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
367 return 0; 372 return 0;
368 } 373 }
369 374
370 printk(" [ICS]"); 375 strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
371 376
372 for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { 377 for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
373 u32 start = le32_to_cpu(p->start); 378 u32 start = le32_to_cpu(p->start);
@@ -401,7 +406,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
401 } 406 }
402 407
403 put_dev_sector(sect); 408 put_dev_sector(sect);
404 printk("\n"); 409 strlcat(state->pp_buf, "\n", PAGE_SIZE);
405 return 1; 410 return 1;
406} 411}
407#endif 412#endif
@@ -461,7 +466,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
461 return 0; 466 return 0;
462 } 467 }
463 468
464 printk(" [POWERTEC]"); 469 strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
465 470
466 for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { 471 for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
467 u32 start = le32_to_cpu(p->start); 472 u32 start = le32_to_cpu(p->start);
@@ -472,7 +477,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
472 } 477 }
473 478
474 put_dev_sector(sect); 479 put_dev_sector(sect);
475 printk("\n"); 480 strlcat(state->pp_buf, "\n", PAGE_SIZE);
476 return 1; 481 return 1;
477} 482}
478#endif 483#endif
@@ -543,7 +548,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state)
543 548
544 size = get_capacity(state->bdev->bd_disk); 549 size = get_capacity(state->bdev->bd_disk);
545 put_partition(state, slot++, start, size - start); 550 put_partition(state, slot++, start, size - start);
546 printk("\n"); 551 strlcat(state->pp_buf, "\n", PAGE_SIZE);
547 } 552 }
548 553
549 return i ? 1 : 0; 554 return i ? 1 : 0;
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index ba443d4229f..70cbf44a156 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -69,7 +69,13 @@ int amiga_partition(struct parsed_partitions *state)
69 /* blksize is blocks per 512 byte standard block */ 69 /* blksize is blocks per 512 byte standard block */
70 blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; 70 blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
71 71
72 printk(" RDSK (%d)", blksize * 512); /* Be more informative */ 72 {
73 char tmp[7 + 10 + 1 + 1];
74
75 /* Be more informative */
76 snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
77 strlcat(state->pp_buf, tmp, PAGE_SIZE);
78 }
73 blk = be32_to_cpu(rdb->rdb_PartitionList); 79 blk = be32_to_cpu(rdb->rdb_PartitionList);
74 put_dev_sector(sect); 80 put_dev_sector(sect);
75 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { 81 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
@@ -106,23 +112,27 @@ int amiga_partition(struct parsed_partitions *state)
106 { 112 {
107 /* Be even more informative to aid mounting */ 113 /* Be even more informative to aid mounting */
108 char dostype[4]; 114 char dostype[4];
115 char tmp[42];
116
109 __be32 *dt = (__be32 *)dostype; 117 __be32 *dt = (__be32 *)dostype;
110 *dt = pb->pb_Environment[16]; 118 *dt = pb->pb_Environment[16];
111 if (dostype[3] < ' ') 119 if (dostype[3] < ' ')
112 printk(" (%c%c%c^%c)", 120 snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
113 dostype[0], dostype[1], 121 dostype[0], dostype[1],
114 dostype[2], dostype[3] + '@' ); 122 dostype[2], dostype[3] + '@' );
115 else 123 else
116 printk(" (%c%c%c%c)", 124 snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
117 dostype[0], dostype[1], 125 dostype[0], dostype[1],
118 dostype[2], dostype[3]); 126 dostype[2], dostype[3]);
119 printk("(res %d spb %d)", 127 strlcat(state->pp_buf, tmp, PAGE_SIZE);
128 snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
120 be32_to_cpu(pb->pb_Environment[6]), 129 be32_to_cpu(pb->pb_Environment[6]),
121 be32_to_cpu(pb->pb_Environment[4])); 130 be32_to_cpu(pb->pb_Environment[4]));
131 strlcat(state->pp_buf, tmp, PAGE_SIZE);
122 } 132 }
123 res = 1; 133 res = 1;
124 } 134 }
125 printk("\n"); 135 strlcat(state->pp_buf, "\n", PAGE_SIZE);
126 136
127rdb_done: 137rdb_done:
128 return res; 138 return res;
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 4439ff1b6ce..9875b05e80a 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -62,7 +62,7 @@ int atari_partition(struct parsed_partitions *state)
62 } 62 }
63 63
64 pi = &rs->part[0]; 64 pi = &rs->part[0];
65 printk (" AHDI"); 65 strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
66 for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { 66 for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
67 struct rootsector *xrs; 67 struct rootsector *xrs;
68 Sector sect2; 68 Sector sect2;
@@ -81,7 +81,7 @@ int atari_partition(struct parsed_partitions *state)
81#ifdef ICD_PARTS 81#ifdef ICD_PARTS
82 part_fmt = 1; 82 part_fmt = 1;
83#endif 83#endif
84 printk(" XGM<"); 84 strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
85 partsect = extensect = be32_to_cpu(pi->st); 85 partsect = extensect = be32_to_cpu(pi->st);
86 while (1) { 86 while (1) {
87 xrs = read_part_sector(state, partsect, &sect2); 87 xrs = read_part_sector(state, partsect, &sect2);
@@ -120,14 +120,14 @@ int atari_partition(struct parsed_partitions *state)
120 break; 120 break;
121 } 121 }
122 } 122 }
123 printk(" >"); 123 strlcat(state->pp_buf, " >", PAGE_SIZE);
124 } 124 }
125#ifdef ICD_PARTS 125#ifdef ICD_PARTS
126 if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ 126 if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
127 pi = &rs->icdpart[0]; 127 pi = &rs->icdpart[0];
128 /* sanity check: no ICD format if first partition invalid */ 128 /* sanity check: no ICD format if first partition invalid */
129 if (OK_id(pi->id)) { 129 if (OK_id(pi->id)) {
130 printk(" ICD<"); 130 strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
131 for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { 131 for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
132 /* accept only GEM,BGM,RAW,LNX,SWP partitions */ 132 /* accept only GEM,BGM,RAW,LNX,SWP partitions */
133 if (!((pi->flg & 1) && OK_id(pi->id))) 133 if (!((pi->flg & 1) && OK_id(pi->id)))
@@ -137,13 +137,13 @@ int atari_partition(struct parsed_partitions *state)
137 be32_to_cpu(pi->st), 137 be32_to_cpu(pi->st),
138 be32_to_cpu(pi->siz)); 138 be32_to_cpu(pi->siz));
139 } 139 }
140 printk(" >"); 140 strlcat(state->pp_buf, " >", PAGE_SIZE);
141 } 141 }
142 } 142 }
143#endif 143#endif
144 put_dev_sector(sect); 144 put_dev_sector(sect);
145 145
146 printk ("\n"); 146 strlcat(state->pp_buf, "\n", PAGE_SIZE);
147 147
148 return 1; 148 return 1;
149} 149}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 5dcd4b0c553..0a8b0ad0c7e 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -164,10 +164,16 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
164 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); 164 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
165 if (!state) 165 if (!state)
166 return NULL; 166 return NULL;
167 state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
168 if (!state->pp_buf) {
169 kfree(state);
170 return NULL;
171 }
172 state->pp_buf[0] = '\0';
167 173
168 state->bdev = bdev; 174 state->bdev = bdev;
169 disk_name(hd, 0, state->name); 175 disk_name(hd, 0, state->name);
170 printk(KERN_INFO " %s:", state->name); 176 snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
171 if (isdigit(state->name[strlen(state->name)-1])) 177 if (isdigit(state->name[strlen(state->name)-1]))
172 sprintf(state->name, "p"); 178 sprintf(state->name, "p");
173 179
@@ -185,17 +191,25 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
185 } 191 }
186 192
187 } 193 }
188 if (res > 0) 194 if (res > 0) {
195 printk(KERN_INFO "%s", state->pp_buf);
196
197 free_page((unsigned long)state->pp_buf);
189 return state; 198 return state;
199 }
190 if (state->access_beyond_eod) 200 if (state->access_beyond_eod)
191 err = -ENOSPC; 201 err = -ENOSPC;
192 if (err) 202 if (err)
193 /* The partition is unrecognized. So report I/O errors if there were any */ 203 /* The partition is unrecognized. So report I/O errors if there were any */
194 res = err; 204 res = err;
195 if (!res) 205 if (!res)
196 printk(" unknown partition table\n"); 206 strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
197 else if (warn_no_part) 207 else if (warn_no_part)
198 printk(" unable to read partition table\n"); 208 strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
209
210 printk(KERN_INFO "%s", state->pp_buf);
211
212 free_page((unsigned long)state->pp_buf);
199 kfree(state); 213 kfree(state);
200 return ERR_PTR(res); 214 return ERR_PTR(res);
201} 215}
@@ -338,6 +352,7 @@ static void part_release(struct device *dev)
338{ 352{
339 struct hd_struct *p = dev_to_part(dev); 353 struct hd_struct *p = dev_to_part(dev);
340 free_part_stats(p); 354 free_part_stats(p);
355 free_part_info(p);
341 kfree(p); 356 kfree(p);
342} 357}
343 358
@@ -387,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
387 whole_disk_show, NULL); 402 whole_disk_show, NULL);
388 403
389struct hd_struct *add_partition(struct gendisk *disk, int partno, 404struct hd_struct *add_partition(struct gendisk *disk, int partno,
390 sector_t start, sector_t len, int flags) 405 sector_t start, sector_t len, int flags,
406 struct partition_meta_info *info)
391{ 407{
392 struct hd_struct *p; 408 struct hd_struct *p;
393 dev_t devt = MKDEV(0, 0); 409 dev_t devt = MKDEV(0, 0);
@@ -424,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
424 p->partno = partno; 440 p->partno = partno;
425 p->policy = get_disk_ro(disk); 441 p->policy = get_disk_ro(disk);
426 442
443 if (info) {
444 struct partition_meta_info *pinfo = alloc_part_info(disk);
445 if (!pinfo)
446 goto out_free_stats;
447 memcpy(pinfo, info, sizeof(*info));
448 p->info = pinfo;
449 }
450
427 dname = dev_name(ddev); 451 dname = dev_name(ddev);
428 if (isdigit(dname[strlen(dname) - 1])) 452 if (isdigit(dname[strlen(dname) - 1]))
429 dev_set_name(pdev, "%sp%d", dname, partno); 453 dev_set_name(pdev, "%sp%d", dname, partno);
@@ -437,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
437 461
438 err = blk_alloc_devt(p, &devt); 462 err = blk_alloc_devt(p, &devt);
439 if (err) 463 if (err)
440 goto out_free_stats; 464 goto out_free_info;
441 pdev->devt = devt; 465 pdev->devt = devt;
442 466
443 /* delay uevent until 'holders' subdir is created */ 467 /* delay uevent until 'holders' subdir is created */
@@ -459,7 +483,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
459 } 483 }
460 484
461 /* everything is up and running, commence */ 485 /* everything is up and running, commence */
462 INIT_RCU_HEAD(&p->rcu_head);
463 rcu_assign_pointer(ptbl->part[partno], p); 486 rcu_assign_pointer(ptbl->part[partno], p);
464 487
 465 /* suppress uevent if the disk suppresses it */ 488
@@ -468,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
468 491
469 return p; 492 return p;
470 493
494out_free_info:
495 free_part_info(p);
471out_free_stats: 496out_free_stats:
472 free_part_stats(p); 497 free_part_stats(p);
473out_free: 498out_free:
@@ -500,14 +525,14 @@ void register_disk(struct gendisk *disk)
500 525
501 if (device_add(ddev)) 526 if (device_add(ddev))
502 return; 527 return;
503#ifndef CONFIG_SYSFS_DEPRECATED 528 if (!sysfs_deprecated) {
504 err = sysfs_create_link(block_depr, &ddev->kobj, 529 err = sysfs_create_link(block_depr, &ddev->kobj,
505 kobject_name(&ddev->kobj)); 530 kobject_name(&ddev->kobj));
506 if (err) { 531 if (err) {
507 device_del(ddev); 532 device_del(ddev);
508 return; 533 return;
534 }
509 } 535 }
510#endif
511 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); 536 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
512 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); 537 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
513 538
@@ -629,6 +654,7 @@ rescan:
629 /* add partitions */ 654 /* add partitions */
630 for (p = 1; p < state->limit; p++) { 655 for (p = 1; p < state->limit; p++) {
631 sector_t size, from; 656 sector_t size, from;
657 struct partition_meta_info *info = NULL;
632 658
633 size = state->parts[p].size; 659 size = state->parts[p].size;
634 if (!size) 660 if (!size)
@@ -662,8 +688,12 @@ rescan:
662 size = get_capacity(disk) - from; 688 size = get_capacity(disk) - from;
663 } 689 }
664 } 690 }
691
692 if (state->parts[p].has_info)
693 info = &state->parts[p].info;
665 part = add_partition(disk, p, from, size, 694 part = add_partition(disk, p, from, size,
666 state->parts[p].flags); 695 state->parts[p].flags,
 696 info);
667 if (IS_ERR(part)) { 697 if (IS_ERR(part)) {
668 printk(KERN_ERR " %s: p%d could not be added: %ld\n", 698 printk(KERN_ERR " %s: p%d could not be added: %ld\n",
669 disk->disk_name, p, -PTR_ERR(part)); 699 disk->disk_name, p, -PTR_ERR(part));
@@ -724,8 +754,7 @@ void del_gendisk(struct gendisk *disk)
724 kobject_put(disk->part0.holder_dir); 754 kobject_put(disk->part0.holder_dir);
725 kobject_put(disk->slave_dir); 755 kobject_put(disk->slave_dir);
726 disk->driverfs_dev = NULL; 756 disk->driverfs_dev = NULL;
727#ifndef CONFIG_SYSFS_DEPRECATED 757 if (!sysfs_deprecated)
728 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); 758 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
729#endif
730 device_del(disk_to_dev(disk)); 759 device_del(disk_to_dev(disk));
731} 760}
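
The add_partition() changes above plumb an optional struct partition_meta_info from the parser to the hd_struct: a parser fills parts[n].info and sets has_info, the rescan loop passes the pointer through only when has_info is set, and add_partition() copies it into a per-partition allocation that part_release() frees. A sketch of the parser side, with uuid and label as placeholder inputs:

	struct partition_meta_info *info = &state->parts[n].info;

	memcpy(info->uuid, uuid, sizeof(info->uuid));	/* binary UUID */
	strlcpy(info->volname, label, sizeof(info->volname));
	state->parts[n].has_info = true;
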
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 52f8bd39939..d68bf4dc3bc 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
1#include <linux/pagemap.h> 1#include <linux/pagemap.h>
2#include <linux/blkdev.h> 2#include <linux/blkdev.h>
3#include <linux/genhd.h>
3 4
4/* 5/*
 5 * add_gd_partition adds a partition's details to the device's partition 6
@@ -12,10 +13,13 @@ struct parsed_partitions {
12 sector_t from; 13 sector_t from;
13 sector_t size; 14 sector_t size;
14 int flags; 15 int flags;
16 bool has_info;
17 struct partition_meta_info info;
15 } parts[DISK_MAX_PARTS]; 18 } parts[DISK_MAX_PARTS];
16 int next; 19 int next;
17 int limit; 20 int limit;
18 bool access_beyond_eod; 21 bool access_beyond_eod;
22 char *pp_buf;
19}; 23};
20 24
21static inline void *read_part_sector(struct parsed_partitions *state, 25static inline void *read_part_sector(struct parsed_partitions *state,
@@ -32,9 +36,12 @@ static inline void
32put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) 36put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
33{ 37{
34 if (n < p->limit) { 38 if (n < p->limit) {
39 char tmp[1 + BDEVNAME_SIZE + 10 + 1];
40
35 p->parts[n].from = from; 41 p->parts[n].from = from;
36 p->parts[n].size = size; 42 p->parts[n].size = size;
37 printk(" %s%d", p->name, n); 43 snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
44 strlcat(p->pp_buf, tmp, PAGE_SIZE);
38 } 45 }
39} 46}
40 47
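
All of the partition parsers converted in this series follow the output discipline that put_partition() now shows: format each fragment into a worst-case-sized stack buffer with snprintf(), append it to the page-sized state->pp_buf with strlcat(), and let check_partition() emit the accumulated line with a single printk() so concurrent scans can no longer interleave their output. A sketch of one fragment writer, with emit_slot as a hypothetical helper:

	static void emit_slot(struct parsed_partitions *state, int slot)
	{
		/* " %s%d": space + device name + decimal slot + NUL */
		char tmp[1 + BDEVNAME_SIZE + 10 + 1];

		snprintf(tmp, sizeof(tmp), " %s%d", state->name, slot);
		strlcat(state->pp_buf, tmp, PAGE_SIZE);
	}
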
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 9efb2cfe241..ac0ccb5026a 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
94 * 94 *
95 ************************************************************/ 95 ************************************************************/
96#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/ctype.h>
97#include <linux/math64.h> 98#include <linux/math64.h>
98#include <linux/slab.h> 99#include <linux/slab.h>
99#include "check.h" 100#include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
604 gpt_entry *ptes = NULL; 605 gpt_entry *ptes = NULL;
605 u32 i; 606 u32 i;
606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512; 607 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
608 u8 unparsed_guid[37];
607 609
608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { 610 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
609 kfree(gpt); 611 kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
614 pr_debug("GUID Partition Table is valid! Yea!\n"); 616 pr_debug("GUID Partition Table is valid! Yea!\n");
615 617
616 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 618 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
619 struct partition_meta_info *info;
620 unsigned label_count = 0;
621 unsigned label_max;
617 u64 start = le64_to_cpu(ptes[i].starting_lba); 622 u64 start = le64_to_cpu(ptes[i].starting_lba);
618 u64 size = le64_to_cpu(ptes[i].ending_lba) - 623 u64 size = le64_to_cpu(ptes[i].ending_lba) -
619 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 624 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,9 +632,29 @@ int efi_partition(struct parsed_partitions *state)
627 if (!efi_guidcmp(ptes[i].partition_type_guid, 632 if (!efi_guidcmp(ptes[i].partition_type_guid,
628 PARTITION_LINUX_RAID_GUID)) 633 PARTITION_LINUX_RAID_GUID))
629 state->parts[i + 1].flags = ADDPART_FLAG_RAID; 634 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
635
636 info = &state->parts[i + 1].info;
637 /* Instead of doing a manual swap to big endian, reuse the
638 * common ASCII hex format as the interim.
639 */
640 efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
641 part_pack_uuid(unparsed_guid, info->uuid);
642
643 /* Naively convert UTF16-LE to 7 bits. */
644 label_max = min(sizeof(info->volname) - 1,
645 sizeof(ptes[i].partition_name));
646 info->volname[label_max] = 0;
647 while (label_count < label_max) {
648 u8 c = ptes[i].partition_name[label_count] & 0xff;
649 if (c && !isprint(c))
650 c = '!';
651 info->volname[label_count] = c;
652 label_count++;
653 }
654 state->parts[i + 1].has_info = true;
630 } 655 }
631 kfree(ptes); 656 kfree(ptes);
632 kfree(gpt); 657 kfree(gpt);
633 printk("\n"); 658 strlcat(state->pp_buf, "\n", PAGE_SIZE);
634 return 1; 659 return 1;
635} 660}
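
The label copy above deliberately keeps only the low byte of each UTF-16LE code unit. A standalone C sketch of that naive conversion, assuming the code units are already in host order and dstlen is nonzero (a real converter would handle surrogates and multi-byte characters; this mirrors the hunk's 7-bit policy):

	#include <ctype.h>
	#include <stddef.h>
	#include <stdint.h>

	static void utf16le_to_7bit(const uint16_t *src, size_t nunits,
				    char *dst, size_t dstlen)
	{
		size_t i, n;

		if (!dstlen)
			return;
		n = nunits < dstlen - 1 ? nunits : dstlen - 1;
		for (i = 0; i < n; i++) {
			uint8_t c = src[i] & 0xff;	/* drop the high byte */

			dst[i] = (c && !isprint(c)) ? '!' : c;
		}
		dst[n] = '\0';	/* always NUL-terminate */
	}
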
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc8497643fd..d513a07f44b 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -75,6 +75,7 @@ int ibm_partition(struct parsed_partitions *state)
75 unsigned char *data; 75 unsigned char *data;
76 Sector sect; 76 Sector sect;
77 sector_t labelsect; 77 sector_t labelsect;
78 char tmp[64];
78 79
79 res = 0; 80 res = 0;
80 blocksize = bdev_logical_block_size(bdev); 81 blocksize = bdev_logical_block_size(bdev);
@@ -144,13 +145,15 @@ int ibm_partition(struct parsed_partitions *state)
144 */ 145 */
145 blocksize = label->cms.block_size; 146 blocksize = label->cms.block_size;
146 if (label->cms.disk_offset != 0) { 147 if (label->cms.disk_offset != 0) {
147 printk("CMS1/%8s(MDSK):", name); 148 snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
149 strlcat(state->pp_buf, tmp, PAGE_SIZE);
148 /* disk is reserved minidisk */ 150 /* disk is reserved minidisk */
149 offset = label->cms.disk_offset; 151 offset = label->cms.disk_offset;
150 size = (label->cms.block_count - 1) 152 size = (label->cms.block_count - 1)
151 * (blocksize >> 9); 153 * (blocksize >> 9);
152 } else { 154 } else {
153 printk("CMS1/%8s:", name); 155 snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
156 strlcat(state->pp_buf, tmp, PAGE_SIZE);
154 offset = (info->label_block + 1); 157 offset = (info->label_block + 1);
155 size = label->cms.block_count 158 size = label->cms.block_count
156 * (blocksize >> 9); 159 * (blocksize >> 9);
@@ -159,7 +162,8 @@ int ibm_partition(struct parsed_partitions *state)
159 size-offset*(blocksize >> 9)); 162 size-offset*(blocksize >> 9));
160 } else { 163 } else {
161 if (strncmp(type, "LNX1", 4) == 0) { 164 if (strncmp(type, "LNX1", 4) == 0) {
162 printk("LNX1/%8s:", name); 165 snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
166 strlcat(state->pp_buf, tmp, PAGE_SIZE);
163 if (label->lnx.ldl_version == 0xf2) { 167 if (label->lnx.ldl_version == 0xf2) {
164 fmt_size = label->lnx.formatted_blocks 168 fmt_size = label->lnx.formatted_blocks
165 * (blocksize >> 9); 169 * (blocksize >> 9);
@@ -178,7 +182,7 @@ int ibm_partition(struct parsed_partitions *state)
178 offset = (info->label_block + 1); 182 offset = (info->label_block + 1);
179 } else { 183 } else {
180 /* unlabeled disk */ 184 /* unlabeled disk */
181 printk("(nonl)"); 185 strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
182 size = i_size >> 9; 186 size = i_size >> 9;
183 offset = (info->label_block + 1); 187 offset = (info->label_block + 1);
184 } 188 }
@@ -197,7 +201,8 @@ int ibm_partition(struct parsed_partitions *state)
197 * if not, something is wrong, skipping partition detection 201 * if not, something is wrong, skipping partition detection
198 */ 202 */
199 if (strncmp(type, "VOL1", 4) == 0) { 203 if (strncmp(type, "VOL1", 4) == 0) {
200 printk("VOL1/%8s:", name); 204 snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
205 strlcat(state->pp_buf, tmp, PAGE_SIZE);
201 /* 206 /*
202 * get block number and read then go through format1 207 * get block number and read then go through format1
203 * labels 208 * labels
@@ -253,7 +258,7 @@ int ibm_partition(struct parsed_partitions *state)
253 258
254 } 259 }
255 260
256 printk("\n"); 261 strlcat(state->pp_buf, "\n", PAGE_SIZE);
257 goto out_freeall; 262 goto out_freeall;
258 263
259 264
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 1cc928bb762..0ea19312706 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -50,7 +50,7 @@ int karma_partition(struct parsed_partitions *state)
50 } 50 }
51 slot++; 51 slot++;
52 } 52 }
53 printk("\n"); 53 strlcat(state->pp_buf, "\n", PAGE_SIZE);
54 put_dev_sector(sect); 54 put_dev_sector(sect);
55 return 1; 55 return 1;
56} 56}
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 648c9d8f335..789c625c7aa 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it under 10 * This program is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU General Public License as published by the Free Software 11 * the terms of the GNU General Public License as published by the Free Software
@@ -643,7 +643,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
643 return false; 643 return false;
644 } 644 }
645 645
646 printk (" [LDM]"); 646 strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
647 647
648 /* Create the data partitions */ 648 /* Create the data partitions */
649 list_for_each (item, &ldb->v_part) { 649 list_for_each (item, &ldb->v_part) {
@@ -658,7 +658,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
658 part_num++; 658 part_num++;
659 } 659 }
660 660
661 printk ("\n"); 661 strlcat(pp->pp_buf, "\n", PAGE_SIZE);
662 return true; 662 return true;
663} 663}
664 664
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d1fb50b28d8..374242c0971 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it 10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free 11 * under the terms of the GNU General Public License as published by the Free
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 74465ff7c26..68d6a216ee7 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -59,7 +59,7 @@ int mac_partition(struct parsed_partitions *state)
59 put_dev_sector(sect); 59 put_dev_sector(sect);
60 return 0; /* not a MacOS disk */ 60 return 0; /* not a MacOS disk */
61 } 61 }
62 printk(" [mac]"); 62 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
63 blocks_in_map = be32_to_cpu(part->map_count); 63 blocks_in_map = be32_to_cpu(part->map_count);
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
@@ -128,6 +128,6 @@ int mac_partition(struct parsed_partitions *state)
128#endif 128#endif
129 129
130 put_dev_sector(sect); 130 put_dev_sector(sect);
131 printk("\n"); 131 strlcat(state->pp_buf, "\n", PAGE_SIZE);
132 return 1; 132 return 1;
133} 133}
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 15bfb7b1e04..5f79a6677c6 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -213,10 +213,18 @@ static void parse_solaris_x86(struct parsed_partitions *state,
213 put_dev_sector(sect); 213 put_dev_sector(sect);
214 return; 214 return;
215 } 215 }
216 printk(" %s%d: <solaris:", state->name, origin); 216 {
217 char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
218
219 snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
220 strlcat(state->pp_buf, tmp, PAGE_SIZE);
221 }
217 if (le32_to_cpu(v->v_version) != 1) { 222 if (le32_to_cpu(v->v_version) != 1) {
218 printk(" cannot handle version %d vtoc>\n", 223 char tmp[64];
219 le32_to_cpu(v->v_version)); 224
225 snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
226 le32_to_cpu(v->v_version));
227 strlcat(state->pp_buf, tmp, PAGE_SIZE);
220 put_dev_sector(sect); 228 put_dev_sector(sect);
221 return; 229 return;
222 } 230 }
@@ -224,9 +232,12 @@ static void parse_solaris_x86(struct parsed_partitions *state,
224 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; 232 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
225 for (i=0; i<max_nparts && state->next<state->limit; i++) { 233 for (i=0; i<max_nparts && state->next<state->limit; i++) {
226 struct solaris_x86_slice *s = &v->v_slice[i]; 234 struct solaris_x86_slice *s = &v->v_slice[i];
235 char tmp[3 + 10 + 1 + 1];
236
227 if (s->s_size == 0) 237 if (s->s_size == 0)
228 continue; 238 continue;
229 printk(" [s%d]", i); 239 snprintf(tmp, sizeof(tmp), " [s%d]", i);
240 strlcat(state->pp_buf, tmp, PAGE_SIZE);
230 /* solaris partitions are relative to current MS-DOS 241 /* solaris partitions are relative to current MS-DOS
231 * one; must add the offset of the current partition */ 242 * one; must add the offset of the current partition */
232 put_partition(state, state->next++, 243 put_partition(state, state->next++,
@@ -234,7 +245,7 @@ static void parse_solaris_x86(struct parsed_partitions *state,
234 le32_to_cpu(s->s_size)); 245 le32_to_cpu(s->s_size));
235 } 246 }
236 put_dev_sector(sect); 247 put_dev_sector(sect);
237 printk(" >\n"); 248 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
238#endif 249#endif
239} 250}
240 251
@@ -250,6 +261,7 @@ static void parse_bsd(struct parsed_partitions *state,
250 Sector sect; 261 Sector sect;
251 struct bsd_disklabel *l; 262 struct bsd_disklabel *l;
252 struct bsd_partition *p; 263 struct bsd_partition *p;
264 char tmp[64];
253 265
254 l = read_part_sector(state, offset + 1, &sect); 266 l = read_part_sector(state, offset + 1, &sect);
255 if (!l) 267 if (!l)
@@ -258,7 +270,9 @@ static void parse_bsd(struct parsed_partitions *state,
258 put_dev_sector(sect); 270 put_dev_sector(sect);
259 return; 271 return;
260 } 272 }
261 printk(" %s%d: <%s:", state->name, origin, flavour); 273
274 snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
275 strlcat(state->pp_buf, tmp, PAGE_SIZE);
262 276
263 if (le16_to_cpu(l->d_npartitions) < max_partitions) 277 if (le16_to_cpu(l->d_npartitions) < max_partitions)
264 max_partitions = le16_to_cpu(l->d_npartitions); 278 max_partitions = le16_to_cpu(l->d_npartitions);
@@ -275,16 +289,18 @@ static void parse_bsd(struct parsed_partitions *state,
275 /* full parent partition, we have it already */ 289 /* full parent partition, we have it already */
276 continue; 290 continue;
277 if (offset > bsd_start || offset+size < bsd_start+bsd_size) { 291 if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
278 printk("bad subpartition - ignored\n"); 292 strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
279 continue; 293 continue;
280 } 294 }
281 put_partition(state, state->next++, bsd_start, bsd_size); 295 put_partition(state, state->next++, bsd_start, bsd_size);
282 } 296 }
283 put_dev_sector(sect); 297 put_dev_sector(sect);
284 if (le16_to_cpu(l->d_npartitions) > max_partitions) 298 if (le16_to_cpu(l->d_npartitions) > max_partitions) {
285 printk(" (ignored %d more)", 299 snprintf(tmp, sizeof(tmp), " (ignored %d more)",
286 le16_to_cpu(l->d_npartitions) - max_partitions); 300 le16_to_cpu(l->d_npartitions) - max_partitions);
287 printk(" >\n"); 301 strlcat(state->pp_buf, tmp, PAGE_SIZE);
302 }
303 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
288} 304}
289#endif 305#endif
290 306
@@ -333,7 +349,12 @@ static void parse_unixware(struct parsed_partitions *state,
333 put_dev_sector(sect); 349 put_dev_sector(sect);
334 return; 350 return;
335 } 351 }
336 printk(" %s%d: <unixware:", state->name, origin); 352 {
353 char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
354
355 snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
356 strlcat(state->pp_buf, tmp, PAGE_SIZE);
357 }
337 p = &l->vtoc.v_slice[1]; 358 p = &l->vtoc.v_slice[1];
338 /* I omit the 0th slice as it is the same as whole disk. */ 359 /* I omit the 0th slice as it is the same as whole disk. */
339 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { 360 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
@@ -347,7 +368,7 @@ static void parse_unixware(struct parsed_partitions *state,
347 p++; 368 p++;
348 } 369 }
349 put_dev_sector(sect); 370 put_dev_sector(sect);
350 printk(" >\n"); 371 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
351#endif 372#endif
352} 373}
353 374
@@ -376,8 +397,10 @@ static void parse_minix(struct parsed_partitions *state,
376 * the normal boot sector. */ 397 * the normal boot sector. */
377 if (msdos_magic_present (data + 510) && 398 if (msdos_magic_present (data + 510) &&
378 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ 399 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
400 char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
379 401
380 printk(" %s%d: <minix:", state->name, origin); 402 snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
403 strlcat(state->pp_buf, tmp, PAGE_SIZE);
381 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { 404 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
382 if (state->next == state->limit) 405 if (state->next == state->limit)
383 break; 406 break;
@@ -386,7 +409,7 @@ static void parse_minix(struct parsed_partitions *state,
386 put_partition(state, state->next++, 409 put_partition(state, state->next++,
387 start_sect(p), nr_sects(p)); 410 start_sect(p), nr_sects(p));
388 } 411 }
389 printk(" >\n"); 412 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
390 } 413 }
391 put_dev_sector(sect); 414 put_dev_sector(sect);
392#endif /* CONFIG_MINIX_SUBPARTITION */ 415#endif /* CONFIG_MINIX_SUBPARTITION */
@@ -425,7 +448,7 @@ int msdos_partition(struct parsed_partitions *state)
425 448
426 if (aix_magic_present(state, data)) { 449 if (aix_magic_present(state, data)) {
427 put_dev_sector(sect); 450 put_dev_sector(sect);
428 printk( " [AIX]"); 451 strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
429 return 0; 452 return 0;
430 } 453 }
431 454
@@ -446,7 +469,7 @@ int msdos_partition(struct parsed_partitions *state)
446 fb = (struct fat_boot_sector *) data; 469 fb = (struct fat_boot_sector *) data;
447 if (slot == 1 && fb->reserved && fb->fats 470 if (slot == 1 && fb->reserved && fb->fats
448 && fat_valid_media(fb->media)) { 471 && fat_valid_media(fb->media)) {
449 printk("\n"); 472 strlcat(state->pp_buf, "\n", PAGE_SIZE);
450 put_dev_sector(sect); 473 put_dev_sector(sect);
451 return 1; 474 return 1;
452 } else { 475 } else {
@@ -491,21 +514,21 @@ int msdos_partition(struct parsed_partitions *state)
491 n = min(size, max(sector_size, n)); 514 n = min(size, max(sector_size, n));
492 put_partition(state, slot, start, n); 515 put_partition(state, slot, start, n);
493 516
494 printk(" <"); 517 strlcat(state->pp_buf, " <", PAGE_SIZE);
495 parse_extended(state, start, size); 518 parse_extended(state, start, size);
496 printk(" >"); 519 strlcat(state->pp_buf, " >", PAGE_SIZE);
497 continue; 520 continue;
498 } 521 }
499 put_partition(state, slot, start, size); 522 put_partition(state, slot, start, size);
500 if (SYS_IND(p) == LINUX_RAID_PARTITION) 523 if (SYS_IND(p) == LINUX_RAID_PARTITION)
501 state->parts[slot].flags = ADDPART_FLAG_RAID; 524 state->parts[slot].flags = ADDPART_FLAG_RAID;
502 if (SYS_IND(p) == DM6_PARTITION) 525 if (SYS_IND(p) == DM6_PARTITION)
503 printk("[DM]"); 526 strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
504 if (SYS_IND(p) == EZD_PARTITION) 527 if (SYS_IND(p) == EZD_PARTITION)
505 printk("[EZD]"); 528 strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
506 } 529 }
507 530
508 printk("\n"); 531 strlcat(state->pp_buf, "\n", PAGE_SIZE);
509 532
510 /* second pass - output for each on a separate line */ 533 /* second pass - output for each on a separate line */
511 p = (struct partition *) (0x1be + data); 534 p = (struct partition *) (0x1be + data);
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index fc22b85d436..48cec7cbca1 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -72,7 +72,7 @@ int osf_partition(struct parsed_partitions *state)
72 le32_to_cpu(partition->p_size)); 72 le32_to_cpu(partition->p_size));
73 slot++; 73 slot++;
74 } 74 }
75 printk("\n"); 75 strlcat(state->pp_buf, "\n", PAGE_SIZE);
76 put_dev_sector(sect); 76 put_dev_sector(sect);
77 return 1; 77 return 1;
78} 78}
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index 43b1df9aa16..ea8a86dceaf 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -76,7 +76,7 @@ int sgi_partition(struct parsed_partitions *state)
76 } 76 }
77 slot++; 77 slot++;
78 } 78 }
79 printk("\n"); 79 strlcat(state->pp_buf, "\n", PAGE_SIZE);
80 put_dev_sector(sect); 80 put_dev_sector(sect);
81 return 1; 81 return 1;
82} 82}
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index a32660e25f7..b5b6fcfb3d3 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -116,7 +116,7 @@ int sun_partition(struct parsed_partitions *state)
116 } 116 }
117 slot++; 117 slot++;
118 } 118 }
119 printk("\n"); 119 strlcat(state->pp_buf, "\n", PAGE_SIZE);
120 put_dev_sector(sect); 120 put_dev_sector(sect);
121 return 1; 121 return 1;
122} 122}
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 9030c864428..9627ccffc1c 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -54,6 +54,7 @@ int sysv68_partition(struct parsed_partitions *state)
54 unsigned char *data; 54 unsigned char *data;
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 char tmp[64];
57 58
58 data = read_part_sector(state, 0, &sect); 59 data = read_part_sector(state, 0, &sect);
59 if (!data) 60 if (!data)
@@ -73,7 +74,8 @@ int sysv68_partition(struct parsed_partitions *state)
73 return -1; 74 return -1;
74 75
75 slices -= 1; /* last slice is the whole disk */ 76 slices -= 1; /* last slice is the whole disk */
76 printk("sysV68: %s(s%u)", state->name, slices); 77 snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
78 strlcat(state->pp_buf, tmp, PAGE_SIZE);
77 slice = (struct slice *)data; 79 slice = (struct slice *)data;
78 for (i = 0; i < slices; i++, slice++) { 80 for (i = 0; i < slices; i++, slice++) {
79 if (slot == state->limit) 81 if (slot == state->limit)
@@ -82,11 +84,12 @@ int sysv68_partition(struct parsed_partitions *state)
82 put_partition(state, slot, 84 put_partition(state, slot,
83 be32_to_cpu(slice->blkoff), 85 be32_to_cpu(slice->blkoff),
84 be32_to_cpu(slice->nblocks)); 86 be32_to_cpu(slice->nblocks));
85 printk("(s%u)", i); 87 snprintf(tmp, sizeof(tmp), "(s%u)", i);
88 strlcat(state->pp_buf, tmp, PAGE_SIZE);
86 } 89 }
87 slot++; 90 slot++;
88 } 91 }
89 printk("\n"); 92 strlcat(state->pp_buf, "\n", PAGE_SIZE);
90 put_dev_sector(sect); 93 put_dev_sector(sect);
91 return 1; 94 return 1;
92} 95}
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index db9eef26036..8dbaf9f77a9 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state)
39 label->pt_part[i].pi_blkoff, 39 label->pt_part[i].pi_blkoff,
40 label->pt_part[i].pi_nblocks); 40 label->pt_part[i].pi_nblocks);
41 put_dev_sector(sect); 41 put_dev_sector(sect);
42 printk ("\n"); 42 strlcat(state->pp_buf, "\n", PAGE_SIZE);
43 return 1; 43 return 1;
44 } else { 44 } else {
45 put_dev_sector(sect); 45 put_dev_sector(sect);
diff --git a/fs/pipe.c b/fs/pipe.c
index 279eef96c51..a8012a95572 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -382,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
382 error = ops->confirm(pipe, buf); 382 error = ops->confirm(pipe, buf);
383 if (error) { 383 if (error) {
384 if (!ret) 384 if (!ret)
385 error = ret; 385 ret = error;
386 break; 386 break;
387 } 387 }
388 388
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
954 if (!inode) 954 if (!inode)
955 goto fail_inode; 955 goto fail_inode;
956 956
957 inode->i_ino = get_next_ino();
958
957 pipe = alloc_pipe_info(inode); 959 pipe = alloc_pipe_info(inode);
958 if (!pipe) 960 if (!pipe)
959 goto fail_iput; 961 goto fail_iput;
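
get_next_ino() is assumed here to be the common inode-number allocator this series adds in fs/inode.c for virtual filesystems; callers that previously relied on new_inode() picking an i_ino now assign one explicitly:

	/* Pattern repeated below for procfs and sysctl inodes (sketch): */
	inode = new_inode(sb);
	if (inode)
		inode->i_ino = get_next_ino();	/* per-CPU batched counter */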
@@ -1245,16 +1247,15 @@ out:
1245 * any operations on the root directory. However, we need a non-trivial 1247 * any operations on the root directory. However, we need a non-trivial
1246 * d_name - pipe: will go nicely and kill the special-casing in procfs. 1248 * d_name - pipe: will go nicely and kill the special-casing in procfs.
1247 */ 1249 */
1248static int pipefs_get_sb(struct file_system_type *fs_type, 1250static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1249 int flags, const char *dev_name, void *data, 1251 int flags, const char *dev_name, void *data)
1250 struct vfsmount *mnt)
1251{ 1252{
1252 return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); 1253 return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
1253} 1254}
1254 1255
1255static struct file_system_type pipe_fs_type = { 1256static struct file_system_type pipe_fs_type = {
1256 .name = "pipefs", 1257 .name = "pipefs",
1257 .get_sb = pipefs_get_sb, 1258 .mount = pipefs_mount,
1258 .kill_sb = kill_anon_super, 1259 .kill_sb = kill_anon_super,
1259}; 1260};
1260 1261
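
Two distinct fixes meet in pipe.c: pipe_read() stops clobbering the error code with the zero byte count in the ->confirm failure path (old: error = ret; fixed: ret = error;), and pipefs picks up the tree-wide ->get_sb to ->mount conversion, where the callback returns the root dentry or an ERR_PTR instead of filling in a struct vfsmount. A sketch of the converted shape for a pseudo filesystem; EXAMPLEFS_MAGIC stands in for a real magic number:

	static struct dentry *examplefs_mount(struct file_system_type *fs_type,
					      int flags, const char *dev_name,
					      void *data)
	{
		/* mount_pseudo() is the ->mount counterpart of get_sb_pseudo() */
		return mount_pseudo(fs_type, "example:", NULL, EXAMPLEFS_MAGIC);
	}

	static struct file_system_type example_fs_type = {
		.name		= "examplefs",
		.mount		= examplefs_mount,
		.kill_sb	= kill_anon_super,
	};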
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a8314..8066b8dd748 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
126 return 0; 126 return 0;
127} 127}
128 128
129/*
130 * vfsmount lock must be held for write
131 */
129void change_mnt_propagation(struct vfsmount *mnt, int type) 132void change_mnt_propagation(struct vfsmount *mnt, int type)
130{ 133{
131 if (type == MS_SHARED) { 134 if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
270 prev_src_mnt = child; 273 prev_src_mnt = child;
271 } 274 }
272out: 275out:
273 spin_lock(&vfsmount_lock); 276 br_write_lock(vfsmount_lock);
274 while (!list_empty(&tmp_list)) { 277 while (!list_empty(&tmp_list)) {
275 child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); 278 child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
276 umount_tree(child, 0, &umount_list); 279 umount_tree(child, 0, &umount_list);
277 } 280 }
278 spin_unlock(&vfsmount_lock); 281 br_write_unlock(vfsmount_lock);
279 release_mounts(&umount_list); 282 release_mounts(&umount_list);
280 return ret; 283 return ret;
281} 284}
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
296 * other mounts its parent propagates to. 299 * other mounts its parent propagates to.
297 * Check if any of these mounts that **do not have submounts** 300 * Check if any of these mounts that **do not have submounts**
298 * have more references than 'refcnt'. If so return busy. 301 * have more references than 'refcnt'. If so return busy.
302 *
303 * vfsmount lock must be held for read or write
299 */ 304 */
300int propagate_mount_busy(struct vfsmount *mnt, int refcnt) 305int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
301{ 306{
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
353 * collect all mounts that receive propagation from the mount in @list, 358 * collect all mounts that receive propagation from the mount in @list,
354 * and return these additional mounts in the same list. 359 * and return these additional mounts in the same list.
355 * @list: the list of mounts to be unmounted. 360 * @list: the list of mounts to be unmounted.
361 *
362 * vfsmount lock must be held for write
356 */ 363 */
357int propagate_umount(struct list_head *list) 364int propagate_umount(struct list_head *list)
358{ 365{
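
vfsmount_lock is now a big-reader ("br") lock rather than a plain spinlock: writers such as propagate_mnt() above take br_write_lock()/br_write_unlock(), while hot lookup paths can take a cheap per-CPU read side. The comments added to change_mnt_propagation(), propagate_mount_busy() and propagate_umount() record which side each caller must hold. Illustrative reader side (sketch):

	br_read_lock(vfsmount_lock);	/* per-CPU, no shared cacheline */
	/* ... walk mount hash / propagation lists ... */
	br_read_unlock(vfsmount_lock);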
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f0..6a0068841d9 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -33,8 +33,8 @@ config PROC_KCORE
33 depends on PROC_FS && MMU 33 depends on PROC_FS && MMU
34 34
35config PROC_VMCORE 35config PROC_VMCORE
36 bool "/proc/vmcore support (EXPERIMENTAL)" 36 bool "/proc/vmcore support"
37 depends on PROC_FS && CRASH_DUMP 37 depends on PROC_FS && CRASH_DUMP
38 default y 38 default y
39 help 39 help
40 Exports the dump image of crashed kernel in ELF format. 40 Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 11a7b5c6815..2758e2afc51 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the Linux proc filesystem routines. 2# Makefile for the Linux proc filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_PROC_FS) += proc.o 5obj-y += proc.o
6 6
7proc-y := nommu.o task_nommu.o 7proc-y := nommu.o task_nommu.o
8proc-$(CONFIG_MMU) := mmu.o task_mmu.o 8proc-$(CONFIG_MMU) := mmu.o task_mmu.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef80ea4..f3d02ca461e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -63,6 +63,7 @@
63#include <linux/namei.h> 63#include <linux/namei.h>
64#include <linux/mnt_namespace.h> 64#include <linux/mnt_namespace.h>
65#include <linux/mm.h> 65#include <linux/mm.h>
66#include <linux/swap.h>
66#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
67#include <linux/kallsyms.h> 68#include <linux/kallsyms.h>
68#include <linux/stacktrace.h> 69#include <linux/stacktrace.h>
@@ -148,18 +149,13 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
148 return count; 149 return count;
149} 150}
150 151
151static int get_fs_path(struct task_struct *task, struct path *path, bool root) 152static int get_task_root(struct task_struct *task, struct path *root)
152{ 153{
153 struct fs_struct *fs;
154 int result = -ENOENT; 154 int result = -ENOENT;
155 155
156 task_lock(task); 156 task_lock(task);
157 fs = task->fs; 157 if (task->fs) {
158 if (fs) { 158 get_fs_root(task->fs, root);
159 read_lock(&fs->lock);
160 *path = root ? fs->root : fs->pwd;
161 path_get(path);
162 read_unlock(&fs->lock);
163 result = 0; 159 result = 0;
164 } 160 }
165 task_unlock(task); 161 task_unlock(task);
@@ -172,7 +168,12 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
172 int result = -ENOENT; 168 int result = -ENOENT;
173 169
174 if (task) { 170 if (task) {
175 result = get_fs_path(task, path, 0); 171 task_lock(task);
172 if (task->fs) {
173 get_fs_pwd(task->fs, path);
174 result = 0;
175 }
176 task_unlock(task);
176 put_task_struct(task); 177 put_task_struct(task);
177 } 178 }
178 return result; 179 return result;
@@ -184,7 +185,7 @@ static int proc_root_link(struct inode *inode, struct path *path)
184 int result = -ENOENT; 185 int result = -ENOENT;
185 186
186 if (task) { 187 if (task) {
187 result = get_fs_path(task, path, 1); 188 result = get_task_root(task, path);
188 put_task_struct(task); 189 put_task_struct(task);
189 } 190 }
190 return result; 191 return result;
@@ -225,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
225{ 226{
226 struct mm_struct *mm; 227 struct mm_struct *mm;
227 228
228 if (mutex_lock_killable(&task->cred_guard_mutex)) 229 if (mutex_lock_killable(&task->signal->cred_guard_mutex))
229 return NULL; 230 return NULL;
230 231
231 mm = get_task_mm(task); 232 mm = get_task_mm(task);
@@ -234,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
234 mmput(mm); 235 mmput(mm);
235 mm = NULL; 236 mm = NULL;
236 } 237 }
237 mutex_unlock(&task->cred_guard_mutex); 238 mutex_unlock(&task->signal->cred_guard_mutex);
238 239
239 return mm; 240 return mm;
240} 241}
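
cred_guard_mutex moves from task_struct into the shared signal_struct, making the exec/ptrace guard per thread group instead of per thread; every procfs locker is updated to reach it through task->signal. The locking idiom itself is unchanged (sketch):

	if (mutex_lock_killable(&task->signal->cred_guard_mutex))
		return NULL;	/* interrupted by a fatal signal */
	/* ... examine task->mm knowing exec() cannot race in ... */
	mutex_unlock(&task->signal->cred_guard_mutex);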
@@ -427,17 +428,14 @@ static const struct file_operations proc_lstats_operations = {
427 428
428#endif 429#endif
429 430
430/* The badness from the OOM killer */
431unsigned long badness(struct task_struct *p, unsigned long uptime);
432static int proc_oom_score(struct task_struct *task, char *buffer) 431static int proc_oom_score(struct task_struct *task, char *buffer)
433{ 432{
434 unsigned long points = 0; 433 unsigned long points = 0;
435 struct timespec uptime;
436 434
437 do_posix_clock_monotonic_gettime(&uptime);
438 read_lock(&tasklist_lock); 435 read_lock(&tasklist_lock);
439 if (pid_alive(task)) 436 if (pid_alive(task))
440 points = badness(task, uptime.tv_sec); 437 points = oom_badness(task, NULL, NULL,
438 totalram_pages + total_swap_pages);
441 read_unlock(&tasklist_lock); 439 read_unlock(&tasklist_lock);
442 return sprintf(buffer, "%lu\n", points); 440 return sprintf(buffer, "%lu\n", points);
443} 441}
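
/proc/<pid>/oom_score now calls oom_badness() directly rather than the old badness()-plus-uptime pairing. The prototype below is assumed from the matching mm/oom_kill.c rewrite in this series; points are normalized against the supplied page total, here totalram_pages + total_swap_pages:

	/* Assumed prototype; the score lands in the range 0..1000 */
	unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
				 const nodemask_t *nodemask,
				 unsigned long totalpages);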
@@ -561,9 +559,19 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr)
561 return -EPERM; 559 return -EPERM;
562 560
563 error = inode_change_ok(inode, attr); 561 error = inode_change_ok(inode, attr);
564 if (!error) 562 if (error)
565 error = inode_setattr(inode, attr); 563 return error;
566 return error; 564
565 if ((attr->ia_valid & ATTR_SIZE) &&
566 attr->ia_size != i_size_read(inode)) {
567 error = vmtruncate(inode, attr->ia_size);
568 if (error)
569 return error;
570 }
571
572 setattr_copy(inode, attr);
573 mark_inode_dirty(inode);
574 return 0;
567} 575}
568 576
569static const struct inode_operations proc_def_inode_operations = { 577static const struct inode_operations proc_def_inode_operations = {
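
This is the standard open-coded replacement for the removed inode_setattr() helper, and the same shape reappears in proc/generic.c and proc/proc_sysctl.c further down: validate, truncate explicitly when ATTR_SIZE actually changes the size, then copy the remaining attributes and dirty the inode. The skeleton, lifted from the hunk above:

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}

	setattr_copy(inode, attr);	/* uid/gid/mode/times */
	mark_inode_dirty(inode);
	return 0;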
@@ -589,7 +597,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
589 get_mnt_ns(ns); 597 get_mnt_ns(ns);
590 } 598 }
591 rcu_read_unlock(); 599 rcu_read_unlock();
592 if (ns && get_fs_path(task, &root, 1) == 0) 600 if (ns && get_task_root(task, &root) == 0)
593 ret = 0; 601 ret = 0;
594 put_task_struct(task); 602 put_task_struct(task);
595 } 603 }
@@ -763,6 +771,8 @@ static const struct file_operations proc_single_file_operations = {
763static int mem_open(struct inode* inode, struct file* file) 771static int mem_open(struct inode* inode, struct file* file)
764{ 772{
765 file->private_data = (void*)((long)current->self_exec_id); 773 file->private_data = (void*)((long)current->self_exec_id);
774 /* OK to pass negative loff_t, we can catch out-of-range */
775 file->f_mode |= FMODE_UNSIGNED_OFFSET;
766 return 0; 776 return 0;
767} 777}
768 778
@@ -1015,36 +1025,74 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1015 memset(buffer, 0, sizeof(buffer)); 1025 memset(buffer, 0, sizeof(buffer));
1016 if (count > sizeof(buffer) - 1) 1026 if (count > sizeof(buffer) - 1)
1017 count = sizeof(buffer) - 1; 1027 count = sizeof(buffer) - 1;
1018 if (copy_from_user(buffer, buf, count)) 1028 if (copy_from_user(buffer, buf, count)) {
1019 return -EFAULT; 1029 err = -EFAULT;
1030 goto out;
1031 }
1020 1032
1021 err = strict_strtol(strstrip(buffer), 0, &oom_adjust); 1033 err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
1022 if (err) 1034 if (err)
1023 return -EINVAL; 1035 goto out;
1024 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && 1036 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1025 oom_adjust != OOM_DISABLE) 1037 oom_adjust != OOM_DISABLE) {
1026 return -EINVAL; 1038 err = -EINVAL;
1039 goto out;
1040 }
1027 1041
1028 task = get_proc_task(file->f_path.dentry->d_inode); 1042 task = get_proc_task(file->f_path.dentry->d_inode);
1029 if (!task) 1043 if (!task) {
1030 return -ESRCH; 1044 err = -ESRCH;
1045 goto out;
1046 }
1047
1048 task_lock(task);
1049 if (!task->mm) {
1050 err = -EINVAL;
1051 goto err_task_lock;
1052 }
1053
1031 if (!lock_task_sighand(task, &flags)) { 1054 if (!lock_task_sighand(task, &flags)) {
1032 put_task_struct(task); 1055 err = -ESRCH;
1033 return -ESRCH; 1056 goto err_task_lock;
1034 } 1057 }
1035 1058
1036 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { 1059 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1037 unlock_task_sighand(task, &flags); 1060 err = -EACCES;
1038 put_task_struct(task); 1061 goto err_sighand;
1039 return -EACCES;
1040 } 1062 }
1041 1063
1042 task->signal->oom_adj = oom_adjust; 1064 if (oom_adjust != task->signal->oom_adj) {
1065 if (oom_adjust == OOM_DISABLE)
1066 atomic_inc(&task->mm->oom_disable_count);
1067 if (task->signal->oom_adj == OOM_DISABLE)
1068 atomic_dec(&task->mm->oom_disable_count);
1069 }
1043 1070
1071 /*
1072 * Warn that /proc/pid/oom_adj is deprecated, see
1073 * Documentation/feature-removal-schedule.txt.
1074 */
1075 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, "
1076 "please use /proc/%d/oom_score_adj instead.\n",
1077 current->comm, task_pid_nr(current),
1078 task_pid_nr(task), task_pid_nr(task));
1079 task->signal->oom_adj = oom_adjust;
1080 /*
1081 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
1082 * value is always attainable.
1083 */
1084 if (task->signal->oom_adj == OOM_ADJUST_MAX)
1085 task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
1086 else
1087 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
1088 -OOM_DISABLE;
1089err_sighand:
1044 unlock_task_sighand(task, &flags); 1090 unlock_task_sighand(task, &flags);
1091err_task_lock:
1092 task_unlock(task);
1045 put_task_struct(task); 1093 put_task_struct(task);
1046 1094out:
1047 return count; 1095 return err < 0 ? err : count;
1048} 1096}
1049 1097
1050static const struct file_operations proc_oom_adjust_operations = { 1098static const struct file_operations proc_oom_adjust_operations = {
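
The conversion between the legacy oom_adj range and the new oom_score_adj range is straight integer arithmetic over the two maxima. Taking the constants as assumed from include/linux/oom.h of this era (OOM_DISABLE = -17, OOM_ADJUST_MAX = 15, OOM_SCORE_ADJ_MAX = 1000), a few worked values:

	/* oom_score_adj = oom_adjust * OOM_SCORE_ADJ_MAX / -OOM_DISABLE */
	/*   oom_adj  15 -> special-cased to OOM_SCORE_ADJ_MAX (1000)   */
	/*   oom_adj   8 ->  8 * 1000 / 17 =  470                       */
	/*   oom_adj  -6 -> -6 * 1000 / 17 = -352                       */
	/*   oom_adj -17 -> OOM_DISABLE; also bumps oom_disable_count   */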
@@ -1053,6 +1101,106 @@ static const struct file_operations proc_oom_adjust_operations = {
1053 .llseek = generic_file_llseek, 1101 .llseek = generic_file_llseek,
1054}; 1102};
1055 1103
1104static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
1105 size_t count, loff_t *ppos)
1106{
1107 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
1108 char buffer[PROC_NUMBUF];
1109 int oom_score_adj = OOM_SCORE_ADJ_MIN;
1110 unsigned long flags;
1111 size_t len;
1112
1113 if (!task)
1114 return -ESRCH;
1115 if (lock_task_sighand(task, &flags)) {
1116 oom_score_adj = task->signal->oom_score_adj;
1117 unlock_task_sighand(task, &flags);
1118 }
1119 put_task_struct(task);
1120 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
1121 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1122}
1123
1124static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1125 size_t count, loff_t *ppos)
1126{
1127 struct task_struct *task;
1128 char buffer[PROC_NUMBUF];
1129 unsigned long flags;
1130 long oom_score_adj;
1131 int err;
1132
1133 memset(buffer, 0, sizeof(buffer));
1134 if (count > sizeof(buffer) - 1)
1135 count = sizeof(buffer) - 1;
1136 if (copy_from_user(buffer, buf, count)) {
1137 err = -EFAULT;
1138 goto out;
1139 }
1140
1141 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
1142 if (err)
1143 goto out;
1144 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1145 oom_score_adj > OOM_SCORE_ADJ_MAX) {
1146 err = -EINVAL;
1147 goto out;
1148 }
1149
1150 task = get_proc_task(file->f_path.dentry->d_inode);
1151 if (!task) {
1152 err = -ESRCH;
1153 goto out;
1154 }
1155
1156 task_lock(task);
1157 if (!task->mm) {
1158 err = -EINVAL;
1159 goto err_task_lock;
1160 }
1161
1162 if (!lock_task_sighand(task, &flags)) {
1163 err = -ESRCH;
1164 goto err_task_lock;
1165 }
1166
1167 if (oom_score_adj < task->signal->oom_score_adj &&
1168 !capable(CAP_SYS_RESOURCE)) {
1169 err = -EACCES;
1170 goto err_sighand;
1171 }
1172
1173 if (oom_score_adj != task->signal->oom_score_adj) {
1174 if (oom_score_adj == OOM_SCORE_ADJ_MIN)
1175 atomic_inc(&task->mm->oom_disable_count);
1176 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1177 atomic_dec(&task->mm->oom_disable_count);
1178 }
1179 task->signal->oom_score_adj = oom_score_adj;
1180 /*
1181 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1182 * always attainable.
1183 */
1184 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1185 task->signal->oom_adj = OOM_DISABLE;
1186 else
1187 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
1188 OOM_SCORE_ADJ_MAX;
1189err_sighand:
1190 unlock_task_sighand(task, &flags);
1191err_task_lock:
1192 task_unlock(task);
1193 put_task_struct(task);
1194out:
1195 return err < 0 ? err : count;
1196}
1197
1198static const struct file_operations proc_oom_score_adj_operations = {
1199 .read = oom_score_adj_read,
1200 .write = oom_score_adj_write,
1201 .llseek = default_llseek,
1202};
1203
1056#ifdef CONFIG_AUDITSYSCALL 1204#ifdef CONFIG_AUDITSYSCALL
1057#define TMPBUFLEN 21 1205#define TMPBUFLEN 21
1058static ssize_t proc_loginuid_read(struct file * file, char __user * buf, 1206static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
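
The new file accepts values in [-1000, 1000] and keeps the legacy oom_adj view in sync by running the scaling above in reverse; lowering the value still requires CAP_SYS_RESOURCE. A minimal userspace sketch that exempts the calling process from the OOM killer:

	#include <fcntl.h>
	#include <unistd.h>

	int oom_exempt_self(void)
	{
		int fd = open("/proc/self/oom_score_adj", O_WRONLY);

		if (fd < 0)
			return -1;
		/* -1000 is OOM_SCORE_ADJ_MIN: never select this process */
		if (write(fd, "-1000", 5) != 5) {
			close(fd);
			return -1;
		}
		return close(fd);
	}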
@@ -1426,7 +1574,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1426 if (!tmp) 1574 if (!tmp)
1427 return -ENOMEM; 1575 return -ENOMEM;
1428 1576
1429 pathname = d_path(path, tmp, PAGE_SIZE); 1577 pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
1430 len = PTR_ERR(pathname); 1578 len = PTR_ERR(pathname);
1431 if (IS_ERR(pathname)) 1579 if (IS_ERR(pathname))
1432 goto out; 1580 goto out;
@@ -1500,6 +1648,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1500 1648
1501 /* Common stuff */ 1649 /* Common stuff */
1502 ei = PROC_I(inode); 1650 ei = PROC_I(inode);
1651 inode->i_ino = get_next_ino();
1503 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1652 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1504 inode->i_op = &proc_def_inode_operations; 1653 inode->i_op = &proc_def_inode_operations;
1505 1654
@@ -1939,11 +2088,13 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1939static const struct file_operations proc_fdinfo_file_operations = { 2088static const struct file_operations proc_fdinfo_file_operations = {
1940 .open = nonseekable_open, 2089 .open = nonseekable_open,
1941 .read = proc_fdinfo_read, 2090 .read = proc_fdinfo_read,
2091 .llseek = no_llseek,
1942}; 2092};
1943 2093
1944static const struct file_operations proc_fd_operations = { 2094static const struct file_operations proc_fd_operations = {
1945 .read = generic_read_dir, 2095 .read = generic_read_dir,
1946 .readdir = proc_readfd, 2096 .readdir = proc_readfd,
2097 .llseek = default_llseek,
1947}; 2098};
1948 2099
1949/* 2100/*
@@ -2012,6 +2163,7 @@ static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
2012static const struct file_operations proc_fdinfo_operations = { 2163static const struct file_operations proc_fdinfo_operations = {
2013 .read = generic_read_dir, 2164 .read = generic_read_dir,
2014 .readdir = proc_readfdinfo, 2165 .readdir = proc_readfdinfo,
2166 .llseek = default_llseek,
2015}; 2167};
2016 2168
2017/* 2169/*
@@ -2202,14 +2354,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2202 goto out_free; 2354 goto out_free;
2203 2355
2204 /* Guard against adverse ptrace interaction */ 2356 /* Guard against adverse ptrace interaction */
2205 length = mutex_lock_interruptible(&task->cred_guard_mutex); 2357 length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
2206 if (length < 0) 2358 if (length < 0)
2207 goto out_free; 2359 goto out_free;
2208 2360
2209 length = security_setprocattr(task, 2361 length = security_setprocattr(task,
2210 (char*)file->f_path.dentry->d_name.name, 2362 (char*)file->f_path.dentry->d_name.name,
2211 (void*)page, count); 2363 (void*)page, count);
2212 mutex_unlock(&task->cred_guard_mutex); 2364 mutex_unlock(&task->signal->cred_guard_mutex);
2213out_free: 2365out_free:
2214 free_page((unsigned long) page); 2366 free_page((unsigned long) page);
2215out: 2367out:
@@ -2243,6 +2395,7 @@ static int proc_attr_dir_readdir(struct file * filp,
2243static const struct file_operations proc_attr_dir_operations = { 2395static const struct file_operations proc_attr_dir_operations = {
2244 .read = generic_read_dir, 2396 .read = generic_read_dir,
2245 .readdir = proc_attr_dir_readdir, 2397 .readdir = proc_attr_dir_readdir,
2398 .llseek = default_llseek,
2246}; 2399};
2247 2400
2248static struct dentry *proc_attr_dir_lookup(struct inode *dir, 2401static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@ -2442,6 +2595,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2442 2595
2443 /* Initialize the inode */ 2596 /* Initialize the inode */
2444 ei = PROC_I(inode); 2597 ei = PROC_I(inode);
2598 inode->i_ino = get_next_ino();
2445 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 2599 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2446 2600
2447 /* 2601 /*
@@ -2575,7 +2729,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2575 INF("auxv", S_IRUSR, proc_pid_auxv), 2729 INF("auxv", S_IRUSR, proc_pid_auxv),
2576 ONE("status", S_IRUGO, proc_pid_status), 2730 ONE("status", S_IRUGO, proc_pid_status),
2577 ONE("personality", S_IRUSR, proc_pid_personality), 2731 ONE("personality", S_IRUSR, proc_pid_personality),
2578 INF("limits", S_IRUSR, proc_pid_limits), 2732 INF("limits", S_IRUGO, proc_pid_limits),
2579#ifdef CONFIG_SCHED_DEBUG 2733#ifdef CONFIG_SCHED_DEBUG
2580 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2734 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2581#endif 2735#endif
@@ -2625,6 +2779,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2625#endif 2779#endif
2626 INF("oom_score", S_IRUGO, proc_oom_score), 2780 INF("oom_score", S_IRUGO, proc_oom_score),
2627 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), 2781 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2782 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2628#ifdef CONFIG_AUDITSYSCALL 2783#ifdef CONFIG_AUDITSYSCALL
2629 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2784 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2630 REG("sessionid", S_IRUGO, proc_sessionid_operations), 2785 REG("sessionid", S_IRUGO, proc_sessionid_operations),
@@ -2650,6 +2805,7 @@ static int proc_tgid_base_readdir(struct file * filp,
2650static const struct file_operations proc_tgid_base_operations = { 2805static const struct file_operations proc_tgid_base_operations = {
2651 .read = generic_read_dir, 2806 .read = generic_read_dir,
2652 .readdir = proc_tgid_base_readdir, 2807 .readdir = proc_tgid_base_readdir,
2808 .llseek = default_llseek,
2653}; 2809};
2654 2810
2655static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ 2811static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -2910,7 +3066,7 @@ static const struct pid_entry tid_base_stuff[] = {
2910 INF("auxv", S_IRUSR, proc_pid_auxv), 3066 INF("auxv", S_IRUSR, proc_pid_auxv),
2911 ONE("status", S_IRUGO, proc_pid_status), 3067 ONE("status", S_IRUGO, proc_pid_status),
2912 ONE("personality", S_IRUSR, proc_pid_personality), 3068 ONE("personality", S_IRUSR, proc_pid_personality),
2913 INF("limits", S_IRUSR, proc_pid_limits), 3069 INF("limits", S_IRUGO, proc_pid_limits),
2914#ifdef CONFIG_SCHED_DEBUG 3070#ifdef CONFIG_SCHED_DEBUG
2915 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3071 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2916#endif 3072#endif
@@ -2959,6 +3115,7 @@ static const struct pid_entry tid_base_stuff[] = {
2959#endif 3115#endif
2960 INF("oom_score", S_IRUGO, proc_oom_score), 3116 INF("oom_score", S_IRUGO, proc_oom_score),
2961 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), 3117 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
3118 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2962#ifdef CONFIG_AUDITSYSCALL 3119#ifdef CONFIG_AUDITSYSCALL
2963 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3120 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2964 REG("sessionid", S_IRUSR, proc_sessionid_operations), 3121 REG("sessionid", S_IRUSR, proc_sessionid_operations),
@@ -2986,6 +3143,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
2986static const struct file_operations proc_tid_base_operations = { 3143static const struct file_operations proc_tid_base_operations = {
2987 .read = generic_read_dir, 3144 .read = generic_read_dir,
2988 .readdir = proc_tid_base_readdir, 3145 .readdir = proc_tid_base_readdir,
3146 .llseek = default_llseek,
2989}; 3147};
2990 3148
2991static const struct inode_operations proc_tid_base_inode_operations = { 3149static const struct inode_operations proc_tid_base_inode_operations = {
@@ -3222,4 +3380,5 @@ static const struct inode_operations proc_task_inode_operations = {
3222static const struct file_operations proc_task_operations = { 3380static const struct file_operations proc_task_operations = {
3223 .read = generic_read_dir, 3381 .read = generic_read_dir,
3224 .readdir = proc_task_readdir, 3382 .readdir = proc_task_readdir,
3383 .llseek = default_llseek,
3225}; 3384};
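
The scattered .llseek = default_llseek / no_llseek additions in this file (and in proc_sysctl.c, root.c, task_mmu.c and vmcore.c below) come from the BKL-removal work: file_operations now state their seek behaviour explicitly rather than inheriting an implicit default. The shape for a readdir-backed directory, with example_readdir hypothetical:

	static const struct file_operations example_dir_operations = {
		.read	 = generic_read_dir,
		.readdir = example_readdir,	/* hypothetical */
		.llseek	 = default_llseek,	/* explicit, was implicit */
	};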
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 2791907744e..dd29f033766 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -12,6 +12,7 @@
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/mm.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17#include <linux/mount.h> 18#include <linux/mount.h>
@@ -258,17 +259,22 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
258 259
259 error = inode_change_ok(inode, iattr); 260 error = inode_change_ok(inode, iattr);
260 if (error) 261 if (error)
261 goto out; 262 return error;
262 263
263 error = inode_setattr(inode, iattr); 264 if ((iattr->ia_valid & ATTR_SIZE) &&
264 if (error) 265 iattr->ia_size != i_size_read(inode)) {
265 goto out; 266 error = vmtruncate(inode, iattr->ia_size);
267 if (error)
268 return error;
269 }
270
271 setattr_copy(inode, iattr);
272 mark_inode_dirty(inode);
266 273
267 de->uid = inode->i_uid; 274 de->uid = inode->i_uid;
268 de->gid = inode->i_gid; 275 de->gid = inode->i_gid;
269 de->mode = inode->i_mode; 276 de->mode = inode->i_mode;
270out: 277 return 0;
271 return error;
272} 278}
273 279
274static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, 280static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index aea8502e58a..9c2b5f48487 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -25,11 +25,12 @@
25 25
26#include "internal.h" 26#include "internal.h"
27 27
28static void proc_delete_inode(struct inode *inode) 28static void proc_evict_inode(struct inode *inode)
29{ 29{
30 struct proc_dir_entry *de; 30 struct proc_dir_entry *de;
31 31
32 truncate_inode_pages(&inode->i_data, 0); 32 truncate_inode_pages(&inode->i_data, 0);
33 end_writeback(inode);
33 34
34 /* Stop tracking associated processes */ 35 /* Stop tracking associated processes */
35 put_pid(PROC_I(inode)->pid); 36 put_pid(PROC_I(inode)->pid);
@@ -40,7 +41,6 @@ static void proc_delete_inode(struct inode *inode)
40 pde_put(de); 41 pde_put(de);
41 if (PROC_I(inode)->sysctl) 42 if (PROC_I(inode)->sysctl)
42 sysctl_head_put(PROC_I(inode)->sysctl); 43 sysctl_head_put(PROC_I(inode)->sysctl);
43 clear_inode(inode);
44} 44}
45 45
46struct vfsmount *proc_mnt; 46struct vfsmount *proc_mnt;
@@ -91,7 +91,7 @@ static const struct super_operations proc_sops = {
91 .alloc_inode = proc_alloc_inode, 91 .alloc_inode = proc_alloc_inode,
92 .destroy_inode = proc_destroy_inode, 92 .destroy_inode = proc_destroy_inode,
93 .drop_inode = generic_delete_inode, 93 .drop_inode = generic_delete_inode,
94 .delete_inode = proc_delete_inode, 94 .evict_inode = proc_evict_inode,
95 .statfs = simple_statfs, 95 .statfs = simple_statfs,
96}; 96};
97 97
@@ -214,8 +214,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
214{ 214{
215 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); 215 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
216 long rv = -ENOTTY; 216 long rv = -ENOTTY;
217 long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long); 217 long (*ioctl)(struct file *, unsigned int, unsigned long);
218 int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
219 218
220 spin_lock(&pde->pde_unload_lock); 219 spin_lock(&pde->pde_unload_lock);
221 if (!pde->proc_fops) { 220 if (!pde->proc_fops) {
@@ -223,19 +222,11 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
223 return rv; 222 return rv;
224 } 223 }
225 pde->pde_users++; 224 pde->pde_users++;
226 unlocked_ioctl = pde->proc_fops->unlocked_ioctl; 225 ioctl = pde->proc_fops->unlocked_ioctl;
227 ioctl = pde->proc_fops->ioctl;
228 spin_unlock(&pde->pde_unload_lock); 226 spin_unlock(&pde->pde_unload_lock);
229 227
230 if (unlocked_ioctl) { 228 if (ioctl)
231 rv = unlocked_ioctl(file, cmd, arg); 229 rv = ioctl(file, cmd, arg);
232 if (rv == -ENOIOCTLCMD)
233 rv = -EINVAL;
234 } else if (ioctl) {
235 WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
236 "%pf will be called without the BKL held\n", ioctl);
237 rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
238 }
239 230
240 pde_users_dec(pde); 231 pde_users_dec(pde);
241 return rv; 232 return rv;
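
Two cleanups land in proc/inode.c: the ->delete_inode method becomes ->evict_inode, where truncate_inode_pages() plus end_writeback() replace the final clear_inode(), and proc_reg_unlocked_ioctl() stops falling back to the old BKL-era ->ioctl method, consulting only ->unlocked_ioctl. A conversion sketch for a simple virtual filesystem:

	static void examplefs_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);
		end_writeback(inode);	/* replaces clear_inode() */
		/* ... drop filesystem-private references ... */
	}

	static const struct super_operations examplefs_sops = {
		.evict_inode	= examplefs_evict_inode,
		/* other methods unchanged */
	};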
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 180cf5a0bd6..3b8b4566033 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -146,7 +146,7 @@ u64 stable_page_flags(struct page *page)
146 u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); 146 u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
147#endif 147#endif
148 148
149#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR 149#ifdef CONFIG_ARCH_USES_PG_UNCACHED
150 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); 150 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
151#endif 151#endif
152 152
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 6ff9981f0a1..b652cb00906 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
23 if (!inode) 23 if (!inode)
24 goto out; 24 goto out;
25 25
26 inode->i_ino = get_next_ino();
27
26 sysctl_head_get(head); 28 sysctl_head_get(head);
27 ei = PROC_I(inode); 29 ei = PROC_I(inode);
28 ei->sysctl = head; 30 ei->sysctl = head;
@@ -329,10 +331,19 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
329 return -EPERM; 331 return -EPERM;
330 332
331 error = inode_change_ok(inode, attr); 333 error = inode_change_ok(inode, attr);
332 if (!error) 334 if (error)
333 error = inode_setattr(inode, attr); 335 return error;
336
337 if ((attr->ia_valid & ATTR_SIZE) &&
338 attr->ia_size != i_size_read(inode)) {
339 error = vmtruncate(inode, attr->ia_size);
340 if (error)
341 return error;
342 }
334 343
335 return error; 344 setattr_copy(inode, attr);
345 mark_inode_dirty(inode);
346 return 0;
336} 347}
337 348
338static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 349static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -355,6 +366,7 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
355static const struct file_operations proc_sys_file_operations = { 366static const struct file_operations proc_sys_file_operations = {
356 .read = proc_sys_read, 367 .read = proc_sys_read,
357 .write = proc_sys_write, 368 .write = proc_sys_write,
369 .llseek = default_llseek,
358}; 370};
359 371
360static const struct file_operations proc_sys_dir_file_operations = { 372static const struct file_operations proc_sys_dir_file_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384ed22..ef9fa8e24ad 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data)
35 return set_anon_super(sb, NULL); 35 return set_anon_super(sb, NULL);
36} 36}
37 37
38static int proc_get_sb(struct file_system_type *fs_type, 38static struct dentry *proc_mount(struct file_system_type *fs_type,
39 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 39 int flags, const char *dev_name, void *data)
40{ 40{
41 int err; 41 int err;
42 struct super_block *sb; 42 struct super_block *sb;
@@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type,
61 61
62 sb = sget(fs_type, proc_test_super, proc_set_super, ns); 62 sb = sget(fs_type, proc_test_super, proc_set_super, ns);
63 if (IS_ERR(sb)) 63 if (IS_ERR(sb))
64 return PTR_ERR(sb); 64 return ERR_CAST(sb);
65 65
66 if (!sb->s_root) { 66 if (!sb->s_root) {
67 sb->s_flags = flags; 67 sb->s_flags = flags;
68 err = proc_fill_super(sb); 68 err = proc_fill_super(sb);
69 if (err) { 69 if (err) {
70 deactivate_locked_super(sb); 70 deactivate_locked_super(sb);
71 return err; 71 return ERR_PTR(err);
72 } 72 }
73 73
74 ei = PROC_I(sb->s_root->d_inode); 74 ei = PROC_I(sb->s_root->d_inode);
@@ -79,11 +79,9 @@ static int proc_get_sb(struct file_system_type *fs_type,
79 } 79 }
80 80
81 sb->s_flags |= MS_ACTIVE; 81 sb->s_flags |= MS_ACTIVE;
82 ns->proc_mnt = mnt;
83 } 82 }
84 83
85 simple_set_mnt(mnt, sb); 84 return dget(sb->s_root);
86 return 0;
87} 85}
88 86
89static void proc_kill_sb(struct super_block *sb) 87static void proc_kill_sb(struct super_block *sb)
@@ -97,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb)
97 95
98static struct file_system_type proc_fs_type = { 96static struct file_system_type proc_fs_type = {
99 .name = "proc", 97 .name = "proc",
100 .get_sb = proc_get_sb, 98 .mount = proc_mount,
101 .kill_sb = proc_kill_sb, 99 .kill_sb = proc_kill_sb,
102}; 100};
103 101
@@ -115,6 +113,7 @@ void __init proc_root_init(void)
115 return; 113 return;
116 } 114 }
117 115
116 init_pid_ns.proc_mnt = proc_mnt;
118 proc_symlink("mounts", NULL, "self/mounts"); 117 proc_symlink("mounts", NULL, "self/mounts");
119 118
120 proc_net_init(); 119 proc_net_init();
@@ -179,6 +178,7 @@ static int proc_root_readdir(struct file * filp,
179static const struct file_operations proc_root_operations = { 178static const struct file_operations proc_root_operations = {
180 .read = generic_read_dir, 179 .read = generic_read_dir,
181 .readdir = proc_root_readdir, 180 .readdir = proc_root_readdir,
181 .llseek = default_llseek,
182}; 182};
183 183
184/* 184/*
@@ -212,6 +212,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
212 if (IS_ERR(mnt)) 212 if (IS_ERR(mnt))
213 return PTR_ERR(mnt); 213 return PTR_ERR(mnt);
214 214
215 ns->proc_mnt = mnt;
215 return 0; 216 return 0;
216} 217}
217 218
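
proc's conversion is the sget()-based flavour of ->mount: errors come back as ERR_PTR()/ERR_CAST() and success returns dget(sb->s_root). Since the callback no longer sees a struct vfsmount, stashing it in the pid namespace moves out to the callers, proc_root_init() and pid_ns_prepare_proc(). The generic skeleton, with the test/set/fill helpers hypothetical:

	static struct dentry *examplefs_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
	{
		struct super_block *sb;
		int err;

		sb = sget(fs_type, examplefs_test_super, examplefs_set_super,
			  data);
		if (IS_ERR(sb))
			return ERR_CAST(sb);

		if (!sb->s_root) {
			sb->s_flags = flags;
			err = examplefs_fill_super(sb);
			if (err) {
				deactivate_locked_super(sb);
				return ERR_PTR(err);
			}
			sb->s_flags |= MS_ACTIVE;
		}
		return dget(sb->s_root);
	}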
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f1..37994737c98 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,13 +10,13 @@ static int show_softirqs(struct seq_file *p, void *v)
10{ 10{
11 int i, j; 11 int i, j;
12 12
 13 seq_printf(p, "                "); 13 seq_printf(p, "                    ");
14 for_each_possible_cpu(i) 14 for_each_possible_cpu(i)
15 seq_printf(p, "CPU%-8d", i); 15 seq_printf(p, "CPU%-8d", i);
16 seq_printf(p, "\n"); 16 seq_printf(p, "\n");
17 17
18 for (i = 0; i < NR_SOFTIRQS; i++) { 18 for (i = 0; i < NR_SOFTIRQS; i++) {
19 seq_printf(p, "%8s:", softirq_to_name[i]); 19 seq_printf(p, "%12s:", softirq_to_name[i]);
20 for_each_possible_cpu(j) 20 for_each_possible_cpu(j)
21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); 21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
22 seq_printf(p, "\n"); 22 seq_printf(p, "\n");
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf31b03fc27..e15a19c93ba 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v)
31 u64 sum_softirq = 0; 31 u64 sum_softirq = 0;
32 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; 32 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
33 struct timespec boottime; 33 struct timespec boottime;
34 unsigned int per_irq_sum;
35 34
36 user = nice = system = idle = iowait = 35 user = nice = system = idle = iowait =
37 irq = softirq = steal = cputime64_zero; 36 irq = softirq = steal = cputime64_zero;
@@ -52,9 +51,7 @@ static int show_stat(struct seq_file *p, void *v)
52 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 51 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
53 guest_nice = cputime64_add(guest_nice, 52 guest_nice = cputime64_add(guest_nice,
54 kstat_cpu(i).cpustat.guest_nice); 53 kstat_cpu(i).cpustat.guest_nice);
55 for_each_irq_nr(j) { 54 sum += kstat_cpu_irqs_sum(i);
56 sum += kstat_irqs_cpu(j, i);
57 }
58 sum += arch_irq_stat_cpu(i); 55 sum += arch_irq_stat_cpu(i);
59 56
60 for (j = 0; j < NR_SOFTIRQS; j++) { 57 for (j = 0; j < NR_SOFTIRQS; j++) {
@@ -110,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v)
110 seq_printf(p, "intr %llu", (unsigned long long)sum); 107 seq_printf(p, "intr %llu", (unsigned long long)sum);
111 108
112 /* sum again ? it could be updated? */ 109 /* sum again ? it could be updated? */
113 for_each_irq_nr(j) { 110 for_each_irq_nr(j)
114 per_irq_sum = 0; 111 seq_printf(p, " %u", kstat_irqs(j));
115 for_each_possible_cpu(i)
116 per_irq_sum += kstat_irqs_cpu(j, i);
117
118 seq_printf(p, " %u", per_irq_sum);
119 }
120 112
121 seq_printf(p, 113 seq_printf(p,
122 "\nctxt %llu\n" 114 "\nctxt %llu\n"
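
Both open-coded per-CPU walks in show_stat() are replaced by accessors assumed to be introduced alongside this change, removing the O(nr_irqs * nr_cpus) loops from the /proc/stat path:

	/* per-CPU total over all interrupt lines ("cpu" lines) */
	sum += kstat_cpu_irqs_sum(i);

	/* per-line total over all CPUs ("intr" line) */
	for_each_irq_nr(j)
		seq_printf(p, " %u", kstat_irqs(j));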
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index aea1d3f1ffb..da6b01d70f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,6 +210,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
210 int flags = vma->vm_flags; 210 int flags = vma->vm_flags;
211 unsigned long ino = 0; 211 unsigned long ino = 0;
212 unsigned long long pgoff = 0; 212 unsigned long long pgoff = 0;
213 unsigned long start;
213 dev_t dev = 0; 214 dev_t dev = 0;
214 int len; 215 int len;
215 216
@@ -220,8 +221,14 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
220 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; 221 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
221 } 222 }
222 223
224 /* We don't show the stack guard page in /proc/maps */
225 start = vma->vm_start;
226 if (vma->vm_flags & VM_GROWSDOWN)
227 if (!vma_stack_continue(vma->vm_prev, vma->vm_start))
228 start += PAGE_SIZE;
229
223 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 230 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
224 vma->vm_start, 231 start,
225 vma->vm_end, 232 vma->vm_end,
226 flags & VM_READ ? 'r' : '-', 233 flags & VM_READ ? 'r' : '-',
227 flags & VM_WRITE ? 'w' : '-', 234 flags & VM_WRITE ? 'w' : '-',
@@ -320,6 +327,7 @@ struct mem_size_stats {
320 unsigned long private_clean; 327 unsigned long private_clean;
321 unsigned long private_dirty; 328 unsigned long private_dirty;
322 unsigned long referenced; 329 unsigned long referenced;
330 unsigned long anonymous;
323 unsigned long swap; 331 unsigned long swap;
324 u64 pss; 332 u64 pss;
325}; 333};
@@ -350,19 +358,22 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
350 if (!page) 358 if (!page)
351 continue; 359 continue;
352 360
361 if (PageAnon(page))
362 mss->anonymous += PAGE_SIZE;
363
353 mss->resident += PAGE_SIZE; 364 mss->resident += PAGE_SIZE;
354 /* Accumulate the size in pages that have been accessed. */ 365 /* Accumulate the size in pages that have been accessed. */
355 if (pte_young(ptent) || PageReferenced(page)) 366 if (pte_young(ptent) || PageReferenced(page))
356 mss->referenced += PAGE_SIZE; 367 mss->referenced += PAGE_SIZE;
357 mapcount = page_mapcount(page); 368 mapcount = page_mapcount(page);
358 if (mapcount >= 2) { 369 if (mapcount >= 2) {
359 if (pte_dirty(ptent)) 370 if (pte_dirty(ptent) || PageDirty(page))
360 mss->shared_dirty += PAGE_SIZE; 371 mss->shared_dirty += PAGE_SIZE;
361 else 372 else
362 mss->shared_clean += PAGE_SIZE; 373 mss->shared_clean += PAGE_SIZE;
363 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; 374 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
364 } else { 375 } else {
365 if (pte_dirty(ptent)) 376 if (pte_dirty(ptent) || PageDirty(page))
366 mss->private_dirty += PAGE_SIZE; 377 mss->private_dirty += PAGE_SIZE;
367 else 378 else
368 mss->private_clean += PAGE_SIZE; 379 mss->private_clean += PAGE_SIZE;
@@ -403,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v)
403 "Private_Clean: %8lu kB\n" 414 "Private_Clean: %8lu kB\n"
404 "Private_Dirty: %8lu kB\n" 415 "Private_Dirty: %8lu kB\n"
405 "Referenced: %8lu kB\n" 416 "Referenced: %8lu kB\n"
417 "Anonymous: %8lu kB\n"
406 "Swap: %8lu kB\n" 418 "Swap: %8lu kB\n"
407 "KernelPageSize: %8lu kB\n" 419 "KernelPageSize: %8lu kB\n"
408 "MMUPageSize: %8lu kB\n", 420 "MMUPageSize: %8lu kB\n",
@@ -414,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v)
414 mss.private_clean >> 10, 426 mss.private_clean >> 10,
415 mss.private_dirty >> 10, 427 mss.private_dirty >> 10,
416 mss.referenced >> 10, 428 mss.referenced >> 10,
429 mss.anonymous >> 10,
417 mss.swap >> 10, 430 mss.swap >> 10,
418 vma_kernel_pagesize(vma) >> 10, 431 vma_kernel_pagesize(vma) >> 10,
419 vma_mmu_pagesize(vma) >> 10); 432 vma_mmu_pagesize(vma) >> 10);
@@ -532,6 +545,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
532 545
533const struct file_operations proc_clear_refs_operations = { 546const struct file_operations proc_clear_refs_operations = {
534 .write = clear_refs_write, 547 .write = clear_refs_write,
548 .llseek = noop_llseek,
535}; 549};
536 550
537struct pagemapread { 551struct pagemapread {
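
Three user-visible tweaks in task_mmu.c: /proc/<pid>/maps hides the stack guard page (the start of a VM_GROWSDOWN vma is bumped by PAGE_SIZE when the stack does not continue into the previous vma), smaps counts a page as dirty when either the pte or the struct page is dirty, and smaps gains an "Anonymous:" field fed by PageAnon(). An illustrative smaps excerpt with the new field (values invented):

	Shared_Dirty:          0 kB
	Private_Clean:        24 kB
	Private_Dirty:        12 kB
	Referenced:          120 kB
	Anonymous:            12 kB
	Swap:                  0 kB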
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 91c817ff02c..2367fb3f70b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
163 163
164static const struct file_operations proc_vmcore_operations = { 164static const struct file_operations proc_vmcore_operations = {
165 .read = read_vmcore, 165 .read = read_vmcore,
166 .llseek = generic_file_llseek, 166 .llseek = default_llseek,
167}; 167};
168 168
169static struct vmcore* __init get_new_element(void) 169static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6e8fc62b40a..7b0329468a5 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. 11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
12 */ 12 */
13 13
14#include <linux/smp_lock.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include "qnx4.h" 15#include "qnx4.h"
17 16
@@ -29,8 +28,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
29 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); 28 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
30 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); 29 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
31 30
32 lock_kernel();
33
34 while (filp->f_pos < inode->i_size) { 31 while (filp->f_pos < inode->i_size) {
35 blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); 32 blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
36 bh = sb_bread(inode->i_sb, blknum); 33 bh = sb_bread(inode->i_sb, blknum);
@@ -71,7 +68,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
71 brelse(bh); 68 brelse(bh);
72 } 69 }
73out: 70out:
74 unlock_kernel();
75 return 0; 71 return 0;
76} 72}
77 73
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 277575ddc05..fcada42f1aa 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -16,7 +16,6 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/highuid.h> 18#include <linux/highuid.h>
19#include <linux/smp_lock.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -157,8 +156,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
157 struct super_block *sb = dentry->d_sb; 156 struct super_block *sb = dentry->d_sb;
158 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 157 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
159 158
160 lock_kernel();
161
162 buf->f_type = sb->s_magic; 159 buf->f_type = sb->s_magic;
163 buf->f_bsize = sb->s_blocksize; 160 buf->f_bsize = sb->s_blocksize;
164 buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8; 161 buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8;
@@ -168,8 +165,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
168 buf->f_fsid.val[0] = (u32)id; 165 buf->f_fsid.val[0] = (u32)id;
169 buf->f_fsid.val[1] = (u32)(id >> 32); 166 buf->f_fsid.val[1] = (u32)(id >> 32);
170 167
171 unlock_kernel();
172
173 return 0; 168 return 0;
174} 169}
175 170
@@ -283,7 +278,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
283 goto outi; 278 goto outi;
284 279
285 brelse(bh); 280 brelse(bh);
286
287 return 0; 281 return 0;
288 282
289 outi: 283 outi:
@@ -320,10 +314,19 @@ static int qnx4_write_begin(struct file *file, struct address_space *mapping,
320 struct page **pagep, void **fsdata) 314 struct page **pagep, void **fsdata)
321{ 315{
322 struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host); 316 struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host);
317 int ret;
318
323 *pagep = NULL; 319 *pagep = NULL;
324 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 320 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
325 qnx4_get_block, 321 qnx4_get_block,
326 &qnx4_inode->mmu_private); 322 &qnx4_inode->mmu_private);
323 if (unlikely(ret)) {
324 loff_t isize = mapping->host->i_size;
325 if (pos + len > isize)
326 vmtruncate(mapping->host, isize);
327 }
328
329 return ret;
327} 330}
328static sector_t qnx4_bmap(struct address_space *mapping, sector_t block) 331static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
329{ 332{
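
The qnx4_write_begin() change follows a rule applied across this series: a failing ->write_begin must not leave blocks instantiated beyond i_size, so on error the file is trimmed back. The pattern, with the get_block callback and the private size field as placeholders:

	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
			       examplefs_get_block, &mmu_private);
	if (unlikely(ret)) {
		loff_t isize = mapping->host->i_size;

		if (pos + len > isize)
			vmtruncate(mapping->host, isize);
	}
	return ret;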
@@ -451,17 +454,16 @@ static void destroy_inodecache(void)
451 kmem_cache_destroy(qnx4_inode_cachep); 454 kmem_cache_destroy(qnx4_inode_cachep);
452} 455}
453 456
454static int qnx4_get_sb(struct file_system_type *fs_type, 457static struct dentry *qnx4_mount(struct file_system_type *fs_type,
455 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 458 int flags, const char *dev_name, void *data)
456{ 459{
457 return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super, 460 return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
458 mnt);
459} 461}
460 462
461static struct file_system_type qnx4_fs_type = { 463static struct file_system_type qnx4_fs_type = {
462 .owner = THIS_MODULE, 464 .owner = THIS_MODULE,
463 .name = "qnx4", 465 .name = "qnx4",
464 .get_sb = qnx4_get_sb, 466 .mount = qnx4_mount,
465 .kill_sb = kill_block_super, 467 .kill_sb = kill_block_super,
466 .fs_flags = FS_REQUIRES_DEV, 468 .fs_flags = FS_REQUIRES_DEV,
467}; 469};
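
qnx4 gets the block-device flavour of the same ->mount conversion seen at pipefs and procfs above: mount_bdev() replaces get_sb_bdev(), taking the fill_super callback and returning the root dentry. Generic shape, with examplefs_fill_super hypothetical:

	static struct dentry *examplefs_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
	{
		return mount_bdev(fs_type, flags, dev_name, data,
				  examplefs_fill_super);
	}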
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 58703ebba87..275327b5615 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink. 12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
13 */ 13 */
14 14
15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include "qnx4.h" 16#include "qnx4.h"
18 17
@@ -109,7 +108,6 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
109 int len = dentry->d_name.len; 108 int len = dentry->d_name.len;
110 struct inode *foundinode = NULL; 109 struct inode *foundinode = NULL;
111 110
112 lock_kernel();
113 if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino))) 111 if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino)))
114 goto out; 112 goto out;
115 /* The entry is linked, let's get the real info */ 113 /* The entry is linked, let's get the real info */
@@ -123,13 +121,11 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
123 121
124 foundinode = qnx4_iget(dir->i_sb, ino); 122 foundinode = qnx4_iget(dir->i_sb, ino);
125 if (IS_ERR(foundinode)) { 123 if (IS_ERR(foundinode)) {
126 unlock_kernel();
127 QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n", 124 QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
128 PTR_ERR(foundinode))); 125 PTR_ERR(foundinode)));
129 return ERR_CAST(foundinode); 126 return ERR_CAST(foundinode);
130 } 127 }
131out: 128out:
132 unlock_kernel();
133 d_add(dentry, foundinode); 129 d_add(dentry, foundinode);
134 130
135 return NULL; 131 return NULL;
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 3e21b1e2ad3..880fd988436 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -4,6 +4,7 @@
4 4
5config QUOTA 5config QUOTA
6 bool "Quota support" 6 bool "Quota support"
7 select QUOTACTL
7 help 8 help
8 If you say Y here, you will be able to set per user limits for disk 9 If you say Y here, you will be able to set per user limits for disk
9 usage (also called disk quotas). Currently, it works for the 10 usage (also called disk quotas). Currently, it works for the
@@ -65,8 +66,7 @@ config QFMT_V2
65 66
66config QUOTACTL 67config QUOTACTL
67 bool 68 bool
68 depends on XFS_QUOTA || QUOTA 69 default n
69 default y
70 70
71config QUOTACTL_COMPAT 71config QUOTACTL_COMPAT
72 bool 72 bool
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 437d2ca2de9..0fed41e6efc 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -132,6 +132,22 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); 132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
134 134
135void __quota_error(struct super_block *sb, const char *func,
136 const char *fmt, ...)
137{
138 va_list args;
139
140 if (printk_ratelimit()) {
141 va_start(args, fmt);
142 printk(KERN_ERR "Quota error (device %s): %s: ",
143 sb->s_id, func);
144 vprintk(fmt, args);
145 printk("\n");
146 va_end(args);
147 }
148}
149EXPORT_SYMBOL(__quota_error);
150
135#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) 151#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
136static char *quotatypes[] = INITQFNAMES; 152static char *quotatypes[] = INITQFNAMES;
137#endif 153#endif
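
__quota_error() centralizes the ratelimited "Quota error" reporting used by the conversions in the rest of this diff. The quota_error() calls below pass neither sb->s_id nor the function name themselves, so they are assumed to go through a wrapper macro along these lines:

	/* Assumed wrapper, supplying the caller's name automatically: */
	#define quota_error(sb, fmt, args...) \
		__quota_error((sb), __func__, fmt , ## args)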
@@ -705,11 +721,8 @@ void dqput(struct dquot *dquot)
705 return; 721 return;
706#ifdef CONFIG_QUOTA_DEBUG 722#ifdef CONFIG_QUOTA_DEBUG
707 if (!atomic_read(&dquot->dq_count)) { 723 if (!atomic_read(&dquot->dq_count)) {
708 printk("VFS: dqput: trying to free free dquot\n"); 724 quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
709 printk("VFS: device %s, dquot of %s %d\n", 725 quotatypes[dquot->dq_type], dquot->dq_id);
710 dquot->dq_sb->s_id,
711 quotatypes[dquot->dq_type],
712 dquot->dq_id);
713 BUG(); 726 BUG();
714 } 727 }
715#endif 728#endif
@@ -732,9 +745,9 @@ we_slept:
732 /* Commit dquot before releasing */ 745 /* Commit dquot before releasing */
733 ret = dquot->dq_sb->dq_op->write_dquot(dquot); 746 ret = dquot->dq_sb->dq_op->write_dquot(dquot);
734 if (ret < 0) { 747 if (ret < 0) {
735 printk(KERN_ERR "VFS: cannot write quota structure on " 748 quota_error(dquot->dq_sb, "Can't write quota structure"
736 "device %s (error %d). Quota may get out of " 749 " (error %d). Quota may get out of sync!",
737 "sync!\n", dquot->dq_sb->s_id, ret); 750 ret);
738 /* 751 /*
739 * We clear dirty bit anyway, so that we avoid 752 * We clear dirty bit anyway, so that we avoid
740 * infinite loop here 753 * infinite loop here
@@ -885,7 +898,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
885 898
886 spin_lock(&inode_lock); 899 spin_lock(&inode_lock);
887 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 900 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
888 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 901 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
889 continue; 902 continue;
890#ifdef CONFIG_QUOTA_DEBUG 903#ifdef CONFIG_QUOTA_DEBUG
891 if (unlikely(inode_get_rsv_space(inode) > 0)) 904 if (unlikely(inode_get_rsv_space(inode) > 0))
@@ -914,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type)
914 927
915#ifdef CONFIG_QUOTA_DEBUG 928#ifdef CONFIG_QUOTA_DEBUG
916 if (reserved) { 929 if (reserved) {
917 printk(KERN_WARNING "VFS (%s): Writes happened before quota" 930 quota_error(sb, "Writes happened before quota was turned on "
918 " was turned on thus quota information is probably " 931 "thus quota information is probably inconsistent. "
919 "inconsistent. Please run quotacheck(8).\n", sb->s_id); 932 "Please run quotacheck(8)");
920 } 933 }
921#endif 934#endif
922} 935}
@@ -947,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
947 if (dqput_blocks(dquot)) { 960 if (dqput_blocks(dquot)) {
948#ifdef CONFIG_QUOTA_DEBUG 961#ifdef CONFIG_QUOTA_DEBUG
949 if (atomic_read(&dquot->dq_count) != 1) 962 if (atomic_read(&dquot->dq_count) != 1)
950 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); 963 quota_error(inode->i_sb, "Adding dquot with "
964 "dq_count %d to dispose list",
965 atomic_read(&dquot->dq_count));
951#endif 966#endif
952 spin_lock(&dq_list_lock); 967 spin_lock(&dq_list_lock);
953 /* As dquot must have currently users it can't be on 968 /* As dquot must have currently users it can't be on
@@ -986,6 +1001,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
986 struct list_head *tofree_head) 1001 struct list_head *tofree_head)
987{ 1002{
988 struct inode *inode; 1003 struct inode *inode;
1004 int reserved = 0;
989 1005
990 spin_lock(&inode_lock); 1006 spin_lock(&inode_lock);
991 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1007 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -995,10 +1011,20 @@ static void remove_dquot_ref(struct super_block *sb, int type,
995 * only quota pointers and these have separate locking 1011 * only quota pointers and these have separate locking
996 * (dqptr_sem). 1012 * (dqptr_sem).
997 */ 1013 */
998 if (!IS_NOQUOTA(inode)) 1014 if (!IS_NOQUOTA(inode)) {
1015 if (unlikely(inode_get_rsv_space(inode) > 0))
1016 reserved = 1;
999 remove_inode_dquot_ref(inode, type, tofree_head); 1017 remove_inode_dquot_ref(inode, type, tofree_head);
1018 }
1000 } 1019 }
1001 spin_unlock(&inode_lock); 1020 spin_unlock(&inode_lock);
1021#ifdef CONFIG_QUOTA_DEBUG
1022 if (reserved) {
1023 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
1024 " was disabled thus quota information is probably "
1025 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
1026 }
1027#endif
1002} 1028}
1003 1029
1004/* Gather all references from inodes and drop them */ 1030/* Gather all references from inodes and drop them */
@@ -1304,6 +1330,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
1304 return QUOTA_NL_NOWARN; 1330 return QUOTA_NL_NOWARN;
1305} 1331}
1306 1332
1333static int dquot_active(const struct inode *inode)
1334{
1335 struct super_block *sb = inode->i_sb;
1336
1337 if (IS_NOQUOTA(inode))
1338 return 0;
1339 return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
1340}
1341
1307/* 1342/*
1308 * Initialize quota pointers in inode 1343 * Initialize quota pointers in inode
1309 * 1344 *
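
dquot_active() folds the repeated "!sb_any_quota_active(sb) || IS_NOQUOTA(inode)" test into one helper. The bitwise form works because sb_any_quota_loaded() and sb_any_quota_suspended() both return per-quota-type bitmasks, so loaded & ~suspended is nonzero exactly when at least one type is loaded and not suspended. Worked example:

	/* USRQUOTA loaded but suspended, GRPQUOTA loaded and live:   */
	/*   loaded    = 0b11                                         */
	/*   suspended = 0b01                                         */
	/*   loaded & ~suspended = 0b10 -> active (via GRPQUOTA)      */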
@@ -1323,7 +1358,7 @@ static void __dquot_initialize(struct inode *inode, int type)
1323 1358
1324 /* First test before acquiring mutex - solves deadlocks when we 1359 /* First test before acquiring mutex - solves deadlocks when we
1325 * re-enter the quota code and are already holding the mutex */ 1360 * re-enter the quota code and are already holding the mutex */
1326 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1361 if (!dquot_active(inode))
1327 return; 1362 return;
1328 1363
1329 /* First get references to structures we might need. */ 1364 /* First get references to structures we might need. */
@@ -1351,6 +1386,9 @@ static void __dquot_initialize(struct inode *inode, int type)
1351 /* Avoid races with quotaoff() */ 1386 /* Avoid races with quotaoff() */
1352 if (!sb_has_quota_active(sb, cnt)) 1387 if (!sb_has_quota_active(sb, cnt))
1353 continue; 1388 continue;
1389 /* We could race with quotaon or dqget() could have failed */
1390 if (!got[cnt])
1391 continue;
1354 if (!inode->i_dquot[cnt]) { 1392 if (!inode->i_dquot[cnt]) {
1355 inode->i_dquot[cnt] = got[cnt]; 1393 inode->i_dquot[cnt] = got[cnt];
1356 got[cnt] = NULL; 1394 got[cnt] = NULL;
@@ -1507,7 +1545,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1507 * First test before acquiring mutex - solves deadlocks when we 1545 * First test before acquiring mutex - solves deadlocks when we
1508 * re-enter the quota code and are already holding the mutex 1546 * re-enter the quota code and are already holding the mutex
1509 */ 1547 */
1510 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1548 if (!dquot_active(inode)) {
1511 inode_incr_space(inode, number, reserve); 1549 inode_incr_space(inode, number, reserve);
1512 goto out; 1550 goto out;
1513 } 1551 }
@@ -1559,7 +1597,7 @@ int dquot_alloc_inode(const struct inode *inode)
1559 1597
1560 /* First test before acquiring mutex - solves deadlocks when we 1598 /* First test before acquiring mutex - solves deadlocks when we
1561 * re-enter the quota code and are already holding the mutex */ 1599 * re-enter the quota code and are already holding the mutex */
1562 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1600 if (!dquot_active(inode))
1563 return 0; 1601 return 0;
1564 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1602 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1565 warntype[cnt] = QUOTA_NL_NOWARN; 1603 warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1596,7 +1634,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1596{ 1634{
1597 int cnt; 1635 int cnt;
1598 1636
1599 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1637 if (!dquot_active(inode)) {
1600 inode_claim_rsv_space(inode, number); 1638 inode_claim_rsv_space(inode, number);
1601 return 0; 1639 return 0;
1602 } 1640 }
@@ -1629,7 +1667,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1629 1667
1630 /* First test before acquiring mutex - solves deadlocks when we 1668 /* First test before acquiring mutex - solves deadlocks when we
1631 * re-enter the quota code and are already holding the mutex */ 1669 * re-enter the quota code and are already holding the mutex */
1632 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1670 if (!dquot_active(inode)) {
1633 inode_decr_space(inode, number, reserve); 1671 inode_decr_space(inode, number, reserve);
1634 return; 1672 return;
1635 } 1673 }
@@ -1667,7 +1705,7 @@ void dquot_free_inode(const struct inode *inode)
1667 1705
1668 /* First test before acquiring mutex - solves deadlocks when we 1706 /* First test before acquiring mutex - solves deadlocks when we
1669 * re-enter the quota code and are already holding the mutex */ 1707 * re-enter the quota code and are already holding the mutex */
1670 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1708 if (!dquot_active(inode))
1671 return; 1709 return;
1672 1710
1673 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1711 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1701,6 +1739,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1701 qsize_t rsv_space = 0; 1739 qsize_t rsv_space = 0;
1702 struct dquot *transfer_from[MAXQUOTAS] = {}; 1740 struct dquot *transfer_from[MAXQUOTAS] = {};
1703 int cnt, ret = 0; 1741 int cnt, ret = 0;
1742 char is_valid[MAXQUOTAS] = {};
1704 char warntype_to[MAXQUOTAS]; 1743 char warntype_to[MAXQUOTAS];
1705 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1744 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
1706 1745
@@ -1722,8 +1761,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1722 space = cur_space + rsv_space; 1761 space = cur_space + rsv_space;
1723 /* Build the transfer_from list and check the limits */ 1762 /* Build the transfer_from list and check the limits */
1724 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1763 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1764 /*
1765 * Skip changes for same uid or gid or for turned off quota-type.
1766 */
1725 if (!transfer_to[cnt]) 1767 if (!transfer_to[cnt])
1726 continue; 1768 continue;
1769 /* Avoid races with quotaoff() */
1770 if (!sb_has_quota_active(inode->i_sb, cnt))
1771 continue;
1772 is_valid[cnt] = 1;
1727 transfer_from[cnt] = inode->i_dquot[cnt]; 1773 transfer_from[cnt] = inode->i_dquot[cnt];
1728 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); 1774 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
1729 if (ret) 1775 if (ret)
@@ -1737,12 +1783,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1737 * Finally perform the needed transfer from transfer_from to transfer_to 1783 * Finally perform the needed transfer from transfer_from to transfer_to
1738 */ 1784 */
1739 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1785 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1740 /* 1786 if (!is_valid[cnt])
1741 * Skip changes for same uid or gid or for turned off quota-type.
1742 */
1743 if (!transfer_to[cnt])
1744 continue; 1787 continue;
1745
1746 /* Due to IO error we might not have transfer_from[] structure */ 1788 /* Due to IO error we might not have transfer_from[] structure */
1747 if (transfer_from[cnt]) { 1789 if (transfer_from[cnt]) {
1748 warntype_from_inodes[cnt] = 1790 warntype_from_inodes[cnt] =
@@ -1766,18 +1808,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1766 1808
1767 mark_all_dquot_dirty(transfer_from); 1809 mark_all_dquot_dirty(transfer_from);
1768 mark_all_dquot_dirty(transfer_to); 1810 mark_all_dquot_dirty(transfer_to);
1769 /* Pass back references to put */
1770 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1771 transfer_to[cnt] = transfer_from[cnt];
1772warn:
1773 flush_warnings(transfer_to, warntype_to); 1811 flush_warnings(transfer_to, warntype_to);
1774 flush_warnings(transfer_from, warntype_from_inodes); 1812 flush_warnings(transfer_from, warntype_from_inodes);
1775 flush_warnings(transfer_from, warntype_from_space); 1813 flush_warnings(transfer_from, warntype_from_space);
1776 return ret; 1814 /* Pass back references to put */
1815 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1816 if (is_valid[cnt])
1817 transfer_to[cnt] = transfer_from[cnt];
1818 return 0;
1777over_quota: 1819over_quota:
1778 spin_unlock(&dq_data_lock); 1820 spin_unlock(&dq_data_lock);
1779 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1821 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1780 goto warn; 1822 flush_warnings(transfer_to, warntype_to);
1823 return ret;
1781} 1824}
1782EXPORT_SYMBOL(__dquot_transfer); 1825EXPORT_SYMBOL(__dquot_transfer);
1783 1826
@@ -1790,7 +1833,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1790 struct super_block *sb = inode->i_sb; 1833 struct super_block *sb = inode->i_sb;
1791 int ret; 1834 int ret;
1792 1835
1793 if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode)) 1836 if (!dquot_active(inode))
1794 return 0; 1837 return 0;
1795 1838
1796 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) 1839 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
@@ -1957,7 +2000,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
1957 truncate_inode_pages(&toputinode[cnt]->i_data, 2000 truncate_inode_pages(&toputinode[cnt]->i_data,
1958 0); 2001 0);
1959 mutex_unlock(&toputinode[cnt]->i_mutex); 2002 mutex_unlock(&toputinode[cnt]->i_mutex);
1960 mark_inode_dirty(toputinode[cnt]); 2003 mark_inode_dirty_sync(toputinode[cnt]);
1961 } 2004 }
1962 mutex_unlock(&dqopt->dqonoff_mutex); 2005 mutex_unlock(&dqopt->dqonoff_mutex);
1963 } 2006 }
@@ -2270,7 +2313,7 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2270 memset(di, 0, sizeof(*di)); 2313 memset(di, 0, sizeof(*di));
2271 di->d_version = FS_DQUOT_VERSION; 2314 di->d_version = FS_DQUOT_VERSION;
2272 di->d_flags = dquot->dq_type == USRQUOTA ? 2315 di->d_flags = dquot->dq_type == USRQUOTA ?
2273 XFS_USER_QUOTA : XFS_GROUP_QUOTA; 2316 FS_USER_QUOTA : FS_GROUP_QUOTA;
2274 di->d_id = dquot->dq_id; 2317 di->d_id = dquot->dq_id;
2275 2318
2276 spin_lock(&dq_data_lock); 2319 spin_lock(&dq_data_lock);
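The dquot_active() helper introduced above folds the repeated "any quota loaded and not merely suspended" test into one place. Because sb_any_quota_loaded() and sb_any_quota_suspended() return per-type bitmasks, masking the suspended bits out of the loaded bits yields exactly the quota types that are still enforceable. A minimal userspace model of that flag arithmetic, with an assumed flag layout and invented helper names (an illustration, not the kernel's definitions):

    #include <assert.h>
    #include <stdio.h>

    /* One "enabled" and one "suspended" bit per quota type; this
     * layout is an assumption of the sketch. */
    enum { USRQUOTA, GRPQUOTA, MAXQUOTAS };
    #define ENABLED(type)   (1u << (type))
    #define SUSPENDED(type) (1u << (MAXQUOTAS + (type)))

    static unsigned any_loaded(unsigned flags)
    {
        return flags & (ENABLED(USRQUOTA) | ENABLED(GRPQUOTA));
    }

    static unsigned any_suspended(unsigned flags)
    {
        /* shift suspended bits down so they line up with enabled bits */
        return (flags >> MAXQUOTAS) & (ENABLED(USRQUOTA) | ENABLED(GRPQUOTA));
    }

    /* the dquot_active() shape: loaded types minus suspended types */
    static unsigned active(unsigned flags)
    {
        return any_loaded(flags) & ~any_suspended(flags);
    }

    int main(void)
    {
        unsigned flags = ENABLED(USRQUOTA) | ENABLED(GRPQUOTA);

        assert(active(flags) == (ENABLED(USRQUOTA) | ENABLED(GRPQUOTA)));
        flags |= SUSPENDED(USRQUOTA);
        assert(active(flags) == ENABLED(GRPQUOTA)); /* user quota drops out */
        flags |= SUSPENDED(GRPQUOTA);
        assert(active(flags) == 0);                 /* nothing enforceable */
        printf("flag model behaves as expected\n");
        return 0;
    }

Note that the result is a mask rather than a boolean, which is why the call sites above can simply test it for truth.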
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 24f03407eeb..9e48874eabc 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -65,8 +65,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
 	ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
 	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
 	if (ret != info->dqi_usable_bs) {
-		q_warn(KERN_WARNING "VFS: dquota write failed on "
-			"dev %s\n", sb->s_id);
+		quota_error(sb, "dquota write failed");
 		if (ret >= 0)
 			ret = -EIO;
 	}
@@ -160,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
 	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
 	/* No matter whether write succeeds block is out of list */
 	if (write_blk(info, blk, buf) < 0)
-		q_warn(KERN_ERR
-		       "VFS: Can't write block (%u) with free entries.\n",
-		       blk);
+		quota_error(info->dqi_sb, "Can't write block (%u) "
+			    "with free entries", blk);
 	return 0;
 out_buf:
 	kfree(tmpbuf);
@@ -252,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
 		*err = remove_free_dqentry(info, buf, blk);
 		if (*err < 0) {
-			q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
-			       "remove block (%u) from entry free list.\n",
-			       blk);
+			quota_error(dquot->dq_sb, "Can't remove block (%u) "
+				    "from entry free list", blk);
 			goto out_buf;
 		}
 	}
@@ -268,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	}
 #ifdef __QUOTA_QT_PARANOIA
 	if (i == qtree_dqstr_in_blk(info)) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
-		       "but it shouldn't.\n");
+		quota_error(dquot->dq_sb, "Data block full but it shouldn't");
 		*err = -EIO;
 		goto out_buf;
 	}
 #endif
 	*err = write_blk(info, blk, buf);
 	if (*err < 0) {
-		q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
-		       "data block %u.\n", blk);
+		quota_error(dquot->dq_sb, "Can't write quota data block %u",
+			    blk);
 		goto out_buf;
 	}
 	dquot->dq_off = (blk << info->dqi_blocksize_bits) +
@@ -311,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	} else {
 		ret = read_blk(info, *treeblk, buf);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Can't read tree quota block "
-			       "%u.\n", *treeblk);
+			quota_error(dquot->dq_sb, "Can't read tree quota "
+				    "block %u", *treeblk);
 			goto out_buf;
 		}
 	}
@@ -323,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	if (depth == info->dqi_qtree_depth - 1) {
 #ifdef __QUOTA_QT_PARANOIA
 		if (newblk) {
-			printk(KERN_ERR "VFS: Inserting already present quota "
-			       "entry (block %u).\n",
+			quota_error(dquot->dq_sb, "Inserting already present "
+				    "quota entry (block %u)",
 			       le32_to_cpu(ref[get_index(info,
 						dquot->dq_id, depth)]));
 			ret = -EIO;
 			goto out_buf;
@@ -373,8 +369,8 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	if (!dquot->dq_off) {
 		ret = dq_insert_tree(info, dquot);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Error %zd occurred while "
-			       "creating quota.\n", ret);
+			quota_error(sb, "Error %zd occurred while creating "
+				    "quota", ret);
 			kfree(ddquot);
 			return ret;
 		}
@@ -385,8 +381,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
 				    dquot->dq_off);
 	if (ret != info->dqi_entry_size) {
-		q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
-		       sb->s_id);
+		quota_error(sb, "dquota write failed");
 		if (ret >= 0)
 			ret = -ENOSPC;
 	} else {
@@ -410,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	if (!buf)
 		return -ENOMEM;
 	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
-		q_warn(KERN_ERR "VFS: Quota structure has offset to other "
-		       "block (%u) than it should (%u).\n", blk,
-		       (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+		quota_error(dquot->dq_sb, "Quota structure has offset to "
+			    "other block (%u) than it should (%u)", blk,
+			    (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
 		goto out_buf;
 	}
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+		quota_error(dquot->dq_sb, "Can't read quota data block %u",
+			    blk);
 		goto out_buf;
 	}
 	dh = (struct qt_disk_dqdbheader *)buf;
@@ -427,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		if (ret >= 0)
 			ret = put_free_dqblk(info, buf, blk);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
-			       "to free list.\n", blk);
+			quota_error(dquot->dq_sb, "Can't move quota data block "
+				    "(%u) to free list", blk);
 			goto out_buf;
 		}
 	} else {
@@ -440,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		/* Insert will write block itself */
 		ret = insert_free_dqentry(info, buf, blk);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Can't insert quota data "
-			       "block (%u) to free entry list.\n", blk);
+			quota_error(dquot->dq_sb, "Can't insert quota "
+				    "data block (%u) to free entry list", blk);
 			goto out_buf;
 		}
 	} else {
 		ret = write_blk(info, blk, buf);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Can't write quota data "
-			       "block %u\n", blk);
+			quota_error(dquot->dq_sb, "Can't write quota "
+				    "data block %u", blk);
 			goto out_buf;
 		}
 	}
@@ -472,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		return -ENOMEM;
 	ret = read_blk(info, *blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+		quota_error(dquot->dq_sb, "Can't read quota data "
+			    "block %u", *blk);
 		goto out_buf;
 	}
 	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -496,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		} else {
 			ret = write_blk(info, *blk, buf);
 			if (ret < 0)
-				q_warn(KERN_ERR "VFS: Can't write quota tree "
-				       "block %u.\n", *blk);
+				quota_error(dquot->dq_sb, "Can't write quota "
+					    "tree block %u", *blk);
 		}
 	}
 out_buf:
@@ -529,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		return -ENOMEM;
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		quota_error(dquot->dq_sb, "Can't read quota tree "
+			    "block %u", blk);
 		goto out_buf;
 	}
 	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -539,8 +537,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		ddquot += info->dqi_entry_size;
 	}
 	if (i == qtree_dqstr_in_blk(info)) {
-		q_warn(KERN_ERR "VFS: Quota for id %u referenced "
-		       "but not present.\n", dquot->dq_id);
+		quota_error(dquot->dq_sb, "Quota for id %u referenced "
+			    "but not present", dquot->dq_id);
 		ret = -EIO;
 		goto out_buf;
 	} else {
@@ -564,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 		return -ENOMEM;
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		quota_error(dquot->dq_sb, "Can't read quota tree block %u",
+			    blk);
 		goto out_buf;
 	}
 	ret = 0;
@@ -598,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 #ifdef __QUOTA_QT_PARANOIA
 	/* Invalidated quota? */
 	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
-		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+		quota_error(sb, "Quota invalidated while reading!");
 		return -EIO;
 	}
 #endif
@@ -607,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	offset = find_dqentry(info, dquot);
 	if (offset <= 0) {	/* Entry not present? */
 		if (offset < 0)
-			q_warn(KERN_ERR "VFS: Can't read quota "
-			       "structure for id %u.\n", dquot->dq_id);
+			quota_error(sb, "Can't read quota structure "
+				    "for id %u", dquot->dq_id);
 		dquot->dq_off = 0;
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -625,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	if (ret != info->dqi_entry_size) {
 		if (ret >= 0)
 			ret = -EIO;
-		q_warn(KERN_ERR "VFS: Error while reading quota "
-		       "structure for id %u.\n", dquot->dq_id);
+		quota_error(sb, "Error while reading quota structure for id %u",
+			    dquot->dq_id);
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
 		kfree(ddquot);
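Every call site in this file used to repeat the severity, the "VFS" tag, the device name, and the trailing newline; quota_error(sb, fmt, ...) owns all of that, so messages shrink to their variable parts. (In remove_tree() above, note that blk is a uint pointer in that function, so the format argument must be *blk, not blk.) A userspace sketch of the same centralization, where the super_block stand-in and the prefix format are assumptions of the sketch, not the kernel's:

    #include <stdarg.h>
    #include <stdio.h>

    struct super_block { const char *s_id; }; /* stand-in for the kernel's */

    /* One helper owns the prefix and the newline; call sites pass only
     * the message and its arguments, like quota_error(). */
    static void quota_error(const struct super_block *sb, const char *fmt, ...)
    {
        va_list ap;

        fprintf(stderr, "Quota error (device %s): ", sb->s_id);
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
        fputc('\n', stderr);
    }

    int main(void)
    {
        struct super_block sb = { .s_id = "sda1" };

        quota_error(&sb, "Can't read quota tree block %u", 42u);
        return 0;
    }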
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index ccc3e71fb1d..a1ab8db81a5 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,10 +22,4 @@ struct qt_disk_dqdbheader {
 
 #define QT_TREEOFF	1	/* Offset of tree in file in blocks */
 
-#define q_warn(fmt, args...) \
-do { \
-	if (printk_ratelimit()) \
-		printk(fmt, ## args); \
-} while(0)
-
 #endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 4af344c5852..34b37a67bb1 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
 			(char *)&dqblk, sizeof(struct v1_disk_dqblk),
 			v1_dqoff(dquot->dq_id));
 	if (ret != sizeof(struct v1_disk_dqblk)) {
-		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
-			dquot->dq_sb->s_id);
+		quota_error(dquot->dq_sb, "dquota write failed");
 		if (ret >= 0)
 			ret = -EIO;
 		goto out;
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 135206af145..65444d29406 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type,
 	size = sb->s_op->quota_read(sb, type, (char *)dqhead,
 				    sizeof(struct v2_disk_dqheader), 0);
 	if (size != sizeof(struct v2_disk_dqheader)) {
-		q_warn(KERN_WARNING "quota_v2: Failed header read:"
-		       " expected=%zd got=%zd\n",
-		       sizeof(struct v2_disk_dqheader), size);
+		quota_error(sb, "Failed header read: expected=%zd got=%zd",
+			    sizeof(struct v2_disk_dqheader), size);
 		return 0;
 	}
 	return 1;
@@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
-		q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
-		       sb->s_id);
+		quota_error(sb, "Can't read info structure");
 		return -1;
 	}
 	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
@@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
-		q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
-		       sb->s_id);
+		quota_error(sb, "Can't write info structure");
 		return -1;
 	}
 	return 0;
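v2_read_header() reads a fixed-size on-disk header and treats any short read as a failure, reporting the expected and actual sizes. A hedged userspace sketch of that read-exactly-or-fail pattern; the header layout below is invented for illustration and the real v2_disk_dqheader differs:

    #include <stdint.h>
    #include <stdio.h>

    struct disk_header {        /* assumed layout, illustration only */
        uint32_t magic;
        uint32_t version;
    };

    /* mirror the "short read is an error" rule from v2_read_header() */
    static int read_header(FILE *f, struct disk_header *h)
    {
        size_t got = fread(h, 1, sizeof(*h), f);

        if (got != sizeof(*h)) {
            fprintf(stderr, "Failed header read: expected=%zu got=%zu\n",
                    sizeof(*h), got);
            return 0;
        }
        return 1;
    }

    int main(void)
    {
        struct disk_header out = { 0x1234u, 1u }, in;
        FILE *f = tmpfile();

        if (!f || fwrite(&out, 1, sizeof(out), f) != sizeof(out))
            return 1;
        rewind(f);
        if (!read_header(f, &in))
            return 1;
        printf("magic=%#x version=%u\n", (unsigned)in.magic,
               (unsigned)in.version);
        fclose(f);
        return 0;
    }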
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index d532c20fc17..9eead2c796b 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -146,9 +146,8 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
 		return ret;
 	}
 
-	ret = simple_setsize(inode, newsize);
-
-	return ret;
+	truncate_setsize(inode, newsize);
+	return 0;
 }
 
 /*****************************************************************************/
@@ -183,7 +182,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 		}
 	}
 
-	generic_setattr(inode, ia);
+	setattr_copy(inode, ia);
  out:
 	ia->ia_valid = old_ia_valid;
 	return ret;
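The switch from simple_setsize()/generic_setattr() to truncate_setsize()/setattr_copy() reflects the split of the old inode_setattr(): the failable size change happens first, and the attribute copy at the end cannot fail, so it needs no error path. A toy model of that validate, then do the failable work, then copy ordering; the field names are assumptions of this sketch:

    #include <errno.h>
    #include <stdio.h>
    #include <sys/types.h>

    struct toy_inode { uid_t uid; off_t size; int dirty; };
    struct toy_attr  { int set_uid, set_size; uid_t uid; off_t size; };

    static int change_ok(const struct toy_inode *i, const struct toy_attr *a)
    {
        (void)i;
        return (a->set_size && a->size < 0) ? -EINVAL : 0;
    }

    /* the setattr_copy() shape: by the time we copy, nothing can fail */
    static void attr_copy(struct toy_inode *i, const struct toy_attr *a)
    {
        if (a->set_uid)
            i->uid = a->uid;
        if (a->set_size)
            i->size = a->size;
        i->dirty = 1;
    }

    static int toy_setattr(struct toy_inode *i, const struct toy_attr *a)
    {
        int err = change_ok(i, a);      /* 1. validate up front */

        if (err)
            return err;
        /* 2. failable work (resizing pages, etc.) would go here */
        attr_copy(i, a);                /* 3. infallible copy + dirty */
        return 0;
    }

    int main(void)
    {
        struct toy_inode ino = { .uid = 0, .size = 100, .dirty = 0 };
        struct toy_attr  a   = { .set_size = 1, .size = 50 };

        printf("setattr=%d size=%lld dirty=%d\n", toy_setattr(&ino, &a),
               (long long)ino.size, ino.dirty);
        return 0;
    }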
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6..eacb166fb25 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 	struct inode * inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
@@ -254,17 +255,16 @@ fail:
 	return err;
 }
 
-int ramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+struct dentry *ramfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, ramfs_fill_super);
 }
 
-static int rootfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *rootfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
-			    mnt);
+	return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
 }
 
 static void ramfs_kill_sb(struct super_block *sb)
@@ -275,12 +275,12 @@ static void ramfs_kill_sb(struct super_block *sb)
 
 static struct file_system_type ramfs_fs_type = {
 	.name		= "ramfs",
-	.get_sb		= ramfs_get_sb,
+	.mount		= ramfs_mount,
 	.kill_sb	= ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
 	.name		= "rootfs",
-	.get_sb		= rootfs_get_sb,
+	.mount		= rootfs_mount,
 	.kill_sb	= kill_litter_super,
 };
 
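The .get_sb to .mount conversion replaces an int return plus a struct vfsmount out-parameter with a single struct dentry * return that carries both the result and, in the kernel via an error pointer, the failure. A userspace sketch of the signature shape only; the structs are stand-ins, not the VFS types, and NULL stands in for ERR_PTR():

    #include <stdio.h>
    #include <stdlib.h>

    struct dentry { const char *name; };

    struct fs_type {
        const char *name;
        /* new-style callback: the return value is the root dentry */
        struct dentry *(*mount)(struct fs_type *fs, int flags,
                                const char *dev, void *data);
    };

    static struct dentry *ramfs_like_mount(struct fs_type *fs, int flags,
                                           const char *dev, void *data)
    {
        struct dentry *root = malloc(sizeof(*root));

        (void)flags; (void)dev; (void)data;
        if (root)
            root->name = fs->name;
        return root;    /* success and failure travel in one value */
    }

    int main(void)
    {
        struct fs_type ramfs = { "ramfs", ramfs_like_mount };
        struct dentry *root = ramfs.mount(&ramfs, 0, NULL, NULL);

        if (!root)
            return 1;
        printf("mounted %s\n", root->name);
        free(root);
        return 0;
    }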
diff --git a/fs/read_write.c b/fs/read_write.c
index 9c0485236e6..431a0ed610c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = {
 
 EXPORT_SYMBOL(generic_ro_fops);
 
+static int
+__negative_fpos_check(struct file *file, loff_t pos, size_t count)
+{
+	/*
+	 * pos or pos+count is negative here, check overflow.
+	 * too big "count" will be caught in rw_verify_area().
+	 */
+	if ((pos < 0) && (pos + count < pos))
+		return -EOVERFLOW;
+	if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+		return 0;
+	return -EINVAL;
+}
+
 /**
  * generic_file_llseek_unlocked - lockless generic llseek implementation
  * @file:	file structure to seek on
@@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 		break;
 	}
 
-	if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+	if (offset < 0 && __negative_fpos_check(file, offset, 0))
+		return -EINVAL;
+	if (offset > inode->i_sb->s_maxbytes)
 		return -EINVAL;
 
 	/* Special lock needed here? */
@@ -124,7 +140,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 {
 	loff_t retval;
 
-	lock_kernel();
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
 	switch (origin) {
 		case SEEK_END:
 			offset += i_size_read(file->f_path.dentry->d_inode);
@@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 			offset += file->f_pos;
 	}
 	retval = -EINVAL;
-	if (offset >= 0) {
+	if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
 			file->f_version = 0;
@@ -145,7 +161,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		retval = offset;
 	}
 out:
-	unlock_kernel();
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 	return retval;
 }
 EXPORT_SYMBOL(default_llseek);
@@ -156,7 +172,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 
 	fn = no_llseek;
 	if (file->f_mode & FMODE_LSEEK) {
-		fn = default_llseek;
 		if (file->f_op && file->f_op->llseek)
 			fn = file->f_op->llseek;
 	}
@@ -222,13 +237,12 @@ bad:
 }
 #endif
 
+
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
  * won't have to do range checks all the time.
 */
-#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
-
 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 {
 	struct inode *inode;
@@ -239,8 +253,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 	if (unlikely((ssize_t) count < 0))
 		return retval;
 	pos = *ppos;
-	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
-		return retval;
+	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
+		retval = __negative_fpos_check(file, pos, count);
+		if (retval)
+			return retval;
+	}
 
 	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 		retval = locks_mandatory_area(
@@ -311,7 +328,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 	else
 		ret = do_sync_read(file, buf, count, pos);
 	if (ret > 0) {
-		fsnotify_access(file->f_path.dentry);
+		fsnotify_access(file);
 		add_rchar(current, ret);
 	}
 	inc_syscr(current);
@@ -367,7 +384,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 	else
 		ret = do_sync_write(file, buf, count, pos);
 	if (ret > 0) {
-		fsnotify_modify(file->f_path.dentry);
+		fsnotify_modify(file);
 		add_wchar(current, ret);
 	}
 	inc_syscw(current);
@@ -565,65 +582,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			      unsigned long nr_segs, unsigned long fast_segs,
 			      struct iovec *fast_pointer,
 			      struct iovec **ret_pointer)
- {
+{
 	unsigned long seg;
 	ssize_t ret;
 	struct iovec *iov = fast_pointer;
 
 	/*
 	 * SuS says "The readv() function *may* fail if the iovcnt argument
 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 	 * traditionally returned zero for zero segments, so...
 	 */
 	if (nr_segs == 0) {
 		ret = 0;
 		goto out;
 	}
 
 	/*
 	 * First get the "struct iovec" from user memory and
 	 * verify all the pointers
 	 */
 	if (nr_segs > UIO_MAXIOV) {
 		ret = -EINVAL;
 		goto out;
 	}
 	if (nr_segs > fast_segs) {
 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 		if (iov == NULL) {
 			ret = -ENOMEM;
 			goto out;
 		}
 	}
 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 		ret = -EFAULT;
 		goto out;
 	}
 
 	/*
 	 * According to the Single Unix Specification we should return EINVAL
 	 * if an element length is < 0 when cast to ssize_t or if the
 	 * total length would overflow the ssize_t return value of the
 	 * system call.
-	 */
+	 *
+	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
+	 * overflow case.
+	 */
 	ret = 0;
 	for (seg = 0; seg < nr_segs; seg++) {
 		void __user *buf = iov[seg].iov_base;
 		ssize_t len = (ssize_t)iov[seg].iov_len;
 
 		/* see if we we're about to use an invalid len or if
 		 * it's about to overflow ssize_t */
-		if (len < 0 || (ret + len < ret)) {
+		if (len < 0) {
 			ret = -EINVAL;
 			goto out;
 		}
 		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 			ret = -EFAULT;
 			goto out;
+		}
+		if (len > MAX_RW_COUNT - ret) {
+			len = MAX_RW_COUNT - ret;
+			iov[seg].iov_len = len;
 		}
-
 		ret += len;
 	}
 out:
 	*ret_pointer = iov;
 	return ret;
@@ -675,9 +698,9 @@ out:
 	kfree(iov);
 	if ((ret + (type == READ)) > 0) {
 		if (type == READ)
-			fsnotify_access(file->f_path.dentry);
+			fsnotify_access(file);
 		else
-			fsnotify_modify(file->f_path.dentry);
+			fsnotify_modify(file);
 	}
 	return ret;
 }
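Two behaviours above are worth calling out. __negative_fpos_check() lets files opened with FMODE_UNSIGNED_OFFSET (mem-like character devices) keep offsets that are negative when viewed as loff_t while still rejecting true wraparound, and the rw_copy_check_uvector() change stops rejecting oversized readv/writev vectors outright and instead trims segment lengths so the running total never exceeds MAX_RW_COUNT, the same silent truncation plain read/write already performs. Both in one runnable userspace model; the constants and the loff_t stand-in are assumptions of this sketch:

    #include <errno.h>
    #include <limits.h>
    #include <stddef.h>
    #include <stdio.h>

    #define FMODE_UNSIGNED_OFFSET 0x1          /* assumed flag value */
    #define MAX_RW_COUNT (INT_MAX & ~4095)     /* page-aligned cap */

    typedef long long loff_tt;                 /* loff_t stand-in */

    static int negative_fpos_check(unsigned fmode, loff_tt pos, size_t count)
    {
        /* pos (or pos+count) is negative: overflow is always fatal,
         * a merely negative offset is fine for unsigned-offset files */
        if (pos < 0 && (loff_tt)(pos + count) < pos)
            return -EOVERFLOW;
        if (fmode & FMODE_UNSIGNED_OFFSET)
            return 0;
        return -EINVAL;
    }

    int main(void)
    {
        /* ordinary file: any negative offset is rejected (-EINVAL) */
        printf("%d\n", negative_fpos_check(0, -4096, 100));
        /* unsigned-offset file: a negative offset is accepted (0) */
        printf("%d\n", negative_fpos_check(FMODE_UNSIGNED_OFFSET, -4096, 100));

        /* readv-style capping: trim each segment so the running total
         * never exceeds MAX_RW_COUNT, as in the loop above */
        size_t lens[3] = { (size_t)MAX_RW_COUNT - 10, 100, 100 };
        long long total = 0;
        for (int i = 0; i < 3; i++) {
            long long len = (long long)lens[i];
            if (len > MAX_RW_COUNT - total)
                len = MAX_RW_COUNT - total;
            total += len;
        }
        printf("total=%lld cap=%d\n", total, MAX_RW_COUNT);
        return 0;
    }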
diff --git a/fs/readdir.c b/fs/readdir.c
index 7723401f8d8..356f71528ad 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -4,6 +4,7 @@
  * Copyright (C) 1995  Linus Torvalds
  */
 
+#include <linux/stddef.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/time.h>
@@ -54,7 +55,6 @@ EXPORT_SYMBOL(vfs_readdir);
  * anyway. Thus the special "fillonedir()" function for that
  * case (the low-level handlers don't need to care about this).
  */
-#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
 
 #ifdef __ARCH_WANT_OLD_READDIR
 
@@ -152,7 +152,8 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
 	struct linux_dirent __user * dirent;
 	struct getdents_callback * buf = (struct getdents_callback *) __buf;
 	unsigned long d_ino;
-	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long));
+	int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
+		sizeof(long));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
@@ -237,7 +238,8 @@ static int filldir64(void * __buf, const char * name, int namlen, loff_t offset,
 {
 	struct linux_dirent64 __user *dirent;
 	struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
-	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64));
+	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
+		sizeof(u64));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
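offsetof(struct linux_dirent, d_name) derives the header size from the type alone, where the removed NAME_OFFSET(de) macro did pointer arithmetic through a dirent pointer that may not point at a valid object. The record length is then that header plus the name, its NUL, and one trailing byte, rounded up to the record alignment. A runnable sketch with a simplified stand-in layout, not the kernel's struct:

    #include <stddef.h>
    #include <stdio.h>

    struct dirent_like {               /* simplified stand-in layout */
        unsigned long  d_ino;
        unsigned long  d_off;
        unsigned short d_reclen;
        char           d_name[1];      /* name copied in place after header */
    };

    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

    static size_t record_len(size_t namlen)
    {
        /* header + name + NUL + type byte, rounded to a long boundary */
        return ALIGN_UP(offsetof(struct dirent_like, d_name) + namlen + 2,
                        sizeof(long));
    }

    int main(void)
    {
        printf("header=%zu\n", offsetof(struct dirent_like, d_name));
        printf("reclen(1)=%zu reclen(12)=%zu\n", record_len(1),
               record_len(12));
        return 0;
    }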
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 513f431038f..7cd46666ba2 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -10,7 +10,8 @@ config REISERFS_FS
 
 	  In general, ReiserFS is as fast as ext2, but is very efficient with
 	  large directories and small files.  Additional patches are needed
-	  for NFS and quotas, please see <http://www.namesys.com/> for links.
+	  for NFS and quotas, please see
+	  <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
 
 	  It is more easily extended to have features currently found in
 	  database and keyword search systems than block allocation based file
@@ -18,7 +19,8 @@ config REISERFS_FS
 	  plugins consistent with our motto ``It takes more than a license to
 	  make source code open.''
 
-	  Read <http://www.namesys.com/> to learn more about reiserfs.
+	  Read <https://reiser4.wiki.kernel.org/index.php/Main_Page>
+	  to learn more about reiserfs.
 
 	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
 
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 14e8c9d460e..e2f7a264e3f 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -43,7 +43,7 @@ to address the fair crediting issue in the next GPL version.)
 [END LICENSING]
 
 Reiserfs is a file system based on balanced tree algorithms, which is
-described at http://devlinux.com/namesys.
+described at https://reiser4.wiki.kernel.org/index.php/Main_Page
 
 Stop reading here.  Go there, then return.
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index b82cdd8a45d..91f080cc76c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -38,20 +38,24 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 
 	BUG_ON(!S_ISREG(inode->i_mode));
 
+	if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
+		return 0;
+
+	mutex_lock(&(REISERFS_I(inode)->tailpack));
+
+	if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
+		mutex_unlock(&(REISERFS_I(inode)->tailpack));
+		return 0;
+	}
+
 	/* fast out for when nothing needs to be done */
-	if ((atomic_read(&inode->i_count) > 1 ||
-	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
+	if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
 	     !tail_has_to_be_packed(inode)) &&
 	    REISERFS_I(inode)->i_prealloc_count <= 0) {
+		mutex_unlock(&(REISERFS_I(inode)->tailpack));
 		return 0;
 	}
 
-	mutex_lock(&inode->i_mutex);
-
-	mutex_lock(&(REISERFS_I(inode)->i_mmap));
-	if (REISERFS_I(inode)->i_flags & i_ever_mapped)
-		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
 	reiserfs_write_lock(inode->i_sb);
 	/* freeing preallocation only involves relogging blocks that
 	 * are already in the current transaction.  preallocation gets
@@ -94,9 +98,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 	if (!err)
 		err = jbegin_failure;
 
-	if (!err && atomic_read(&inode->i_count) <= 1 &&
+	if (!err &&
 	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
 	    tail_has_to_be_packed(inode)) {
+
 		/* if regular file is released by last holder and it has been
 		   appended (we append by unformatted node only) or its direct
 		   item(s) had to be converted, then it may have to be
@@ -104,27 +109,28 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 		err = reiserfs_truncate_file(inode, 0);
 	}
       out:
-	mutex_unlock(&(REISERFS_I(inode)->i_mmap));
-	mutex_unlock(&inode->i_mutex);
 	reiserfs_write_unlock(inode->i_sb);
+	mutex_unlock(&(REISERFS_I(inode)->tailpack));
 	return err;
 }
 
-static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int reiserfs_file_open(struct inode *inode, struct file *file)
 {
-	struct inode *inode;
-
-	inode = file->f_path.dentry->d_inode;
-	mutex_lock(&(REISERFS_I(inode)->i_mmap));
-	REISERFS_I(inode)->i_flags |= i_ever_mapped;
-	mutex_unlock(&(REISERFS_I(inode)->i_mmap));
-
-	return generic_file_mmap(file, vma);
+	int err = dquot_file_open(inode, file);
+	if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
+		/* somebody might be tailpacking on final close; wait for it */
+		mutex_lock(&(REISERFS_I(inode)->tailpack));
+		atomic_inc(&REISERFS_I(inode)->openers);
+		mutex_unlock(&(REISERFS_I(inode)->tailpack));
+	}
+	return err;
 }
 
 static void reiserfs_vfs_truncate_file(struct inode *inode)
 {
+	mutex_lock(&(REISERFS_I(inode)->tailpack));
 	reiserfs_truncate_file(inode, 1);
+	mutex_unlock(&(REISERFS_I(inode)->tailpack));
 }
 
 /* Sync a reiserfs file. */
@@ -146,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
@@ -288,8 +293,8 @@ const struct file_operations reiserfs_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = reiserfs_compat_ioctl,
 #endif
-	.mmap = reiserfs_file_mmap,
-	.open = dquot_file_open,
+	.mmap = generic_file_mmap,
+	.open = reiserfs_file_open,
 	.release = reiserfs_file_release,
 	.fsync = reiserfs_sync_file,
 	.aio_read = generic_file_aio_read,
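The open/release pairing above replaces the old i_mmap/i_ever_mapped bookkeeping with an openers count: release takes a lock-free fast path unless it is the last opener, and an open that races with a final close waits on the tailpack mutex before taking its reference. A userspace model of that handshake using C11 atomics and a pthread mutex; the helper mirrors atomic_add_unless()/atomic_inc_not_zero(), and starting the count at 1 (one file already open) is an assumption of this model:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int openers = 1;
    static pthread_mutex_t tailpack = PTHREAD_MUTEX_INITIALIZER;

    /* kernel atomic_add_unless(): add a to *v unless *v == u */
    static bool add_unless(atomic_int *v, int a, int u)
    {
        int c = atomic_load(v);

        while (c != u) {
            if (atomic_compare_exchange_weak(v, &c, c + a))
                return true;   /* added; *v was not u */
        }
        return false;
    }

    static void file_open(void)
    {
        if (!add_unless(&openers, 1, 0)) {  /* inc_not_zero failed */
            /* a final close may be repacking the tail: wait for it */
            pthread_mutex_lock(&tailpack);
            atomic_fetch_add(&openers, 1);
            pthread_mutex_unlock(&tailpack);
        }
    }

    static void file_release(void)
    {
        /* fast path: drop a reference unless we are the last opener */
        if (add_unless(&openers, -1, 1))
            return;
        pthread_mutex_lock(&tailpack);
        if (atomic_fetch_sub(&openers, 1) == 1)   /* dec_and_test */
            printf("last closer: safe to repack the tail\n");
        pthread_mutex_unlock(&tailpack);
    }

    int main(void)
    {
        file_open();       /* second opener, so the first release is fast */
        file_release();
        file_release();    /* the last closer does the tailpack work */
        return 0;
    }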
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0f22fdaf54a..41656d40dc5 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,10 +22,8 @@
22 22
23int reiserfs_commit_write(struct file *f, struct page *page, 23int reiserfs_commit_write(struct file *f, struct page *page,
24 unsigned from, unsigned to); 24 unsigned from, unsigned to);
25int reiserfs_prepare_write(struct file *f, struct page *page,
26 unsigned from, unsigned to);
27 25
28void reiserfs_delete_inode(struct inode *inode) 26void reiserfs_evict_inode(struct inode *inode)
29{ 27{
30 /* We need blocks for transaction + (user+group) quota update (possibly delete) */ 28 /* We need blocks for transaction + (user+group) quota update (possibly delete) */
31 int jbegin_count = 29 int jbegin_count =
@@ -35,10 +33,12 @@ void reiserfs_delete_inode(struct inode *inode)
35 int depth; 33 int depth;
36 int err; 34 int err;
37 35
38 if (!is_bad_inode(inode)) 36 if (!inode->i_nlink && !is_bad_inode(inode))
39 dquot_initialize(inode); 37 dquot_initialize(inode);
40 38
41 truncate_inode_pages(&inode->i_data, 0); 39 truncate_inode_pages(&inode->i_data, 0);
40 if (inode->i_nlink)
41 goto no_delete;
42 42
43 depth = reiserfs_write_lock_once(inode->i_sb); 43 depth = reiserfs_write_lock_once(inode->i_sb);
44 44
@@ -77,9 +77,15 @@ void reiserfs_delete_inode(struct inode *inode)
77 ; 77 ;
78 } 78 }
79 out: 79 out:
80 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ 80 end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */
81 dquot_drop(inode);
81 inode->i_blocks = 0; 82 inode->i_blocks = 0;
82 reiserfs_write_unlock_once(inode->i_sb, depth); 83 reiserfs_write_unlock_once(inode->i_sb, depth);
84 return;
85
86no_delete:
87 end_writeback(inode);
88 dquot_drop(inode);
83} 89}
84 90
85static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, 91static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
@@ -157,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
157** but tail is still sitting in a direct item, and we can't write to 163** but tail is still sitting in a direct item, and we can't write to
158** it. So, look through this page, and check all the mapped buffers 164** it. So, look through this page, and check all the mapped buffers
159** to make sure they have valid block numbers. Any that don't need 165** to make sure they have valid block numbers. Any that don't need
160** to be unmapped, so that block_prepare_write will correctly call 166** to be unmapped, so that __block_write_begin will correctly call
161** reiserfs_get_block to convert the tail into an unformatted node 167** reiserfs_get_block to convert the tail into an unformatted node
162*/ 168*/
163static inline void fix_tail_page_for_writing(struct page *page) 169static inline void fix_tail_page_for_writing(struct page *page)
@@ -431,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
431} 437}
432 438
433/* special version of get_block that is only used by grab_tail_page right 439/* special version of get_block that is only used by grab_tail_page right
434** now. It is sent to block_prepare_write, and when you try to get a 440** now. It is sent to __block_write_begin, and when you try to get a
435** block past the end of the file (or a block from a hole) it returns 441** block past the end of the file (or a block from a hole) it returns
436** -ENOENT instead of a valid buffer. block_prepare_write expects to 442** -ENOENT instead of a valid buffer. __block_write_begin expects to
437** be able to do i/o on the buffers returned, unless an error value 443** be able to do i/o on the buffers returned, unless an error value
438** is also returned. 444** is also returned.
439** 445**
440** So, this allows block_prepare_write to be used for reading a single block 446** So, this allows __block_write_begin to be used for reading a single block
441** in a page. Where it does not produce a valid page for holes, or past the 447** in a page. Where it does not produce a valid page for holes, or past the
442** end of the file. This turns out to be exactly what we need for reading 448** end of the file. This turns out to be exactly what we need for reading
443** tails for conversion. 449** tails for conversion.
@@ -550,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode,
550 ** 556 **
551 ** We must fix the tail page for writing because it might have buffers 557 ** We must fix the tail page for writing because it might have buffers
552 ** that are mapped, but have a block number of 0. This indicates tail 558 ** that are mapped, but have a block number of 0. This indicates tail
553 ** data that has been read directly into the page, and block_prepare_write 559 ** data that has been read directly into the page, and
554 ** won't trigger a get_block in this case. 560 ** __block_write_begin won't trigger a get_block in this case.
555 */ 561 */
556 fix_tail_page_for_writing(tail_page); 562 fix_tail_page_for_writing(tail_page);
557 retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); 563 retval = __reiserfs_write_begin(tail_page, tail_start,
564 tail_end - tail_start);
558 if (retval) 565 if (retval)
559 goto unlock; 566 goto unlock;
560 567
@@ -1138,7 +1145,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
1138 REISERFS_I(inode)->i_prealloc_count = 0; 1145 REISERFS_I(inode)->i_prealloc_count = 0;
1139 REISERFS_I(inode)->i_trans_id = 0; 1146 REISERFS_I(inode)->i_trans_id = 0;
1140 REISERFS_I(inode)->i_jl = NULL; 1147 REISERFS_I(inode)->i_jl = NULL;
1141 mutex_init(&(REISERFS_I(inode)->i_mmap));
1142 reiserfs_init_xattr_rwsem(inode); 1148 reiserfs_init_xattr_rwsem(inode);
1143 1149
1144 if (stat_data_v1(ih)) { 1150 if (stat_data_v1(ih)) {
@@ -1221,7 +1227,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
1221 inode_set_bytes(inode, 1227 inode_set_bytes(inode,
1222 to_real_used_space(inode, inode->i_blocks, 1228 to_real_used_space(inode, inode->i_blocks,
1223 SD_V2_SIZE)); 1229 SD_V2_SIZE));
1224 /* read persistent inode attributes from sd and initalise 1230 /* read persistent inode attributes from sd and initialise
1225 generic inode flags from them */ 1231 generic inode flags from them */
1226 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); 1232 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
1227 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); 1233 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
@@ -1841,7 +1847,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1841 REISERFS_I(inode)->i_attrs = 1847 REISERFS_I(inode)->i_attrs =
1842 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; 1848 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1843 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); 1849 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1844 mutex_init(&(REISERFS_I(inode)->i_mmap));
1845 reiserfs_init_xattr_rwsem(inode); 1850 reiserfs_init_xattr_rwsem(inode);
1846 1851
1847 /* key to search for correct place for new stat data */ 1852 /* key to search for correct place for new stat data */
@@ -2027,7 +2032,7 @@ static int grab_tail_page(struct inode *inode,
2027 /* start within the page of the last block in the file */ 2032 /* start within the page of the last block in the file */
2028 start = (offset / blocksize) * blocksize; 2033 start = (offset / blocksize) * blocksize;
2029 2034
2030 error = block_prepare_write(page, start, offset, 2035 error = __block_write_begin(page, start, offset - start,
2031 reiserfs_get_block_create_0); 2036 reiserfs_get_block_create_0);
2032 if (error) 2037 if (error)
2033 goto unlock; 2038 goto unlock;
@@ -2432,7 +2437,7 @@ static int reiserfs_write_full_page(struct page *page,
2432 /* from this point on, we know the buffer is mapped to a 2437 /* from this point on, we know the buffer is mapped to a
2433 * real block and not a direct item 2438 * real block and not a direct item
2434 */ 2439 */
2435 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 2440 if (wbc->sync_mode != WB_SYNC_NONE) {
2436 lock_buffer(bh); 2441 lock_buffer(bh);
2437 } else { 2442 } else {
2438 if (!trylock_buffer(bh)) { 2443 if (!trylock_buffer(bh)) {
@@ -2587,8 +2592,7 @@ static int reiserfs_write_begin(struct file *file,
2587 old_ref = th->t_refcount; 2592 old_ref = th->t_refcount;
2588 th->t_refcount++; 2593 th->t_refcount++;
2589 } 2594 }
2590 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2595 ret = __block_write_begin(page, pos, len, reiserfs_get_block);
2591 reiserfs_get_block);
2592 if (ret && reiserfs_transaction_running(inode->i_sb)) { 2596 if (ret && reiserfs_transaction_running(inode->i_sb)) {
2593 struct reiserfs_transaction_handle *th = current->journal_info; 2597 struct reiserfs_transaction_handle *th = current->journal_info;
2594 /* this gets a little ugly. If reiserfs_get_block returned an 2598 /* this gets a little ugly. If reiserfs_get_block returned an
@@ -2623,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file,
2623 return ret; 2627 return ret;
2624} 2628}
2625 2629
2626int reiserfs_prepare_write(struct file *f, struct page *page, 2630int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
2627 unsigned from, unsigned to)
2628{ 2631{
2629 struct inode *inode = page->mapping->host; 2632 struct inode *inode = page->mapping->host;
2630 int ret; 2633 int ret;
@@ -2645,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
2645 th->t_refcount++; 2648 th->t_refcount++;
2646 } 2649 }
2647 2650
2648 ret = block_prepare_write(page, from, to, reiserfs_get_block); 2651 ret = __block_write_begin(page, from, len, reiserfs_get_block);
2649 if (ret && reiserfs_transaction_running(inode->i_sb)) { 2652 if (ret && reiserfs_transaction_running(inode->i_sb)) {
2650 struct reiserfs_transaction_handle *th = current->journal_info; 2653 struct reiserfs_transaction_handle *th = current->journal_info;
2651 /* this gets a little ugly. If reiserfs_get_block returned an 2654 /* this gets a little ugly. If reiserfs_get_block returned an
@@ -3059,10 +3062,25 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3059{ 3062{
3060 struct file *file = iocb->ki_filp; 3063 struct file *file = iocb->ki_filp;
3061 struct inode *inode = file->f_mapping->host; 3064 struct inode *inode = file->f_mapping->host;
3065 ssize_t ret;
3062 3066
3063 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3067 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3064 offset, nr_segs, 3068 offset, nr_segs,
3065 reiserfs_get_blocks_direct_io, NULL); 3069 reiserfs_get_blocks_direct_io, NULL);
3070
3071 /*
3072 * In case of error extending write may have instantiated a few
3073 * blocks outside i_size. Trim these off again.
3074 */
3075 if (unlikely((rw & WRITE) && ret < 0)) {
3076 loff_t isize = i_size_read(inode);
3077 loff_t end = offset + iov_length(iov, nr_segs);
3078
3079 if (end > isize)
3080 vmtruncate(inode, isize);
3081 }
3082
3083 return ret;
3066} 3084}
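
Why the new error path trims blocks (a worked illustration, not part of the patch): take i_size = 100 and a 4096-byte direct write at offset 100 that fails with -EIO after the block mapper has run. Blocks may already be mapped for the range [100, 4196) even though i_size never moved; truncating back to the unchanged i_size releases exactly those speculative allocations, and no reader ever saw them because the size was never raised. The guard in sketch form:

	/* sketched from the hunk above; only an extending write can leave
	 * blocks beyond i_size, hence the end > isize test */
	if (unlikely((rw & WRITE) && ret < 0)) {
		loff_t isize = i_size_read(inode);
		loff_t end = offset + iov_length(iov, nr_segs);

		if (end > isize)
			vmtruncate(inode, isize);
	}
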
3067 3085
3068int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) 3086int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -3072,6 +3090,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3072 int depth; 3090 int depth;
3073 int error; 3091 int error;
3074 3092
3093 error = inode_change_ok(inode, attr);
3094 if (error)
3095 return error;
3096
3075 /* must be turned off for recursive notify_change calls */ 3097 /* must be turned off for recursive notify_change calls */
3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3098 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3077 3099
@@ -3121,55 +3143,58 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3121 goto out; 3143 goto out;
3122 } 3144 }
3123 3145
3124 error = inode_change_ok(inode, attr); 3146 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3125 if (!error) { 3147 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3126 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3148 struct reiserfs_transaction_handle th;
3127 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3149 int jbegin_count =
3128 error = reiserfs_chown_xattrs(inode, attr); 3150 2 *
3151 (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
3152 REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
3153 2;
3129 3154
3130 if (!error) { 3155 error = reiserfs_chown_xattrs(inode, attr);
3131 struct reiserfs_transaction_handle th; 3156
3132 int jbegin_count = 3157 if (error)
3133 2 * 3158 return error;
3134 (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + 3159
3135 REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + 3160 /* (user+group)*(old+new) structure - we count quota info and inode write (sb, inode) */
3136 2; 3161 error = journal_begin(&th, inode->i_sb, jbegin_count);
3137 3162 if (error)
3138 /* (user+group)*(old+new) structure - we count quota info and inode write (sb, inode) */ 3163 goto out;
3139 error = 3164 error = dquot_transfer(inode, attr);
3140 journal_begin(&th, inode->i_sb, 3165 if (error) {
3141 jbegin_count); 3166 journal_end(&th, inode->i_sb, jbegin_count);
3142 if (error) 3167 goto out;
3143 goto out;
3144 error = dquot_transfer(inode, attr);
3145 if (error) {
3146 journal_end(&th, inode->i_sb,
3147 jbegin_count);
3148 goto out;
3149 }
3150 /* Update corresponding info in inode so that everything is in
3151 * one transaction */
3152 if (attr->ia_valid & ATTR_UID)
3153 inode->i_uid = attr->ia_uid;
3154 if (attr->ia_valid & ATTR_GID)
3155 inode->i_gid = attr->ia_gid;
3156 mark_inode_dirty(inode);
3157 error =
3158 journal_end(&th, inode->i_sb, jbegin_count);
3159 }
3160 }
3161 if (!error) {
3162 /*
3163 * Relax the lock here, as it might truncate the
3164 * inode pages and wait for inode pages locks.
3165 * To release such page lock, the owner needs the
3166 * reiserfs lock
3167 */
3168 reiserfs_write_unlock_once(inode->i_sb, depth);
3169 error = inode_setattr(inode, attr);
3170 depth = reiserfs_write_lock_once(inode->i_sb);
3171 } 3168 }
3169
3170 /* Update corresponding info in inode so that everything is in
3171 * one transaction */
3172 if (attr->ia_valid & ATTR_UID)
3173 inode->i_uid = attr->ia_uid;
3174 if (attr->ia_valid & ATTR_GID)
3175 inode->i_gid = attr->ia_gid;
3176 mark_inode_dirty(inode);
3177 error = journal_end(&th, inode->i_sb, jbegin_count);
3178 if (error)
3179 goto out;
3180 }
3181
3182 /*
3183 * Relax the lock here, as it might truncate the
3184 * inode pages and wait for inode pages locks.
3185 * To release such page lock, the owner needs the
3186 * reiserfs lock
3187 */
3188 reiserfs_write_unlock_once(inode->i_sb, depth);
3189 if ((attr->ia_valid & ATTR_SIZE) &&
3190 attr->ia_size != i_size_read(inode))
3191 error = vmtruncate(inode, attr->ia_size);
3192
3193 if (!error) {
3194 setattr_copy(inode, attr);
3195 mark_inode_dirty(inode);
3172 } 3196 }
3197 depth = reiserfs_write_lock_once(inode->i_sb);
3173 3198
3174 if (!error && reiserfs_posixacl(inode->i_sb)) { 3199 if (!error && reiserfs_posixacl(inode->i_sb)) {
3175 if (attr->ia_valid & ATTR_MODE) 3200 if (attr->ia_valid & ATTR_MODE)
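
The reshuffling above follows the ->setattr convention that replaced inode_setattr(): validate with inode_change_ok() before doing anything, handle ATTR_SIZE explicitly, then commit the remaining attributes with setattr_copy() plus mark_inode_dirty(). A minimal sketch of that shape for a simple filesystem (the myfs names are hypothetical; reiserfs wraps the same skeleton in journaling, quota transfer and its write lock):

	static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, attr);	/* validate up front */
		if (error)
			return error;

		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size != i_size_read(inode)) {
			error = vmtruncate(inode, attr->ia_size);
			if (error)
				return error;
		}

		setattr_copy(inode, attr);	/* uid/gid/mode/times into the inode */
		mark_inode_dirty(inode);
		return 0;
	}
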
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index f53505de071..adf22b485ce 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
160 160
161int reiserfs_commit_write(struct file *f, struct page *page, 161int reiserfs_commit_write(struct file *f, struct page *page,
162 unsigned from, unsigned to); 162 unsigned from, unsigned to);
163int reiserfs_prepare_write(struct file *f, struct page *page,
164 unsigned from, unsigned to);
165/* 163/*
166** reiserfs_unpack 164** reiserfs_unpack
167** Function try to convert tail from direct item into indirect. 165** Function try to convert tail from direct item into indirect.
@@ -170,6 +168,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
170int reiserfs_unpack(struct inode *inode, struct file *filp) 168int reiserfs_unpack(struct inode *inode, struct file *filp)
171{ 169{
172 int retval = 0; 170 int retval = 0;
171 int depth;
173 int index; 172 int index;
174 struct page *page; 173 struct page *page;
175 struct address_space *mapping; 174 struct address_space *mapping;
@@ -188,8 +187,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
188 /* we need to make sure nobody is changing the file size beneath 187 /* we need to make sure nobody is changing the file size beneath
189 ** us 188 ** us
190 */ 189 */
191 mutex_lock(&inode->i_mutex); 190 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
192 reiserfs_write_lock(inode->i_sb); 191 depth = reiserfs_write_lock_once(inode->i_sb);
193 192
194 write_from = inode->i_size & (blocksize - 1); 193 write_from = inode->i_size & (blocksize - 1);
195 /* if we are on a block boundary, we are already unpacked. */ 194 /* if we are on a block boundary, we are already unpacked. */
@@ -199,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
199 } 198 }
200 199
201 /* we unpack by finding the page with the tail, and calling 200 /* we unpack by finding the page with the tail, and calling
202 ** reiserfs_prepare_write on that page. This will force a 201 ** __reiserfs_write_begin on that page. This will force a
203 ** reiserfs_get_block to unpack the tail for us. 202 ** reiserfs_get_block to unpack the tail for us.
204 */ 203 */
205 index = inode->i_size >> PAGE_CACHE_SHIFT; 204 index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -209,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
209 if (!page) { 208 if (!page) {
210 goto out; 209 goto out;
211 } 210 }
212 retval = reiserfs_prepare_write(NULL, page, write_from, write_from); 211 retval = __reiserfs_write_begin(page, write_from, 0);
213 if (retval) 212 if (retval)
214 goto out_unlock; 213 goto out_unlock;
215 214
@@ -224,6 +223,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
224 223
225 out: 224 out:
226 mutex_unlock(&inode->i_mutex); 225 mutex_unlock(&inode->i_mutex);
227 reiserfs_write_unlock(inode->i_sb); 226 reiserfs_write_unlock_once(inode->i_sb, depth);
228 return retval; 227 return retval;
229} 228}
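
The locking change in reiserfs_unpack() is about ordering: mutex_lock(&inode->i_mutex) can sleep while the task holds the reiserfs write lock, inviting an inversion against paths that take the two in the other order. reiserfs_mutex_lock_safe() drops and retakes the write lock around the mutex, and the _once variants make the function correct whether or not a caller already holds the write lock: the depth token records whether this call actually acquired it, so the matching unlock releases it only at the outermost level. The idiom, sketched:

	int depth;

	reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
	depth = reiserfs_write_lock_once(inode->i_sb);

	/* ... tail conversion work ... */

	mutex_unlock(&inode->i_mutex);
	reiserfs_write_unlock_once(inode->i_sb, depth);
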
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 19fbc810e8e..076c8b19468 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -138,13 +138,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
138 return 0; 138 return 0;
139} 139}
140 140
141static void disable_barrier(struct super_block *s)
142{
143 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
144 printk("reiserfs: disabling flush barriers on %s\n",
145 reiserfs_bdevname(s));
146}
147
148static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block 141static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
149 *sb) 142 *sb)
150{ 143{
@@ -677,30 +670,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
677 submit_bh(WRITE, bh); 670 submit_bh(WRITE, bh);
678} 671}
679 672
680static int submit_barrier_buffer(struct buffer_head *bh)
681{
682 get_bh(bh);
683 bh->b_end_io = reiserfs_end_ordered_io;
684 clear_buffer_dirty(bh);
685 if (!buffer_uptodate(bh))
686 BUG();
687 return submit_bh(WRITE_BARRIER, bh);
688}
689
690static void check_barrier_completion(struct super_block *s,
691 struct buffer_head *bh)
692{
693 if (buffer_eopnotsupp(bh)) {
694 clear_buffer_eopnotsupp(bh);
695 disable_barrier(s);
696 set_buffer_uptodate(bh);
697 set_buffer_dirty(bh);
698 reiserfs_write_unlock(s);
699 sync_dirty_buffer(bh);
700 reiserfs_write_lock(s);
701 }
702}
703
704#define CHUNK_SIZE 32 673#define CHUNK_SIZE 32
705struct buffer_chunk { 674struct buffer_chunk {
706 struct buffer_head *bh[CHUNK_SIZE]; 675 struct buffer_head *bh[CHUNK_SIZE];
@@ -983,7 +952,6 @@ static int flush_older_commits(struct super_block *s,
983 952
984static int reiserfs_async_progress_wait(struct super_block *s) 953static int reiserfs_async_progress_wait(struct super_block *s)
985{ 954{
986 DEFINE_WAIT(wait);
987 struct reiserfs_journal *j = SB_JOURNAL(s); 955 struct reiserfs_journal *j = SB_JOURNAL(s);
988 956
989 if (atomic_read(&j->j_async_throttle)) { 957 if (atomic_read(&j->j_async_throttle)) {
@@ -1010,7 +978,6 @@ static int flush_commit_list(struct super_block *s,
1010 struct buffer_head *tbh = NULL; 978 struct buffer_head *tbh = NULL;
1011 unsigned int trans_id = jl->j_trans_id; 979 unsigned int trans_id = jl->j_trans_id;
1012 struct reiserfs_journal *journal = SB_JOURNAL(s); 980 struct reiserfs_journal *journal = SB_JOURNAL(s);
1013 int barrier = 0;
1014 int retval = 0; 981 int retval = 0;
1015 int write_len; 982 int write_len;
1016 983
@@ -1095,24 +1062,6 @@ static int flush_commit_list(struct super_block *s,
1095 } 1062 }
1096 atomic_dec(&journal->j_async_throttle); 1063 atomic_dec(&journal->j_async_throttle);
1097 1064
1098 /* We're skipping the commit if there's an error */
1099 if (retval || reiserfs_is_journal_aborted(journal))
1100 barrier = 0;
1101
1102 /* wait on everything written so far before writing the commit
1103 * if we are in barrier mode, send the commit down now
1104 */
1105 barrier = reiserfs_barrier_flush(s);
1106 if (barrier) {
1107 int ret;
1108 lock_buffer(jl->j_commit_bh);
1109 ret = submit_barrier_buffer(jl->j_commit_bh);
1110 if (ret == -EOPNOTSUPP) {
1111 set_buffer_uptodate(jl->j_commit_bh);
1112 disable_barrier(s);
1113 barrier = 0;
1114 }
1115 }
1116 for (i = 0; i < (jl->j_len + 1); i++) { 1065 for (i = 0; i < (jl->j_len + 1); i++) {
1117 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + 1066 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1118 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); 1067 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1144,27 +1093,22 @@ static int flush_commit_list(struct super_block *s,
1144 1093
1145 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); 1094 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
1146 1095
1147 if (!barrier) { 1096 /* If there was a write error in the journal - we can't commit
1148 /* If there was a write error in the journal - we can't commit 1097 * this transaction - it will be invalid and, if successful,
1149 * this transaction - it will be invalid and, if successful, 1098 * will just end up propagating the write error out to
1150 * will just end up propagating the write error out to 1099 * the file system. */
1151 * the file system. */ 1100 if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
1152 if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { 1101 if (buffer_dirty(jl->j_commit_bh))
1153 if (buffer_dirty(jl->j_commit_bh)) 1102 BUG();
1154 BUG(); 1103 mark_buffer_dirty(jl->j_commit_bh) ;
1155 mark_buffer_dirty(jl->j_commit_bh) ;
1156 reiserfs_write_unlock(s);
1157 sync_dirty_buffer(jl->j_commit_bh) ;
1158 reiserfs_write_lock(s);
1159 }
1160 } else {
1161 reiserfs_write_unlock(s); 1104 reiserfs_write_unlock(s);
1162 wait_on_buffer(jl->j_commit_bh); 1105 if (reiserfs_barrier_flush(s))
1106 __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
1107 else
1108 sync_dirty_buffer(jl->j_commit_bh);
1163 reiserfs_write_lock(s); 1109 reiserfs_write_lock(s);
1164 } 1110 }
1165 1111
1166 check_barrier_completion(s, jl->j_commit_bh);
1167
1168 /* If there was a write error in the journal - we can't commit this 1112 /* If there was a write error in the journal - we can't commit this
1169 * transaction - it will be invalid and, if successful, will just end 1113 * transaction - it will be invalid and, if successful, will just end
1170 * up propagating the write error out to the filesystem. */ 1114 * up propagating the write error out to the filesystem. */
@@ -1320,26 +1264,15 @@ static int _update_journal_header_block(struct super_block *sb,
1320 jh->j_first_unflushed_offset = cpu_to_le32(offset); 1264 jh->j_first_unflushed_offset = cpu_to_le32(offset);
1321 jh->j_mount_id = cpu_to_le32(journal->j_mount_id); 1265 jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
1322 1266
1323 if (reiserfs_barrier_flush(sb)) { 1267 set_buffer_dirty(journal->j_header_bh);
1324 int ret; 1268 reiserfs_write_unlock(sb);
1325 lock_buffer(journal->j_header_bh); 1269
1326 ret = submit_barrier_buffer(journal->j_header_bh); 1270 if (reiserfs_barrier_flush(sb))
1327 if (ret == -EOPNOTSUPP) { 1271 __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
1328 set_buffer_uptodate(journal->j_header_bh); 1272 else
1329 disable_barrier(sb);
1330 goto sync;
1331 }
1332 reiserfs_write_unlock(sb);
1333 wait_on_buffer(journal->j_header_bh);
1334 reiserfs_write_lock(sb);
1335 check_barrier_completion(sb, journal->j_header_bh);
1336 } else {
1337 sync:
1338 set_buffer_dirty(journal->j_header_bh);
1339 reiserfs_write_unlock(sb);
1340 sync_dirty_buffer(journal->j_header_bh); 1273 sync_dirty_buffer(journal->j_header_bh);
1341 reiserfs_write_lock(sb); 1274
1342 } 1275 reiserfs_write_lock(sb);
1343 if (!buffer_uptodate(journal->j_header_bh)) { 1276 if (!buffer_uptodate(journal->j_header_bh)) {
1344 reiserfs_warning(sb, "journal-837", 1277 reiserfs_warning(sb, "journal-837",
1345 "IO error during journal replay"); 1278 "IO error during journal replay");
@@ -2312,7 +2245,7 @@ static int journal_read_transaction(struct super_block *sb,
2312 /* flush out the real blocks */ 2245 /* flush out the real blocks */
2313 for (i = 0; i < get_desc_trans_len(desc); i++) { 2246 for (i = 0; i < get_desc_trans_len(desc); i++) {
2314 set_buffer_dirty(real_blocks[i]); 2247 set_buffer_dirty(real_blocks[i]);
2315 ll_rw_block(SWRITE, 1, real_blocks + i); 2248 write_dirty_buffer(real_blocks[i], WRITE);
2316 } 2249 }
2317 for (i = 0; i < get_desc_trans_len(desc); i++) { 2250 for (i = 0; i < get_desc_trans_len(desc); i++) {
2318 wait_on_buffer(real_blocks[i]); 2251 wait_on_buffer(real_blocks[i]);
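
The journal.c hunks are reiserfs's share of the kernel-wide barrier removal: instead of issuing WRITE_BARRIER and falling back when a device reports -EOPNOTSUPP (the deleted disable_barrier()/check_barrier_completion() machinery), the commit and header blocks are written with __sync_dirty_buffer(bh, WRITE_FLUSH_FUA), which requests a cache flush plus a forced-unit-access write and lets the block layer degrade on devices without those features. The ll_rw_block(SWRITE, ...) call goes the same way: write_dirty_buffer() locks the buffer and writes it only if still dirty. The surviving pattern, in sketch form:

	/* shape of the new commit-block path; error handling elided */
	mark_buffer_dirty(commit_bh);
	if (reiserfs_barrier_flush(sb))
		__sync_dirty_buffer(commit_bh, WRITE_FLUSH_FUA);
	else
		sync_dirty_buffer(commit_bh);	/* plain synchronous write */
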
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086..ba5f51ec345 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1156 inode->i_ctime = CURRENT_TIME_SEC; 1156 inode->i_ctime = CURRENT_TIME_SEC;
1157 reiserfs_update_sd(&th, inode); 1157 reiserfs_update_sd(&th, inode);
1158 1158
1159 atomic_inc(&inode->i_count); 1159 ihold(inode);
1160 d_instantiate(dentry, inode); 1160 d_instantiate(dentry, inode);
1161 retval = journal_end(&th, dir->i_sb, jbegin_count); 1161 retval = journal_end(&th, dir->i_sb, jbegin_count);
1162 reiserfs_write_unlock(dir->i_sb); 1162 reiserfs_write_unlock(dir->i_sb);
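
A one-line conversion, but worth a note: ihold() replaced open-coded atomic_inc(&inode->i_count) tree-wide when inode reference counting was reworked, and it documents intent (take an extra reference to an inode the caller already safely holds). The link() shape, sketched from the hunk above:

	inode->i_ctime = CURRENT_TIME_SEC;
	ihold(inode);			/* extra reference for the new link */
	d_instantiate(dentry, inode);	/* the new dentry owns that reference */
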
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 9822fa15118..3bf7a6457f4 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -525,6 +525,8 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
525 kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); 525 kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
526 if (!ei) 526 if (!ei)
527 return NULL; 527 return NULL;
528 atomic_set(&ei->openers, 0);
529 mutex_init(&ei->tailpack);
528 return &ei->vfs_inode; 530 return &ei->vfs_inode;
529} 531}
530 532
@@ -589,11 +591,6 @@ out:
589 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 591 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
590} 592}
591 593
592static void reiserfs_clear_inode(struct inode *inode)
593{
594 dquot_drop(inode);
595}
596
597#ifdef CONFIG_QUOTA 594#ifdef CONFIG_QUOTA
598static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, 595static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
599 size_t, loff_t); 596 size_t, loff_t);
@@ -606,8 +603,7 @@ static const struct super_operations reiserfs_sops = {
606 .destroy_inode = reiserfs_destroy_inode, 603 .destroy_inode = reiserfs_destroy_inode,
607 .write_inode = reiserfs_write_inode, 604 .write_inode = reiserfs_write_inode,
608 .dirty_inode = reiserfs_dirty_inode, 605 .dirty_inode = reiserfs_dirty_inode,
609 .clear_inode = reiserfs_clear_inode, 606 .evict_inode = reiserfs_evict_inode,
610 .delete_inode = reiserfs_delete_inode,
611 .put_super = reiserfs_put_super, 607 .put_super = reiserfs_put_super,
612 .write_super = reiserfs_write_super, 608 .write_super = reiserfs_write_super,
613 .sync_fs = reiserfs_sync_fs, 609 .sync_fs = reiserfs_sync_fs,
@@ -2217,12 +2213,11 @@ out:
2217 2213
2218#endif 2214#endif
2219 2215
2220static int get_super_block(struct file_system_type *fs_type, 2216static struct dentry *get_super_block(struct file_system_type *fs_type,
2221 int flags, const char *dev_name, 2217 int flags, const char *dev_name,
2222 void *data, struct vfsmount *mnt) 2218 void *data)
2223{ 2219{
2224 return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super, 2220 return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
2225 mnt);
2226} 2221}
2227 2222
2228static int __init init_reiserfs_fs(void) 2223static int __init init_reiserfs_fs(void)
@@ -2257,7 +2252,7 @@ static void __exit exit_reiserfs_fs(void)
2257struct file_system_type reiserfs_fs_type = { 2252struct file_system_type reiserfs_fs_type = {
2258 .owner = THIS_MODULE, 2253 .owner = THIS_MODULE,
2259 .name = "reiserfs", 2254 .name = "reiserfs",
2260 .get_sb = get_super_block, 2255 .mount = get_super_block,
2261 .kill_sb = reiserfs_kill_sb, 2256 .kill_sb = reiserfs_kill_sb,
2262 .fs_flags = FS_REQUIRES_DEV, 2257 .fs_flags = FS_REQUIRES_DEV,
2263}; 2258};
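
Two interface migrations land in super.c. First, the separate ->clear_inode/->delete_inode hooks are folded into ->evict_inode, so reiserfs_evict_inode is expected to cover the dquot_drop() that the deleted reiserfs_clear_inode() did. Second, ->get_sb becomes ->mount, returning the root dentry or an ERR_PTR instead of filling a vfsmount out-parameter. The new mount shape for a block-device filesystem, sketched with hypothetical myfs names:

	static struct dentry *myfs_mount(struct file_system_type *fs_type,
					 int flags, const char *dev_name,
					 void *data)
	{
		return mount_bdev(fs_type, flags, dev_name, data,
				  myfs_fill_super);
	}

	static struct file_system_type myfs_fs_type = {
		.owner    = THIS_MODULE,
		.name     = "myfs",
		.mount    = myfs_mount,
		.kill_sb  = kill_block_super,
		.fs_flags = FS_REQUIRES_DEV,
	};
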
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c67..5d04a7828e7 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len)
418 418
419int reiserfs_commit_write(struct file *f, struct page *page, 419int reiserfs_commit_write(struct file *f, struct page *page,
420 unsigned from, unsigned to); 420 unsigned from, unsigned to);
421int reiserfs_prepare_write(struct file *f, struct page *page,
422 unsigned from, unsigned to);
423 421
424static void update_ctime(struct inode *inode) 422static void update_ctime(struct inode *inode)
425{ 423{
426 struct timespec now = current_fs_time(inode->i_sb); 424 struct timespec now = current_fs_time(inode->i_sb);
427 if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink || 425 if (inode_unhashed(inode) || !inode->i_nlink ||
428 timespec_equal(&inode->i_ctime, &now)) 426 timespec_equal(&inode->i_ctime, &now))
429 return; 427 return;
430 428
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
532 rxh->h_hash = cpu_to_le32(xahash); 530 rxh->h_hash = cpu_to_le32(xahash);
533 } 531 }
534 532
535 err = reiserfs_prepare_write(NULL, page, page_offset, 533 err = __reiserfs_write_begin(page, page_offset, chunk + skip);
536 page_offset + chunk + skip);
537 if (!err) { 534 if (!err) {
538 if (buffer) 535 if (buffer)
539 memcpy(data + skip, buffer + buffer_pos, chunk); 536 memcpy(data + skip, buffer + buffer_pos, chunk);
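
The xattr writer picks up the same end-offset-to-length conversion as inode.c, shown side by side for clarity (illustration only):

	/* old: absolute end offset within the page */
	err = reiserfs_prepare_write(NULL, page, page_offset,
				     page_offset + chunk + skip);
	/* new: byte length measured from page_offset */
	err = __reiserfs_write_begin(page, page_offset, chunk + skip);
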
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 42d21354689..6647f90e55c 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -282,6 +282,7 @@ error:
282static const struct file_operations romfs_dir_operations = { 282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir, 283 .read = generic_read_dir,
284 .readdir = romfs_readdir, 284 .readdir = romfs_readdir,
285 .llseek = default_llseek,
285}; 286};
286 287
287static const struct inode_operations romfs_dir_inode_operations = { 288static const struct inode_operations romfs_dir_inode_operations = {
@@ -551,20 +552,19 @@ error_rsb:
551/* 552/*
552 * get a superblock for mounting 553 * get a superblock for mounting
553 */ 554 */
554static int romfs_get_sb(struct file_system_type *fs_type, 555static struct dentry *romfs_mount(struct file_system_type *fs_type,
555 int flags, const char *dev_name, 556 int flags, const char *dev_name,
556 void *data, struct vfsmount *mnt) 557 void *data)
557{ 558{
558 int ret = -EINVAL; 559 struct dentry *ret = ERR_PTR(-EINVAL);
559 560
560#ifdef CONFIG_ROMFS_ON_MTD 561#ifdef CONFIG_ROMFS_ON_MTD
561 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super, 562 ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
562 mnt);
563#endif 563#endif
564#ifdef CONFIG_ROMFS_ON_BLOCK 564#ifdef CONFIG_ROMFS_ON_BLOCK
565 if (ret == -EINVAL) 565 if (ret == ERR_PTR(-EINVAL))
566 ret = get_sb_bdev(fs_type, flags, dev_name, data, 566 ret = mount_bdev(fs_type, flags, dev_name, data,
567 romfs_fill_super, mnt); 567 romfs_fill_super);
568#endif 568#endif
569 return ret; 569 return ret;
570} 570}
@@ -591,7 +591,7 @@ static void romfs_kill_sb(struct super_block *sb)
591static struct file_system_type romfs_fs_type = { 591static struct file_system_type romfs_fs_type = {
592 .owner = THIS_MODULE, 592 .owner = THIS_MODULE,
593 .name = "romfs", 593 .name = "romfs",
594 .get_sb = romfs_get_sb, 594 .mount = romfs_mount,
595 .kill_sb = romfs_kill_sb, 595 .kill_sb = romfs_kill_sb,
596 .fs_flags = FS_REQUIRES_DEV, 596 .fs_flags = FS_REQUIRES_DEV,
597}; 597};
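
With romfs_mount() returning struct dentry *, the -EINVAL sentinel travels as an error pointer rather than a plain int. Comparing against ERR_PTR(-EINVAL) directly, as the code above does, works because ERR_PTR() is a constant cast; the more general idiom from linux/err.h, sketched:

	struct dentry *ret = ERR_PTR(-EINVAL);	/* errno encoded in a pointer */

	if (IS_ERR(ret) && PTR_ERR(ret) == -EINVAL)
		ret = mount_bdev(fs_type, flags, dev_name, data,
				 romfs_fill_super);
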
diff --git a/fs/select.c b/fs/select.c
index 500a669f779..b7b10aa3086 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
67 return slack; 67 return slack;
68} 68}
69 69
70static long estimate_accuracy(struct timespec *tv) 70long select_estimate_accuracy(struct timespec *tv)
71{ 71{
72 unsigned long ret; 72 unsigned long ret;
73 struct timespec now; 73 struct timespec now;
@@ -417,7 +417,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
417 } 417 }
418 418
419 if (end_time && !timed_out) 419 if (end_time && !timed_out)
420 slack = estimate_accuracy(end_time); 420 slack = select_estimate_accuracy(end_time);
421 421
422 retval = 0; 422 retval = 0;
423 for (;;) { 423 for (;;) {
@@ -769,7 +769,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
769 } 769 }
770 770
771 if (end_time && !timed_out) 771 if (end_time && !timed_out)
772 slack = estimate_accuracy(end_time); 772 slack = select_estimate_accuracy(end_time);
773 773
774 for (;;) { 774 for (;;) {
775 struct poll_list *walk; 775 struct poll_list *walk;
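
estimate_accuracy() gains external linkage as select_estimate_accuracy() so pollers outside fs/select.c (epoll's timeout path is the intended consumer) can compute the same timer slack. How the call sites above use it, sketched:

	/* end_time is the absolute deadline; the returned slack in ns
	 * tells the hrtimer how far it may coalesce the wakeup */
	long slack = 0;

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);
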
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e1f437be6c3..05d6b0e78c9 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -131,7 +131,7 @@ Eoverflow:
131 */ 131 */
132ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) 132ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
133{ 133{
134 struct seq_file *m = (struct seq_file *)file->private_data; 134 struct seq_file *m = file->private_data;
135 size_t copied = 0; 135 size_t copied = 0;
136 loff_t pos; 136 loff_t pos;
137 size_t n; 137 size_t n;
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(seq_read);
280 */ 280 */
281loff_t seq_lseek(struct file *file, loff_t offset, int origin) 281loff_t seq_lseek(struct file *file, loff_t offset, int origin)
282{ 282{
283 struct seq_file *m = (struct seq_file *)file->private_data; 283 struct seq_file *m = file->private_data;
284 loff_t retval = -EINVAL; 284 loff_t retval = -EINVAL;
285 285
286 mutex_lock(&m->lock); 286 mutex_lock(&m->lock);
@@ -324,7 +324,7 @@ EXPORT_SYMBOL(seq_lseek);
324 */ 324 */
325int seq_release(struct inode *inode, struct file *file) 325int seq_release(struct inode *inode, struct file *file)
326{ 326{
327 struct seq_file *m = (struct seq_file *)file->private_data; 327 struct seq_file *m = file->private_data;
328 kfree(m->buf); 328 kfree(m->buf);
329 kfree(m); 329 kfree(m);
330 return 0; 330 return 0;
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
462 if (size) { 462 if (size) {
463 char *p; 463 char *p;
464 464
465 spin_lock(&dcache_lock);
466 p = __d_path(path, root, buf, size); 465 p = __d_path(path, root, buf, size);
467 spin_unlock(&dcache_lock);
468 res = PTR_ERR(p); 466 res = PTR_ERR(p);
469 if (!IS_ERR(p)) { 467 if (!IS_ERR(p)) {
470 char *end = mangle_path(buf, p, esc); 468 char *end = mangle_path(buf, p, esc);
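
Two independent cleanups here. The cast removals are pure noise reduction: in C, unlike C++, void * converts implicitly to any object pointer type, so the explicit cast adds nothing, as below. The seq_path_root() hunk drops the dcache_lock round-trip because __d_path() now takes whatever locking it needs internally, part of the broader retirement of dcache_lock.

	struct seq_file *m;

	m = file->private_data;				/* fine: void * converts */
	m = (struct seq_file *)file->private_data;	/* legal but redundant */
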
diff --git a/fs/signalfd.c b/fs/signalfd.c
index f329849ce3c..492465b451d 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -88,6 +88,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
88 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); 88 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
89 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); 89 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
90 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); 90 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
91 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
91 break; 92 break;
92 case __SI_POLL: 93 case __SI_POLL:
93 err |= __put_user(kinfo->si_band, &uinfo->ssi_band); 94 err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
@@ -98,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
98#ifdef __ARCH_SI_TRAPNO 99#ifdef __ARCH_SI_TRAPNO
99 err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); 100 err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
100#endif 101#endif
102#ifdef BUS_MCEERR_AO
103 /*
104 * Other callers might not initialize the si_lsb field,
105 * so check explicitly for the right codes here.
106 */
107 if (kinfo->si_code == BUS_MCEERR_AR ||
108 kinfo->si_code == BUS_MCEERR_AO)
109 err |= __put_user((short) kinfo->si_addr_lsb,
110 &uinfo->ssi_addr_lsb);
111#endif
101 break; 112 break;
102 case __SI_CHLD: 113 case __SI_CHLD:
103 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); 114 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
@@ -111,6 +122,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
111 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); 122 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
112 err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); 123 err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
113 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); 124 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
125 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
114 break; 126 break;
115 default: 127 default:
116 /* 128 /*
@@ -204,6 +216,7 @@ static const struct file_operations signalfd_fops = {
204 .release = signalfd_release, 216 .release = signalfd_release,
205 .poll = signalfd_poll, 217 .poll = signalfd_poll,
206 .read = signalfd_read, 218 .read = signalfd_read,
219 .llseek = noop_llseek,
207}; 220};
208 221
209SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, 222SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
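
The signalfd additions forward two more pieces of siginfo to userspace: ssi_int carries the sigqueue() payload for timer and realtime/queued signals, and ssi_addr_lsb reports the address granularity of BUS_MCEERR_AR/AO memory-failure signals (gated on si_code because other SIGBUS senders leave si_lsb uninitialized). A hypothetical userspace reader showing where ssi_int lands, assuming glibc's sys/signalfd.h and with error handling trimmed:

	#include <sys/signalfd.h>
	#include <unistd.h>

	/* after sigqueue(pid, sig, (union sigval){ .sival_int = 42 }),
	 * the record read from the signalfd carries 42 in ssi_int */
	static int read_sigqueue_payload(int sfd)
	{
		struct signalfd_siginfo ssi;

		if (read(sfd, &ssi, sizeof(ssi)) != (ssize_t)sizeof(ssi))
			return -1;
		return (int)ssi.ssi_int;
	}
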
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
deleted file mode 100644
index e668127c8b2..00000000000
--- a/fs/smbfs/Kconfig
+++ /dev/null
@@ -1,55 +0,0 @@
1config SMB_FS
2 tristate "SMB file system support (OBSOLETE, please use CIFS)"
3 depends on INET
4 select NLS
5 help
6 SMB (Server Message Block) is the protocol Windows for Workgroups
7 (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
8 files and printers over local networks. Saying Y here allows you to
9 mount their file systems (often called "shares" in this context) and
10 access them just like any other Unix directory. Currently, this
11 works only if the Windows machines use TCP/IP as the underlying
12 transport protocol, and not NetBEUI. For details, read
13 <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
14 available from <http://www.tldp.org/docs.html#howto>.
15
16 Note: if you just want your box to act as an SMB *server* and make
17 files and printing services available to Windows clients (which need
18 to have a TCP/IP stack), you don't need to say Y here; you can use
19 the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
20 for that.
21
22 General information about how to connect Linux, Windows machines and
23 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
24
25 To compile the SMB support as a module, choose M here:
26 the module will be called smbfs. Most people say N, however.
27
28config SMB_NLS_DEFAULT
29 bool "Use a default NLS"
30 depends on SMB_FS
31 help
32 Enabling this will make smbfs use nls translations by default. You
33 need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
34 settings and you need to give the default nls for the SMB server as
35 CONFIG_SMB_NLS_REMOTE.
36
37 The nls settings can be changed at mount time, if your smbmount
38 supports that, using the codepage and iocharset parameters.
39
40 smbmount from samba 2.2.0 or later supports this.
41
42config SMB_NLS_REMOTE
43 string "Default Remote NLS Option"
44 depends on SMB_NLS_DEFAULT
45 default "cp437"
46 help
47 This setting allows you to specify a default value for which
48 codepage the server uses. If this field is left blank no
49 translations will be done by default. The local codepage/charset
50 default to CONFIG_NLS_DEFAULT.
51
52 The nls settings can be changed at mount time, if your smbmount
53 supports that, using the codepage and iocharset parameters.
54
55 smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
deleted file mode 100644
index 4faf8c4722c..00000000000
--- a/fs/smbfs/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
1#
2# Makefile for the linux smb-filesystem routines.
3#
4
5obj-$(CONFIG_SMB_FS) += smbfs.o
6
7smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
8 symlink.o smbiod.o request.o
9
10# If you want debugging output, you may add these flags to the EXTRA_CFLAGS
11# SMBFS_PARANOIA should normally be enabled.
12
13EXTRA_CFLAGS += -DSMBFS_PARANOIA
14#EXTRA_CFLAGS += -DSMBFS_DEBUG
15#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
16#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
17#EXTRA_CFLAGS += -Werror
18
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
deleted file mode 100644
index 8c177eb7e34..00000000000
--- a/fs/smbfs/cache.c
+++ /dev/null
@@ -1,208 +0,0 @@
1/*
2 * cache.c
3 *
4 * Copyright (C) 1997 by Bill Hawes
5 *
6 * Routines to support directory cacheing using the page cache.
7 * This cache code is almost directly taken from ncpfs.
8 *
9 * Please add a note about your changes to smbfs in the ChangeLog file.
10 */
11
12#include <linux/time.h>
13#include <linux/errno.h>
14#include <linux/kernel.h>
15#include <linux/mm.h>
16#include <linux/smb_fs.h>
17#include <linux/pagemap.h>
18#include <linux/net.h>
19
20#include <asm/page.h>
21
22#include "smb_debug.h"
23#include "proto.h"
24
25/*
26 * Force the next attempt to use the cache to be a timeout.
27 * If we can't find the page that's fine, it will cause a refresh.
28 */
29void
30smb_invalid_dir_cache(struct inode * dir)
31{
32 struct smb_sb_info *server = server_from_inode(dir);
33 union smb_dir_cache *cache = NULL;
34 struct page *page = NULL;
35
36 page = grab_cache_page(&dir->i_data, 0);
37 if (!page)
38 goto out;
39
40 if (!PageUptodate(page))
41 goto out_unlock;
42
43 cache = kmap(page);
44 cache->head.time = jiffies - SMB_MAX_AGE(server);
45
46 kunmap(page);
47 SetPageUptodate(page);
48out_unlock:
49 unlock_page(page);
50 page_cache_release(page);
51out:
52 return;
53}
54
55/*
56 * Mark all dentries for 'parent' as invalid, forcing them to be re-read
57 */
58void
59smb_invalidate_dircache_entries(struct dentry *parent)
60{
61 struct smb_sb_info *server = server_from_dentry(parent);
62 struct list_head *next;
63 struct dentry *dentry;
64
65 spin_lock(&dcache_lock);
66 next = parent->d_subdirs.next;
67 while (next != &parent->d_subdirs) {
68 dentry = list_entry(next, struct dentry, d_u.d_child);
69 dentry->d_fsdata = NULL;
70 smb_age_dentry(server, dentry);
71 next = next->next;
72 }
73 spin_unlock(&dcache_lock);
74}
75
76/*
77 * dget, but require that fpos and parent matches what the dentry contains.
78 * dentry is not known to be a valid pointer at entry.
79 */
80struct dentry *
81smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
82{
83 struct dentry *dent = dentry;
84 struct list_head *next;
85
86 if (d_validate(dent, parent)) {
87 if (dent->d_name.len <= SMB_MAXNAMELEN &&
88 (unsigned long)dent->d_fsdata == fpos) {
89 if (!dent->d_inode) {
90 dput(dent);
91 dent = NULL;
92 }
93 return dent;
94 }
95 dput(dent);
96 }
97
98 /* If a pointer is invalid, we search the dentry. */
99 spin_lock(&dcache_lock);
100 next = parent->d_subdirs.next;
101 while (next != &parent->d_subdirs) {
102 dent = list_entry(next, struct dentry, d_u.d_child);
103 if ((unsigned long)dent->d_fsdata == fpos) {
104 if (dent->d_inode)
105 dget_locked(dent);
106 else
107 dent = NULL;
108 goto out_unlock;
109 }
110 next = next->next;
111 }
112 dent = NULL;
113out_unlock:
114 spin_unlock(&dcache_lock);
115 return dent;
116}
117
118
119/*
120 * Create dentry/inode for this file and add it to the dircache.
121 */
122int
123smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
124 struct smb_cache_control *ctrl, struct qstr *qname,
125 struct smb_fattr *entry)
126{
127 struct dentry *newdent, *dentry = filp->f_path.dentry;
128 struct inode *newino, *inode = dentry->d_inode;
129 struct smb_cache_control ctl = *ctrl;
130 int valid = 0;
131 int hashed = 0;
132 ino_t ino = 0;
133
134 qname->hash = full_name_hash(qname->name, qname->len);
135
136 if (dentry->d_op && dentry->d_op->d_hash)
137 if (dentry->d_op->d_hash(dentry, qname) != 0)
138 goto end_advance;
139
140 newdent = d_lookup(dentry, qname);
141
142 if (!newdent) {
143 newdent = d_alloc(dentry, qname);
144 if (!newdent)
145 goto end_advance;
146 } else {
147 hashed = 1;
148 memcpy((char *) newdent->d_name.name, qname->name,
149 newdent->d_name.len);
150 }
151
152 if (!newdent->d_inode) {
153 smb_renew_times(newdent);
154 entry->f_ino = iunique(inode->i_sb, 2);
155 newino = smb_iget(inode->i_sb, entry);
156 if (newino) {
157 smb_new_dentry(newdent);
158 d_instantiate(newdent, newino);
159 if (!hashed)
160 d_rehash(newdent);
161 }
162 } else
163 smb_set_inode_attr(newdent->d_inode, entry);
164
165 if (newdent->d_inode) {
166 ino = newdent->d_inode->i_ino;
167 newdent->d_fsdata = (void *) ctl.fpos;
168 smb_new_dentry(newdent);
169 }
170
171 if (ctl.idx >= SMB_DIRCACHE_SIZE) {
172 if (ctl.page) {
173 kunmap(ctl.page);
174 SetPageUptodate(ctl.page);
175 unlock_page(ctl.page);
176 page_cache_release(ctl.page);
177 }
178 ctl.cache = NULL;
179 ctl.idx -= SMB_DIRCACHE_SIZE;
180 ctl.ofs += 1;
181 ctl.page = grab_cache_page(&inode->i_data, ctl.ofs);
182 if (ctl.page)
183 ctl.cache = kmap(ctl.page);
184 }
185 if (ctl.cache) {
186 ctl.cache->dentry[ctl.idx] = newdent;
187 valid = 1;
188 }
189 dput(newdent);
190
191end_advance:
192 if (!valid)
193 ctl.valid = 0;
194 if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
195 if (!ino)
196 ino = find_inode_number(dentry, qname);
197 if (!ino)
198 ino = iunique(inode->i_sb, 2);
199 ctl.filled = filldir(dirent, qname->name, qname->len,
200 filp->f_pos, ino, DT_UNKNOWN);
201 if (!ctl.filled)
202 filp->f_pos += 1;
203 }
204 ctl.fpos += 1;
205 ctl.idx += 1;
206 *ctrl = ctl;
207 return (ctl.valid || !ctl.filled);
208}
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
deleted file mode 100644
index 00a70cab1f3..00000000000
--- a/fs/smbfs/dir.c
+++ /dev/null
@@ -1,702 +0,0 @@
1/*
2 * dir.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/time.h>
11#include <linux/errno.h>
12#include <linux/kernel.h>
13#include <linux/smp_lock.h>
14#include <linux/ctype.h>
15#include <linux/net.h>
16#include <linux/sched.h>
17
18#include <linux/smb_fs.h>
19#include <linux/smb_mount.h>
20#include <linux/smbno.h>
21
22#include "smb_debug.h"
23#include "proto.h"
24
25static int smb_readdir(struct file *, void *, filldir_t);
26static int smb_dir_open(struct inode *, struct file *);
27
28static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *);
29static int smb_create(struct inode *, struct dentry *, int, struct nameidata *);
30static int smb_mkdir(struct inode *, struct dentry *, int);
31static int smb_rmdir(struct inode *, struct dentry *);
32static int smb_unlink(struct inode *, struct dentry *);
33static int smb_rename(struct inode *, struct dentry *,
34 struct inode *, struct dentry *);
35static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
36static int smb_link(struct dentry *, struct inode *, struct dentry *);
37
38const struct file_operations smb_dir_operations =
39{
40 .llseek = generic_file_llseek,
41 .read = generic_read_dir,
42 .readdir = smb_readdir,
43 .unlocked_ioctl = smb_ioctl,
44 .open = smb_dir_open,
45};
46
47const struct inode_operations smb_dir_inode_operations =
48{
49 .create = smb_create,
50 .lookup = smb_lookup,
51 .unlink = smb_unlink,
52 .mkdir = smb_mkdir,
53 .rmdir = smb_rmdir,
54 .rename = smb_rename,
55 .getattr = smb_getattr,
56 .setattr = smb_notify_change,
57};
58
59const struct inode_operations smb_dir_inode_operations_unix =
60{
61 .create = smb_create,
62 .lookup = smb_lookup,
63 .unlink = smb_unlink,
64 .mkdir = smb_mkdir,
65 .rmdir = smb_rmdir,
66 .rename = smb_rename,
67 .getattr = smb_getattr,
68 .setattr = smb_notify_change,
69 .symlink = smb_symlink,
70 .mknod = smb_make_node,
71 .link = smb_link,
72};
73
74/*
75 * Read a directory, using filldir to fill the dirent memory.
76 * smb_proc_readdir does the actual reading from the smb server.
77 *
78 * The cache code is almost directly taken from ncpfs
79 */
80static int
81smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
82{
83 struct dentry *dentry = filp->f_path.dentry;
84 struct inode *dir = dentry->d_inode;
85 struct smb_sb_info *server = server_from_dentry(dentry);
86 union smb_dir_cache *cache = NULL;
87 struct smb_cache_control ctl;
88 struct page *page = NULL;
89 int result;
90
91 ctl.page = NULL;
92 ctl.cache = NULL;
93
94 VERBOSE("reading %s/%s, f_pos=%d\n",
95 DENTRY_PATH(dentry), (int) filp->f_pos);
96
97 result = 0;
98
99 lock_kernel();
100
101 switch ((unsigned int) filp->f_pos) {
102 case 0:
103 if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
104 goto out;
105 filp->f_pos = 1;
106 /* fallthrough */
107 case 1:
108 if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
109 goto out;
110 filp->f_pos = 2;
111 }
112
113 /*
114 * Make sure our inode is up-to-date.
115 */
116 result = smb_revalidate_inode(dentry);
117 if (result)
118 goto out;
119
120
121 page = grab_cache_page(&dir->i_data, 0);
122 if (!page)
123 goto read_really;
124
125 ctl.cache = cache = kmap(page);
126 ctl.head = cache->head;
127
128 if (!PageUptodate(page) || !ctl.head.eof) {
129 VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
130 DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof);
131 goto init_cache;
132 }
133
134 if (filp->f_pos == 2) {
135 if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
136 goto init_cache;
137
138 /*
139 * N.B. ncpfs checks mtime of dentry too here, we don't.
140 * 1. common smb servers do not update mtime on dir changes
141 * 2. it requires an extra smb request
142 * (revalidate has the same timeout as ctl.head.time)
143 *
144 * Instead smbfs invalidates its own cache on local changes
145 * and remote changes are not seen until timeout.
146 */
147 }
148
149 if (filp->f_pos > ctl.head.end)
150 goto finished;
151
152 ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
153 ctl.ofs = ctl.fpos / SMB_DIRCACHE_SIZE;
154 ctl.idx = ctl.fpos % SMB_DIRCACHE_SIZE;
155
156 for (;;) {
157 if (ctl.ofs != 0) {
158 ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
159 if (!ctl.page)
160 goto invalid_cache;
161 ctl.cache = kmap(ctl.page);
162 if (!PageUptodate(ctl.page))
163 goto invalid_cache;
164 }
165 while (ctl.idx < SMB_DIRCACHE_SIZE) {
166 struct dentry *dent;
167 int res;
168
169 dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
170 dentry, filp->f_pos);
171 if (!dent)
172 goto invalid_cache;
173
174 res = filldir(dirent, dent->d_name.name,
175 dent->d_name.len, filp->f_pos,
176 dent->d_inode->i_ino, DT_UNKNOWN);
177 dput(dent);
178 if (res)
179 goto finished;
180 filp->f_pos += 1;
181 ctl.idx += 1;
182 if (filp->f_pos > ctl.head.end)
183 goto finished;
184 }
185 if (ctl.page) {
186 kunmap(ctl.page);
187 SetPageUptodate(ctl.page);
188 unlock_page(ctl.page);
189 page_cache_release(ctl.page);
190 ctl.page = NULL;
191 }
192 ctl.idx = 0;
193 ctl.ofs += 1;
194 }
195invalid_cache:
196 if (ctl.page) {
197 kunmap(ctl.page);
198 unlock_page(ctl.page);
199 page_cache_release(ctl.page);
200 ctl.page = NULL;
201 }
202 ctl.cache = cache;
203init_cache:
204 smb_invalidate_dircache_entries(dentry);
205 ctl.head.time = jiffies;
206 ctl.head.eof = 0;
207 ctl.fpos = 2;
208 ctl.ofs = 0;
209 ctl.idx = SMB_DIRCACHE_START;
210 ctl.filled = 0;
211 ctl.valid = 1;
212read_really:
213 result = server->ops->readdir(filp, dirent, filldir, &ctl);
214 if (result == -ERESTARTSYS && page)
215 ClearPageUptodate(page);
216 if (ctl.idx == -1)
217 goto invalid_cache; /* retry */
218 ctl.head.end = ctl.fpos - 1;
219 ctl.head.eof = ctl.valid;
220finished:
221 if (page) {
222 cache->head = ctl.head;
223 kunmap(page);
224 if (result != -ERESTARTSYS)
225 SetPageUptodate(page);
226 unlock_page(page);
227 page_cache_release(page);
228 }
229 if (ctl.page) {
230 kunmap(ctl.page);
231 SetPageUptodate(ctl.page);
232 unlock_page(ctl.page);
233 page_cache_release(ctl.page);
234 }
235out:
236 unlock_kernel();
237 return result;
238}
239
240static int
241smb_dir_open(struct inode *dir, struct file *file)
242{
243 struct dentry *dentry = file->f_path.dentry;
244 struct smb_sb_info *server;
245 int error = 0;
246
247 VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name,
248 file->f_path.dentry->d_name.name);
249
250 /*
251 * Directory timestamps in the core protocol aren't updated
252 * when a file is added, so we give them a very short TTL.
253 */
254 lock_kernel();
255 server = server_from_dentry(dentry);
256 if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) {
257 unsigned long age = jiffies - SMB_I(dir)->oldmtime;
258 if (age > 2*HZ)
259 smb_invalid_dir_cache(dir);
260 }
261
262 /*
263 * Note: in order to allow the smbmount process to open the
264 * mount point, we only revalidate if the connection is valid or
265 * if the process is trying to access something other than the root.
266 */
267 if (server->state == CONN_VALID || !IS_ROOT(dentry))
268 error = smb_revalidate_inode(dentry);
269 unlock_kernel();
270 return error;
271}
272
273/*
274 * Dentry operations routines
275 */
276static int smb_lookup_validate(struct dentry *, struct nameidata *);
277static int smb_hash_dentry(struct dentry *, struct qstr *);
278static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
279static int smb_delete_dentry(struct dentry *);
280
281static const struct dentry_operations smbfs_dentry_operations =
282{
283 .d_revalidate = smb_lookup_validate,
284 .d_hash = smb_hash_dentry,
285 .d_compare = smb_compare_dentry,
286 .d_delete = smb_delete_dentry,
287};
288
289static const struct dentry_operations smbfs_dentry_operations_case =
290{
291 .d_revalidate = smb_lookup_validate,
292 .d_delete = smb_delete_dentry,
293};
294
295
296/*
297 * This is the callback when the dcache has a lookup hit.
298 */
299static int
300smb_lookup_validate(struct dentry * dentry, struct nameidata *nd)
301{
302 struct smb_sb_info *server = server_from_dentry(dentry);
303 struct inode * inode = dentry->d_inode;
304 unsigned long age = jiffies - dentry->d_time;
305 int valid;
306
307 /*
308 * The default validation is based on dentry age:
309 * we believe in dentries for a few seconds. (But each
310 * successful server lookup renews the timestamp.)
311 */
312 valid = (age <= SMB_MAX_AGE(server));
313#ifdef SMBFS_DEBUG_VERBOSE
314 if (!valid)
315 VERBOSE("%s/%s not valid, age=%lu\n",
316 DENTRY_PATH(dentry), age);
317#endif
318
319 if (inode) {
320 lock_kernel();
321 if (is_bad_inode(inode)) {
322 PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry));
323 valid = 0;
324 } else if (!valid)
325 valid = (smb_revalidate_inode(dentry) == 0);
326 unlock_kernel();
327 } else {
328 /*
329 * What should we do for negative dentries?
330 */
331 }
332 return valid;
333}
334
335static int
336smb_hash_dentry(struct dentry *dir, struct qstr *this)
337{
338 unsigned long hash;
339 int i;
340
341 hash = init_name_hash();
342 for (i=0; i < this->len ; i++)
343 hash = partial_name_hash(tolower(this->name[i]), hash);
344 this->hash = end_name_hash(hash);
345
346 return 0;
347}
348
349static int
350smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b)
351{
352 int i, result = 1;
353
354 if (a->len != b->len)
355 goto out;
356 for (i=0; i < a->len; i++) {
357 if (tolower(a->name[i]) != tolower(b->name[i]))
358 goto out;
359 }
360 result = 0;
361out:
362 return result;
363}
364
365/*
366 * This is the callback from dput() when d_count is going to 0.
367 * We use this to unhash dentries with bad inodes.
368 */
369static int
370smb_delete_dentry(struct dentry * dentry)
371{
372 if (dentry->d_inode) {
373 if (is_bad_inode(dentry->d_inode)) {
374 PARANOIA("bad inode, unhashing %s/%s\n",
375 DENTRY_PATH(dentry));
376 return 1;
377 }
378 } else {
379 /* N.B. Unhash negative dentries? */
380 }
381 return 0;
382}
383
384/*
385 * Initialize a new dentry
386 */
387void
388smb_new_dentry(struct dentry *dentry)
389{
390 struct smb_sb_info *server = server_from_dentry(dentry);
391
392 if (server->mnt->flags & SMB_MOUNT_CASE)
393 dentry->d_op = &smbfs_dentry_operations_case;
394 else
395 dentry->d_op = &smbfs_dentry_operations;
396 dentry->d_time = jiffies;
397}
398
399
400/*
401 * Whenever a lookup succeeds, we know the parent directories
402 * are all valid, so we want to update the dentry timestamps.
403 * N.B. Move this to dcache?
404 */
405void
406smb_renew_times(struct dentry * dentry)
407{
408 dget(dentry);
409 spin_lock(&dentry->d_lock);
410 for (;;) {
411 struct dentry *parent;
412
413 dentry->d_time = jiffies;
414 if (IS_ROOT(dentry))
415 break;
416 parent = dentry->d_parent;
417 dget(parent);
418 spin_unlock(&dentry->d_lock);
419 dput(dentry);
420 dentry = parent;
421 spin_lock(&dentry->d_lock);
422 }
423 spin_unlock(&dentry->d_lock);
424 dput(dentry);
425}
426
427static struct dentry *
428smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
429{
430 struct smb_fattr finfo;
431 struct inode *inode;
432 int error;
433 struct smb_sb_info *server;
434
435 error = -ENAMETOOLONG;
436 if (dentry->d_name.len > SMB_MAXNAMELEN)
437 goto out;
438
439 /* Do not allow lookup of names with backslashes in */
440 error = -EINVAL;
441 if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
442 goto out;
443
444 lock_kernel();
445 error = smb_proc_getattr(dentry, &finfo);
446#ifdef SMBFS_PARANOIA
447 if (error && error != -ENOENT)
448 PARANOIA("find %s/%s failed, error=%d\n",
449 DENTRY_PATH(dentry), error);
450#endif
451
452 inode = NULL;
453 if (error == -ENOENT)
454 goto add_entry;
455 if (!error) {
456 error = -EACCES;
457 finfo.f_ino = iunique(dentry->d_sb, 2);
458 inode = smb_iget(dir->i_sb, &finfo);
459 if (inode) {
460 add_entry:
461 server = server_from_dentry(dentry);
462 if (server->mnt->flags & SMB_MOUNT_CASE)
463 dentry->d_op = &smbfs_dentry_operations_case;
464 else
465 dentry->d_op = &smbfs_dentry_operations;
466
467 d_add(dentry, inode);
468 smb_renew_times(dentry);
469 error = 0;
470 }
471 }
472 unlock_kernel();
473out:
474 return ERR_PTR(error);
475}
476
477/*
478 * This code is common to all routines creating a new inode.
479 */
480static int
481smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id)
482{
483 struct smb_sb_info *server = server_from_dentry(dentry);
484 struct inode *inode;
485 int error;
486 struct smb_fattr fattr;
487
488 VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid);
489
490 error = smb_proc_getattr(dentry, &fattr);
491 if (error)
492 goto out_close;
493
494 smb_renew_times(dentry);
495 fattr.f_ino = iunique(dentry->d_sb, 2);
496 inode = smb_iget(dentry->d_sb, &fattr);
497 if (!inode)
498 goto out_no_inode;
499
500 if (have_id) {
501 struct smb_inode_info *ei = SMB_I(inode);
502 ei->fileid = fileid;
503 ei->access = SMB_O_RDWR;
504 ei->open = server->generation;
505 }
506 d_instantiate(dentry, inode);
507out:
508 return error;
509
510out_no_inode:
511 error = -EACCES;
512out_close:
513 if (have_id) {
514 PARANOIA("%s/%s failed, error=%d, closing %u\n",
515 DENTRY_PATH(dentry), error, fileid);
516 smb_close_fileid(dentry, fileid);
517 }
518 goto out;
519}
520
521/* N.B. How should the mode argument be used? */
522static int
523smb_create(struct inode *dir, struct dentry *dentry, int mode,
524 struct nameidata *nd)
525{
526 struct smb_sb_info *server = server_from_dentry(dentry);
527 __u16 fileid;
528 int error;
529 struct iattr attr;
530
531 VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode);
532
533 lock_kernel();
534 smb_invalid_dir_cache(dir);
535 error = smb_proc_create(dentry, 0, get_seconds(), &fileid);
536 if (!error) {
537 if (server->opt.capabilities & SMB_CAP_UNIX) {
538 /* Set attributes for new file */
539 attr.ia_valid = ATTR_MODE;
540 attr.ia_mode = mode;
541 error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
542 }
543 error = smb_instantiate(dentry, fileid, 1);
544 } else {
545 PARANOIA("%s/%s failed, error=%d\n",
546 DENTRY_PATH(dentry), error);
547 }
548 unlock_kernel();
549 return error;
550}
551
552/* N.B. How should the mode argument be used? */
553static int
554smb_mkdir(struct inode *dir, struct dentry *dentry, int mode)
555{
556 struct smb_sb_info *server = server_from_dentry(dentry);
557 int error;
558 struct iattr attr;
559
560 lock_kernel();
561 smb_invalid_dir_cache(dir);
562 error = smb_proc_mkdir(dentry);
563 if (!error) {
564 if (server->opt.capabilities & SMB_CAP_UNIX) {
565 /* Set attributes for new directory */
566 attr.ia_valid = ATTR_MODE;
567 attr.ia_mode = mode;
568 error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
569 }
570 error = smb_instantiate(dentry, 0, 0);
571 }
572 unlock_kernel();
573 return error;
574}
575
576static int
577smb_rmdir(struct inode *dir, struct dentry *dentry)
578{
579 struct inode *inode = dentry->d_inode;
580 int error;
581
582 /*
583 * Close the directory if it's open.
584 */
585 lock_kernel();
586 smb_close(inode);
587
588 /*
589 * Check that nobody else is using the directory..
590 */
591 error = -EBUSY;
592 if (!d_unhashed(dentry))
593 goto out;
594
595 smb_invalid_dir_cache(dir);
596 error = smb_proc_rmdir(dentry);
597
598out:
599 unlock_kernel();
600 return error;
601}
602
603static int
604smb_unlink(struct inode *dir, struct dentry *dentry)
605{
606 int error;
607
608 /*
609 * Close the file if it's open.
610 */
611 lock_kernel();
612 smb_close(dentry->d_inode);
613
614 smb_invalid_dir_cache(dir);
615 error = smb_proc_unlink(dentry);
616 if (!error)
617 smb_renew_times(dentry);
618 unlock_kernel();
619 return error;
620}
621
622static int
623smb_rename(struct inode *old_dir, struct dentry *old_dentry,
624 struct inode *new_dir, struct dentry *new_dentry)
625{
626 int error;
627
628 /*
629 * Close any open files, and check whether to delete the
630 * target before attempting the rename.
631 */
632 lock_kernel();
633 if (old_dentry->d_inode)
634 smb_close(old_dentry->d_inode);
635 if (new_dentry->d_inode) {
636 smb_close(new_dentry->d_inode);
637 error = smb_proc_unlink(new_dentry);
638 if (error) {
639 VERBOSE("unlink %s/%s, error=%d\n",
640 DENTRY_PATH(new_dentry), error);
641 goto out;
642 }
643 /* FIXME */
644 d_delete(new_dentry);
645 }
646
647 smb_invalid_dir_cache(old_dir);
648 smb_invalid_dir_cache(new_dir);
649 error = smb_proc_mv(old_dentry, new_dentry);
650 if (!error) {
651 smb_renew_times(old_dentry);
652 smb_renew_times(new_dentry);
653 }
654out:
655 unlock_kernel();
656 return error;
657}
658
659/*
660 * FIXME: samba servers won't let you create device nodes unless uid/gid
661 * matches the connection credentials (and we don't know which those are ...)
662 */
663static int
664smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
665{
666 int error;
667 struct iattr attr;
668
669 attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
670 attr.ia_mode = mode;
671 current_euid_egid(&attr.ia_uid, &attr.ia_gid);
672
673 if (!new_valid_dev(dev))
674 return -EINVAL;
675
676 smb_invalid_dir_cache(dir);
677 error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev));
678 if (!error) {
679 error = smb_instantiate(dentry, 0, 0);
680 }
681 return error;
682}
683
684/*
685 * dentry = existing file
686 * new_dentry = new file
687 */
688static int
689smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry)
690{
691 int error;
692
693 DEBUG1("smb_link old=%s/%s new=%s/%s\n",
694 DENTRY_PATH(dentry), DENTRY_PATH(new_dentry));
695 smb_invalid_dir_cache(dir);
696 error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry);
697 if (!error) {
698 smb_renew_times(dentry);
699 error = smb_instantiate(new_dentry, 0, 0);
700 }
701 return error;
702}
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
deleted file mode 100644
index 8e187a0f94b..00000000000
--- a/fs/smbfs/file.c
+++ /dev/null
@@ -1,454 +0,0 @@
1/*
2 * file.c
3 *
4 * Copyright (C) 1995, 1996, 1997 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/time.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/fcntl.h>
14#include <linux/stat.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/net.h>
19#include <linux/aio.h>
20
21#include <asm/uaccess.h>
22#include <asm/system.h>
23
24#include <linux/smbno.h>
25#include <linux/smb_fs.h>
26
27#include "smb_debug.h"
28#include "proto.h"
29
30static int
31smb_fsync(struct file *file, int datasync)
32{
33 struct dentry *dentry = file->f_path.dentry;
34 struct smb_sb_info *server = server_from_dentry(dentry);
35 int result;
36
37 VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry));
38
39 /*
40 * The VFS will writepage() all dirty pages for us, but we
41 * should send a SMBflush to the server, letting it know that
42 * we want things synchronized with actual storage.
43 *
44 * Note: this function requires all pages to have been written already
45 * (should be ok with writepage_sync)
46 */
47 result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid);
48 return result;
49}
50
51/*
52 * Read a page synchronously.
53 */
54static int
55smb_readpage_sync(struct dentry *dentry, struct page *page)
56{
57 char *buffer = kmap(page);
58 loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
59 struct smb_sb_info *server = server_from_dentry(dentry);
60 unsigned int rsize = smb_get_rsize(server);
61 int count = PAGE_SIZE;
62 int result;
63
64 VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n",
65 DENTRY_PATH(dentry), count, offset, rsize);
66
67 result = smb_open(dentry, SMB_O_RDONLY);
68 if (result < 0)
69 goto io_error;
70
71 do {
72 if (count < rsize)
73 rsize = count;
74
75 result = server->ops->read(dentry->d_inode,offset,rsize,buffer);
76 if (result < 0)
77 goto io_error;
78
79 count -= result;
80 offset += result;
81 buffer += result;
82 dentry->d_inode->i_atime =
83 current_fs_time(dentry->d_inode->i_sb);
84 if (result < rsize)
85 break;
86 } while (count);
87
88 memset(buffer, 0, count);
89 flush_dcache_page(page);
90 SetPageUptodate(page);
91 result = 0;
92
93io_error:
94 kunmap(page);
95 unlock_page(page);
96 return result;
97}
98
99/*
100 * We are called with the page locked and we unlock it when done.
101 */
102static int
103smb_readpage(struct file *file, struct page *page)
104{
105 int error;
106 struct dentry *dentry = file->f_path.dentry;
107
108 page_cache_get(page);
109 error = smb_readpage_sync(dentry, page);
110 page_cache_release(page);
111 return error;
112}
113
114/*
115 * Write a page synchronously.
116 * Offset is the data offset within the page.
117 */
118static int
119smb_writepage_sync(struct inode *inode, struct page *page,
120 unsigned long pageoffset, unsigned int count)
121{
122 loff_t offset;
123 char *buffer = kmap(page) + pageoffset;
124 struct smb_sb_info *server = server_from_inode(inode);
125 unsigned int wsize = smb_get_wsize(server);
126 int ret = 0;
127
128 offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset;
129 VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n",
130 inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize);
131
132 do {
133 int write_ret;
134
135 if (count < wsize)
136 wsize = count;
137
138 write_ret = server->ops->write(inode, offset, wsize, buffer);
139 if (write_ret < 0) {
140 PARANOIA("failed write, wsize=%d, write_ret=%d\n",
141 wsize, write_ret);
142 ret = write_ret;
143 break;
144 }
145 /* N.B. what if result < wsize?? */
146#ifdef SMBFS_PARANOIA
147 if (write_ret < wsize)
148 PARANOIA("short write, wsize=%d, write_ret=%d\n",
149 wsize, write_ret);
150#endif
151 buffer += wsize;
152 offset += wsize;
153 count -= wsize;
154 /*
155 * Update the inode now rather than waiting for a refresh.
156 */
157 inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb);
158 SMB_I(inode)->flags |= SMB_F_LOCALWRITE;
159 if (offset > inode->i_size)
160 inode->i_size = offset;
161 } while (count);
162
163 kunmap(page);
164 return ret;
165}
166
167/*
167 * Write a page to the server. This will be used for swapping only
168 * (for now), and we currently only do it synchronously.
170 *
171 * We are called with the page locked and we unlock it when done.
172 */
173static int
174smb_writepage(struct page *page, struct writeback_control *wbc)
175{
176 struct address_space *mapping = page->mapping;
177 struct inode *inode;
178 unsigned long end_index;
179 unsigned offset = PAGE_CACHE_SIZE;
180 int err;
181
182 BUG_ON(!mapping);
183 inode = mapping->host;
184 BUG_ON(!inode);
185
186 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
187
188 /* easy case */
189 if (page->index < end_index)
190 goto do_it;
191 /* things got complicated... */
192 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
193 /* OK, are we completely out? */
194 if (page->index >= end_index+1 || !offset)
195 return 0; /* truncated - don't care */
196do_it:
197 page_cache_get(page);
198 err = smb_writepage_sync(inode, page, 0, offset);
199 SetPageUptodate(page);
200 unlock_page(page);
201 page_cache_release(page);
202 return err;
203}
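/*
 * A minimal userspace sketch of the end_index/offset arithmetic above,
 * assuming 4 KiB pages. bytes_to_write() is illustrative and not part of
 * smbfs: it returns how much of a given page lies below i_size, which is
 * what decides between the "easy case", the partial tail page, and the
 * "truncated - don't care" return.
 */
#include <stdio.h>

#define PAGE_CACHE_SIZE  4096UL
#define PAGE_CACHE_SHIFT 12

static unsigned long bytes_to_write(unsigned long long i_size,
				    unsigned long index)
{
	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned long offset = i_size & (PAGE_CACHE_SIZE - 1);

	if (index < end_index)
		return PAGE_CACHE_SIZE;		/* full page below EOF */
	if (index >= end_index + 1 || !offset)
		return 0;			/* wholly past EOF */
	return offset;				/* tail of the last page */
}

int main(void)
{
	/* i_size = 10000: pages 0-1 are full, page 2 holds 1808 bytes */
	printf("%lu %lu %lu\n", bytes_to_write(10000, 0),
	       bytes_to_write(10000, 2), bytes_to_write(10000, 3));
	return 0;
}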
204
205static int
206smb_updatepage(struct file *file, struct page *page, unsigned long offset,
207 unsigned int count)
208{
209 struct dentry *dentry = file->f_path.dentry;
210
211 DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
212 ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
213
214 return smb_writepage_sync(dentry->d_inode, page, offset, count);
215}
216
217static ssize_t
218smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
219 unsigned long nr_segs, loff_t pos)
220{
221 struct file * file = iocb->ki_filp;
222 struct dentry * dentry = file->f_path.dentry;
223 ssize_t status;
224
225 VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
226 (unsigned long) iocb->ki_left, (unsigned long) pos);
227
228 status = smb_revalidate_inode(dentry);
229 if (status) {
230 PARANOIA("%s/%s validation failed, error=%Zd\n",
231 DENTRY_PATH(dentry), status);
232 goto out;
233 }
234
235 VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
236 (long)dentry->d_inode->i_size,
237 dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
238
239 status = generic_file_aio_read(iocb, iov, nr_segs, pos);
240out:
241 return status;
242}
243
244static int
245smb_file_mmap(struct file * file, struct vm_area_struct * vma)
246{
247 struct dentry * dentry = file->f_path.dentry;
248 int status;
249
250 VERBOSE("file %s/%s, address %lu - %lu\n",
251 DENTRY_PATH(dentry), vma->vm_start, vma->vm_end);
252
253 status = smb_revalidate_inode(dentry);
254 if (status) {
255 PARANOIA("%s/%s validation failed, error=%d\n",
256 DENTRY_PATH(dentry), status);
257 goto out;
258 }
259 status = generic_file_mmap(file, vma);
260out:
261 return status;
262}
263
264static ssize_t
265smb_file_splice_read(struct file *file, loff_t *ppos,
266 struct pipe_inode_info *pipe, size_t count,
267 unsigned int flags)
268{
269 struct dentry *dentry = file->f_path.dentry;
270 ssize_t status;
271
272 VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
273 DENTRY_PATH(dentry), *ppos, count);
274
275 status = smb_revalidate_inode(dentry);
276 if (status) {
277 PARANOIA("%s/%s validation failed, error=%Zd\n",
278 DENTRY_PATH(dentry), status);
279 goto out;
280 }
281 status = generic_file_splice_read(file, ppos, pipe, count, flags);
282out:
283 return status;
284}
285
286/*
287 * This does the "real" work of the write. The generic routine has
288 * allocated the page, locked it, done all the page alignment stuff
289 * calculations etc. Now we should just copy the data from user
290 * space and write it back to the real medium..
291 *
292 * If the writer ends up delaying the write, the writer needs to
293 * increment the page use counts until he is done with the page.
294 */
295static int smb_write_begin(struct file *file, struct address_space *mapping,
296 loff_t pos, unsigned len, unsigned flags,
297 struct page **pagep, void **fsdata)
298{
299 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
300 *pagep = grab_cache_page_write_begin(mapping, index, flags);
301 if (!*pagep)
302 return -ENOMEM;
303 return 0;
304}
305
306static int smb_write_end(struct file *file, struct address_space *mapping,
307 loff_t pos, unsigned len, unsigned copied,
308 struct page *page, void *fsdata)
309{
310 int status;
311 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
312
313 lock_kernel();
314 status = smb_updatepage(file, page, offset, copied);
315 unlock_kernel();
316
317 if (!status) {
318 if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
319 SetPageUptodate(page);
320 status = copied;
321 }
322
323 unlock_page(page);
324 page_cache_release(page);
325
326 return status;
327}
328
329const struct address_space_operations smb_file_aops = {
330 .readpage = smb_readpage,
331 .writepage = smb_writepage,
332 .write_begin = smb_write_begin,
333 .write_end = smb_write_end,
334};
335
336/*
337 * Write to a file (through the page cache).
338 */
339static ssize_t
340smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
341 unsigned long nr_segs, loff_t pos)
342{
343 struct file * file = iocb->ki_filp;
344 struct dentry * dentry = file->f_path.dentry;
345 ssize_t result;
346
347 VERBOSE("file %s/%s, count=%lu@%lu\n",
348 DENTRY_PATH(dentry),
349 (unsigned long) iocb->ki_left, (unsigned long) pos);
350
351 result = smb_revalidate_inode(dentry);
352 if (result) {
353 PARANOIA("%s/%s validation failed, error=%Zd\n",
354 DENTRY_PATH(dentry), result);
355 goto out;
356 }
357
358 result = smb_open(dentry, SMB_O_WRONLY);
359 if (result)
360 goto out;
361
362 if (iocb->ki_left > 0) {
363 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
364 VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
365 (long) file->f_pos, (long) dentry->d_inode->i_size,
366 dentry->d_inode->i_mtime.tv_sec,
367 dentry->d_inode->i_atime.tv_sec);
368 }
369out:
370 return result;
371}
372
373static int
374smb_file_open(struct inode *inode, struct file * file)
375{
376 int result;
377 struct dentry *dentry = file->f_path.dentry;
378 int smb_mode = (file->f_mode & O_ACCMODE) - 1;
379
380 lock_kernel();
381 result = smb_open(dentry, smb_mode);
382 if (result)
383 goto out;
384 SMB_I(inode)->openers++;
385out:
386 unlock_kernel();
387 return result;
388}
389
390static int
391smb_file_release(struct inode *inode, struct file * file)
392{
393 lock_kernel();
394 if (!--SMB_I(inode)->openers) {
395 /* We must flush any dirty pages now as we won't be able to
396 write anything after close. mmap can trigger this.
397 "openers" should perhaps include mmap'ers ... */
398 filemap_write_and_wait(inode->i_mapping);
399 smb_close(inode);
400 }
401 unlock_kernel();
402 return 0;
403}
404
405/*
406 * Check whether the required access is compatible with
407 * an inode's permission. SMB doesn't recognize superuser
408 * privileges, so we need our own check for this.
409 */
410static int
411smb_file_permission(struct inode *inode, int mask)
412{
413 int mode = inode->i_mode;
414 int error = 0;
415
416 VERBOSE("mode=%x, mask=%x\n", mode, mask);
417
418 /* Look at user permissions */
419 mode >>= 6;
420 if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
421 error = -EACCES;
422 return error;
423}
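/*
 * An illustrative userspace version of the owner-bits test above. The
 * MAY_* values mirror the kernel's (EXEC=1, WRITE=2, READ=4), which line
 * up with the low three mode bits once the mode is shifted down by 6;
 * check() is not smbfs code, just a demonstration of the masking.
 */
#include <stdio.h>

#define MAY_EXEC  1
#define MAY_WRITE 2
#define MAY_READ  4

static int check(int i_mode, int mask)
{
	int mode = i_mode >> 6;		/* keep only the owner rwx bits */
	if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
		return -13;		/* -EACCES */
	return 0;
}

int main(void)
{
	/* mode 0644 (owner rw-): reading succeeds, executing does not */
	printf("%d %d\n", check(0644, MAY_READ), check(0644, MAY_EXEC));
	return 0;
}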
424
425static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
426{
427 loff_t ret;
428 lock_kernel();
429 ret = generic_file_llseek_unlocked(file, offset, origin);
430 unlock_kernel();
431 return ret;
432}
433
434const struct file_operations smb_file_operations =
435{
436 .llseek = smb_remote_llseek,
437 .read = do_sync_read,
438 .aio_read = smb_file_aio_read,
439 .write = do_sync_write,
440 .aio_write = smb_file_aio_write,
441 .unlocked_ioctl = smb_ioctl,
442 .mmap = smb_file_mmap,
443 .open = smb_file_open,
444 .release = smb_file_release,
445 .fsync = smb_fsync,
446 .splice_read = smb_file_splice_read,
447};
448
449const struct inode_operations smb_file_inode_operations =
450{
451 .permission = smb_file_permission,
452 .getattr = smb_getattr,
453 .setattr = smb_notify_change,
454};
diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c
deleted file mode 100644
index 7ae0f5273ab..00000000000
--- a/fs/smbfs/getopt.c
+++ /dev/null
@@ -1,64 +0,0 @@
1/*
2 * getopt.c
3 */
4
5#include <linux/kernel.h>
6#include <linux/string.h>
7#include <linux/net.h>
8
9#include "getopt.h"
10
11/**
12 * smb_getopt - option parser
13 * @caller: name of the caller, for error messages
14 * @options: the options string
15 * @opts: an array of &struct option entries controlling parser operations
16 * @optopt: output; will contain the current option
17 * @optarg: output; will contain the value (if one exists)
18 * @flag: output; may be NULL; should point to a long into which flag bits are OR'd
19 * @value: output; may be NULL; will be overwritten with the integer value
20 * of the current argument.
21 *
22 * Helper to parse options in the format used by mount ("a=b,c=d,e,f").
23 * Returns opts->val if a matching entry in the 'opts' array is found,
24 * 0 when no more tokens are found, -1 if an error is encountered.
25 */
26int smb_getopt(char *caller, char **options, struct option *opts,
27 char **optopt, char **optarg, unsigned long *flag,
28 unsigned long *value)
29{
30 char *token;
31 char *val;
32 int i;
33
34 do {
35 if ((token = strsep(options, ",")) == NULL)
36 return 0;
37 } while (*token == '\0');
38 *optopt = token;
39
40 *optarg = NULL;
41 if ((val = strchr (token, '=')) != NULL) {
42 *val++ = 0;
43 if (value)
44 *value = simple_strtoul(val, NULL, 0);
45 *optarg = val;
46 }
47
48 for (i = 0; opts[i].name != NULL; i++) {
49 if (!strcmp(opts[i].name, token)) {
50 if (!opts[i].flag && (!val || !*val)) {
51 printk("%s: the %s option requires an argument\n",
52 caller, token);
53 return -1;
54 }
55
56 if (flag && opts[i].flag)
57 *flag |= opts[i].flag;
58
59 return opts[i].val;
60 }
61 }
62 printk("%s: Unrecognized mount option %s\n", caller, token);
63 return -1;
64}
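/*
 * A hedged usage sketch for smb_getopt(), mirroring the loop in
 * inode.c's parse_options(). It assumes the parser above is linked in
 * (it is kernel code, so this is illustrative rather than buildable in
 * userspace). Note that smb_getopt() tokenizes via strsep(), so the
 * caller must hand it a writable copy of the option string.
 */
#include "getopt.h"

static void demo_parse(void)
{
	static struct option demo_opts[] = {
		{ "uid",   0,   'u' },	/* value option: needs "uid=N" */
		{ "win95", 0x1, 1   },	/* flag option: OR'd into flags */
		{ NULL,    0,   0   }
	};
	char buf[] = "uid=500,win95";
	char *options = buf, *optopt, *optarg;
	unsigned long flags = 0, value = 0;
	int c;

	while ((c = smb_getopt("demo", &options, demo_opts,
			       &optopt, &optarg, &flags, &value)) > 0) {
		switch (c) {
		case 'u':
			/* value now holds 500 */
			break;
		case 1:
			/* "win95" matched; bit 0x1 already OR'd into flags */
			break;
		}
	}
	/* c == 0: option string exhausted; c < 0 signals a parse error */
}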
diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h
deleted file mode 100644
index 146219ac7c4..00000000000
--- a/fs/smbfs/getopt.h
+++ /dev/null
@@ -1,14 +0,0 @@
1#ifndef _LINUX_GETOPT_H
2#define _LINUX_GETOPT_H
3
4struct option {
5 const char *name;
6 unsigned long flag;
7 int val;
8};
9
10extern int smb_getopt(char *caller, char **options, struct option *opts,
11 char **optopt, char **optarg, unsigned long *flag,
12 unsigned long *value);
13
14#endif /* _LINUX_GETOPT_H */
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
deleted file mode 100644
index 9551cb6f7fe..00000000000
--- a/fs/smbfs/inode.c
+++ /dev/null
@@ -1,841 +0,0 @@
1/*
2 * inode.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/module.h>
11#include <linux/time.h>
12#include <linux/kernel.h>
13#include <linux/mm.h>
14#include <linux/string.h>
15#include <linux/stat.h>
16#include <linux/errno.h>
17#include <linux/slab.h>
18#include <linux/init.h>
19#include <linux/file.h>
20#include <linux/dcache.h>
21#include <linux/smp_lock.h>
22#include <linux/nls.h>
23#include <linux/seq_file.h>
24#include <linux/mount.h>
25#include <linux/net.h>
26#include <linux/vfs.h>
27#include <linux/highuid.h>
28#include <linux/sched.h>
29#include <linux/smb_fs.h>
30#include <linux/smbno.h>
31#include <linux/smb_mount.h>
32
33#include <asm/system.h>
34#include <asm/uaccess.h>
35
36#include "smb_debug.h"
37#include "getopt.h"
38#include "proto.h"
39
40/* Always pick a default string */
41#ifdef CONFIG_SMB_NLS_REMOTE
42#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE
43#else
44#define SMB_NLS_REMOTE ""
45#endif
46
47#define SMB_TTL_DEFAULT 1000
48
49static void smb_delete_inode(struct inode *);
50static void smb_put_super(struct super_block *);
51static int smb_statfs(struct dentry *, struct kstatfs *);
52static int smb_show_options(struct seq_file *, struct vfsmount *);
53
54static struct kmem_cache *smb_inode_cachep;
55
56static struct inode *smb_alloc_inode(struct super_block *sb)
57{
58 struct smb_inode_info *ei;
59 ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
60 if (!ei)
61 return NULL;
62 return &ei->vfs_inode;
63}
64
65static void smb_destroy_inode(struct inode *inode)
66{
67 kmem_cache_free(smb_inode_cachep, SMB_I(inode));
68}
69
70static void init_once(void *foo)
71{
72 struct smb_inode_info *ei = (struct smb_inode_info *) foo;
73
74 inode_init_once(&ei->vfs_inode);
75}
76
77static int init_inodecache(void)
78{
79 smb_inode_cachep = kmem_cache_create("smb_inode_cache",
80 sizeof(struct smb_inode_info),
81 0, (SLAB_RECLAIM_ACCOUNT|
82 SLAB_MEM_SPREAD),
83 init_once);
84 if (smb_inode_cachep == NULL)
85 return -ENOMEM;
86 return 0;
87}
88
89static void destroy_inodecache(void)
90{
91 kmem_cache_destroy(smb_inode_cachep);
92}
93
94static int smb_remount(struct super_block *sb, int *flags, char *data)
95{
96 *flags |= MS_NODIRATIME;
97 return 0;
98}
99
100static const struct super_operations smb_sops =
101{
102 .alloc_inode = smb_alloc_inode,
103 .destroy_inode = smb_destroy_inode,
104 .drop_inode = generic_delete_inode,
105 .delete_inode = smb_delete_inode,
106 .put_super = smb_put_super,
107 .statfs = smb_statfs,
108 .show_options = smb_show_options,
109 .remount_fs = smb_remount,
110};
111
112
113/* We are always generating a new inode here */
114struct inode *
115smb_iget(struct super_block *sb, struct smb_fattr *fattr)
116{
117 struct smb_sb_info *server = SMB_SB(sb);
118 struct inode *result;
119
120 DEBUG1("smb_iget: %p\n", fattr);
121
122 result = new_inode(sb);
123 if (!result)
124 return result;
125 result->i_ino = fattr->f_ino;
126 SMB_I(result)->open = 0;
127 SMB_I(result)->fileid = 0;
128 SMB_I(result)->access = 0;
129 SMB_I(result)->flags = 0;
130 SMB_I(result)->closed = 0;
131 SMB_I(result)->openers = 0;
132 smb_set_inode_attr(result, fattr);
133 if (S_ISREG(result->i_mode)) {
134 result->i_op = &smb_file_inode_operations;
135 result->i_fop = &smb_file_operations;
136 result->i_data.a_ops = &smb_file_aops;
137 } else if (S_ISDIR(result->i_mode)) {
138 if (server->opt.capabilities & SMB_CAP_UNIX)
139 result->i_op = &smb_dir_inode_operations_unix;
140 else
141 result->i_op = &smb_dir_inode_operations;
142 result->i_fop = &smb_dir_operations;
143 } else if (S_ISLNK(result->i_mode)) {
144 result->i_op = &smb_link_inode_operations;
145 } else {
146 init_special_inode(result, result->i_mode, fattr->f_rdev);
147 }
148 insert_inode_hash(result);
149 return result;
150}
151
152/*
153 * Copy the inode data to a smb_fattr structure.
154 */
155void
156smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
157{
158 memset(fattr, 0, sizeof(struct smb_fattr));
159 fattr->f_mode = inode->i_mode;
160 fattr->f_nlink = inode->i_nlink;
161 fattr->f_ino = inode->i_ino;
162 fattr->f_uid = inode->i_uid;
163 fattr->f_gid = inode->i_gid;
164 fattr->f_size = inode->i_size;
165 fattr->f_mtime = inode->i_mtime;
166 fattr->f_ctime = inode->i_ctime;
167 fattr->f_atime = inode->i_atime;
168 fattr->f_blocks = inode->i_blocks;
169
170 fattr->attr = SMB_I(inode)->attr;
171 /*
172 * Keep the attributes in sync with the inode permissions.
173 */
174 if (fattr->f_mode & S_IWUSR)
175 fattr->attr &= ~aRONLY;
176 else
177 fattr->attr |= aRONLY;
178}
179
180/*
181 * Update the inode, possibly causing it to invalidate its pages if mtime/size
182 * is different from last time.
183 */
184void
185smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
186{
187 struct smb_inode_info *ei = SMB_I(inode);
188
189 /*
190 * A size change should have a different mtime, or same mtime
191 * but different size.
192 */
193 time_t last_time = inode->i_mtime.tv_sec;
194 loff_t last_sz = inode->i_size;
195
196 inode->i_mode = fattr->f_mode;
197 inode->i_nlink = fattr->f_nlink;
198 inode->i_uid = fattr->f_uid;
199 inode->i_gid = fattr->f_gid;
200 inode->i_ctime = fattr->f_ctime;
201 inode->i_blocks = fattr->f_blocks;
202 inode->i_size = fattr->f_size;
203 inode->i_mtime = fattr->f_mtime;
204 inode->i_atime = fattr->f_atime;
205 ei->attr = fattr->attr;
206
207 /*
208 * Update the "last time refreshed" field for revalidation.
209 */
210 ei->oldmtime = jiffies;
211
212 if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) {
213 VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n",
214 inode->i_ino,
215 (long) last_time, (long) inode->i_mtime.tv_sec,
216 (long) last_sz, (long) inode->i_size);
217
218 if (!S_ISDIR(inode->i_mode))
219 invalidate_remote_inode(inode);
220 }
221}
222
223/*
224 * This is called if the connection has gone bad ...
225 * try to kill off all the current inodes.
226 */
227void
228smb_invalidate_inodes(struct smb_sb_info *server)
229{
230 VERBOSE("\n");
231 shrink_dcache_sb(SB_of(server));
232 invalidate_inodes(SB_of(server));
233}
234
235/*
236 * This is called to update the inode attributes after
237 * we've made changes to a file or directory.
238 */
239static int
240smb_refresh_inode(struct dentry *dentry)
241{
242 struct inode *inode = dentry->d_inode;
243 int error;
244 struct smb_fattr fattr;
245
246 error = smb_proc_getattr(dentry, &fattr);
247 if (!error) {
248 smb_renew_times(dentry);
249 /*
250 * Check whether the type part of the mode changed,
251 * and don't update the attributes if it did.
252 *
253 * And don't dick with the root inode
254 */
255 if (inode->i_ino == 2)
256 return error;
257 if (S_ISLNK(inode->i_mode))
258 return error; /* VFS will deal with it */
259
260 if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) {
261 smb_set_inode_attr(inode, &fattr);
262 } else {
263 /*
264 * Big trouble! The inode has become a new object,
265 * so any operations attempted on it are invalid.
266 *
267 * To limit damage, mark the inode as bad so that
268 * subsequent lookup validations will fail.
269 */
270 PARANOIA("%s/%s changed mode, %07o to %07o\n",
271 DENTRY_PATH(dentry),
272 inode->i_mode, fattr.f_mode);
273
274 fattr.f_mode = inode->i_mode; /* save mode */
275 make_bad_inode(inode);
276 inode->i_mode = fattr.f_mode; /* restore mode */
277 /*
278 * No need to worry about unhashing the dentry: the
279 * lookup validation will see that the inode is bad.
280 * But we do want to invalidate the caches ...
281 */
282 if (!S_ISDIR(inode->i_mode))
283 invalidate_remote_inode(inode);
284 else
285 smb_invalid_dir_cache(inode);
286 error = -EIO;
287 }
288 }
289 return error;
290}
291
292/*
293 * This is called when we want to check whether the inode
294 * has changed on the server. If it has changed, we must
295 * invalidate our local caches.
296 */
297int
298smb_revalidate_inode(struct dentry *dentry)
299{
300 struct smb_sb_info *s = server_from_dentry(dentry);
301 struct inode *inode = dentry->d_inode;
302 int error = 0;
303
304 DEBUG1("smb_revalidate_inode\n");
305 lock_kernel();
306
307 /*
308 * Check whether we've recently refreshed the inode.
309 */
310 if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) {
311 VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n",
312 inode->i_ino, jiffies, SMB_I(inode)->oldmtime);
313 goto out;
314 }
315
316 error = smb_refresh_inode(dentry);
317out:
318 unlock_kernel();
319 return error;
320}
321
322/*
323 * This routine is called when i_nlink == 0 and i_count goes to 0.
324 * All blocking cleanup operations need to go here to avoid races.
325 */
326static void
327smb_delete_inode(struct inode *ino)
328{
329 DEBUG1("ino=%ld\n", ino->i_ino);
330 truncate_inode_pages(&ino->i_data, 0);
331 lock_kernel();
332 if (smb_close(ino))
333 PARANOIA("could not close inode %ld\n", ino->i_ino);
334 unlock_kernel();
335 clear_inode(ino);
336}
337
338static struct option opts[] = {
339 { "version", 0, 'v' },
340 { "win95", SMB_MOUNT_WIN95, 1 },
341 { "oldattr", SMB_MOUNT_OLDATTR, 1 },
342 { "dirattr", SMB_MOUNT_DIRATTR, 1 },
343 { "case", SMB_MOUNT_CASE, 1 },
344 { "uid", 0, 'u' },
345 { "gid", 0, 'g' },
346 { "file_mode", 0, 'f' },
347 { "dir_mode", 0, 'd' },
348 { "iocharset", 0, 'i' },
349 { "codepage", 0, 'c' },
350 { "ttl", 0, 't' },
351 { NULL, 0, 0}
352};
353
354static int
355parse_options(struct smb_mount_data_kernel *mnt, char *options)
356{
357 int c;
358 unsigned long flags;
359 unsigned long value;
360 char *optarg;
361 char *optopt;
362
363 flags = 0;
364 while ( (c = smb_getopt("smbfs", &options, opts,
365 &optopt, &optarg, &flags, &value)) > 0) {
366
367 VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
368 switch (c) {
369 case 1:
370 /* got a "flag" option */
371 break;
372 case 'v':
373 if (value != SMB_MOUNT_VERSION) {
374 printk ("smbfs: Bad mount version %ld, expected %d\n",
375 value, SMB_MOUNT_VERSION);
376 return 0;
377 }
378 mnt->version = value;
379 break;
380 case 'u':
381 mnt->uid = value;
382 flags |= SMB_MOUNT_UID;
383 break;
384 case 'g':
385 mnt->gid = value;
386 flags |= SMB_MOUNT_GID;
387 break;
388 case 'f':
389 mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
390 flags |= SMB_MOUNT_FMODE;
391 break;
392 case 'd':
393 mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
394 flags |= SMB_MOUNT_DMODE;
395 break;
396 case 'i':
397 strlcpy(mnt->codepage.local_name, optarg,
398 SMB_NLS_MAXNAMELEN);
399 break;
400 case 'c':
401 strlcpy(mnt->codepage.remote_name, optarg,
402 SMB_NLS_MAXNAMELEN);
403 break;
404 case 't':
405 mnt->ttl = value;
406 break;
407 default:
408 printk ("smbfs: Unrecognized mount option %s\n",
409 optopt);
410 return -1;
411 }
412 }
413 mnt->flags = flags;
414 return c;
415}
416
417/*
418 * smb_show_options() is for displaying mount options in /proc/mounts.
419 * It tries to avoid showing settings that were not changed from their
420 * defaults.
421 */
422static int
423smb_show_options(struct seq_file *s, struct vfsmount *m)
424{
425 struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt;
426 int i;
427
428 for (i = 0; opts[i].name != NULL; i++)
429 if (mnt->flags & opts[i].flag)
430 seq_printf(s, ",%s", opts[i].name);
431
432 if (mnt->flags & SMB_MOUNT_UID)
433 seq_printf(s, ",uid=%d", mnt->uid);
434 if (mnt->flags & SMB_MOUNT_GID)
435 seq_printf(s, ",gid=%d", mnt->gid);
436 if (mnt->mounted_uid != 0)
437 seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
438
439 /*
440 * Defaults for file_mode and dir_mode are unknown to us; they
441 * depend on the current umask of the user doing the mount.
442 */
443 if (mnt->flags & SMB_MOUNT_FMODE)
444 seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
445 if (mnt->flags & SMB_MOUNT_DMODE)
446 seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
447
448 if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
449 seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
450 if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE))
451 seq_printf(s, ",codepage=%s", mnt->codepage.remote_name);
452
453 if (mnt->ttl != SMB_TTL_DEFAULT)
454 seq_printf(s, ",ttl=%d", mnt->ttl);
455
456 return 0;
457}
458
459static void
460smb_unload_nls(struct smb_sb_info *server)
461{
462 unload_nls(server->remote_nls);
463 unload_nls(server->local_nls);
464}
465
466static void
467smb_put_super(struct super_block *sb)
468{
469 struct smb_sb_info *server = SMB_SB(sb);
470
471 lock_kernel();
472
473 smb_lock_server(server);
474 server->state = CONN_INVALID;
475 smbiod_unregister_server(server);
476
477 smb_close_socket(server);
478
479 if (server->conn_pid)
480 kill_pid(server->conn_pid, SIGTERM, 1);
481
482 bdi_destroy(&server->bdi);
483 kfree(server->ops);
484 smb_unload_nls(server);
485 sb->s_fs_info = NULL;
486 smb_unlock_server(server);
487 put_pid(server->conn_pid);
488 kfree(server);
489
490 unlock_kernel();
491}
492
493static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
494{
495 struct smb_sb_info *server;
496 struct smb_mount_data_kernel *mnt;
497 struct smb_mount_data *oldmnt;
498 struct inode *root_inode;
499 struct smb_fattr root;
500 int ver;
501 void *mem;
502 static int warn_count;
503
504 if (warn_count < 5) {
505 warn_count++;
506 printk(KERN_EMERG "smbfs is deprecated and will be removed"
507 " from the 2.6.27 kernel. Please migrate to cifs\n");
508 }
509
510 if (!raw_data)
511 goto out_no_data;
512
513 oldmnt = (struct smb_mount_data *) raw_data;
514 ver = oldmnt->version;
515 if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII)
516 goto out_wrong_data;
517
518 sb->s_flags |= MS_NODIRATIME;
519 sb->s_blocksize = 1024; /* Eh... Is this correct? */
520 sb->s_blocksize_bits = 10;
521 sb->s_magic = SMB_SUPER_MAGIC;
522 sb->s_op = &smb_sops;
523 sb->s_time_gran = 100;
524
525 server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
526 if (!server)
527 goto out_no_server;
528 sb->s_fs_info = server;
529
530 if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
531 goto out_bdi;
532
533 sb->s_bdi = &server->bdi;
534
535 server->super_block = sb;
536 server->mnt = NULL;
537 server->sock_file = NULL;
538 init_waitqueue_head(&server->conn_wq);
539 init_MUTEX(&server->sem);
540 INIT_LIST_HEAD(&server->entry);
541 INIT_LIST_HEAD(&server->xmitq);
542 INIT_LIST_HEAD(&server->recvq);
543 server->conn_error = 0;
544 server->conn_pid = NULL;
545 server->state = CONN_INVALID; /* no connection yet */
546 server->generation = 0;
547
548 /* Allocate the global temp buffer and some superblock helper structs */
549 /* FIXME: move these to the smb_sb_info struct */
550 VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
551 sizeof(struct smb_mount_data_kernel));
552 mem = kmalloc(sizeof(struct smb_ops) +
553 sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
554 if (!mem)
555 goto out_no_mem;
556
557 server->ops = mem;
558 smb_install_null_ops(server->ops);
559 server->mnt = mem + sizeof(struct smb_ops);
560
561 /* Setup NLS stuff */
562 server->remote_nls = NULL;
563 server->local_nls = NULL;
564
565 mnt = server->mnt;
566
567 memset(mnt, 0, sizeof(struct smb_mount_data_kernel));
568 strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT,
569 SMB_NLS_MAXNAMELEN);
570 strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE,
571 SMB_NLS_MAXNAMELEN);
572
573 mnt->ttl = SMB_TTL_DEFAULT;
574 if (ver == SMB_MOUNT_OLDVERSION) {
575 mnt->version = oldmnt->version;
576
577 SET_UID(mnt->uid, oldmnt->uid);
578 SET_GID(mnt->gid, oldmnt->gid);
579
580 mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
581 mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
582
583 mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
584 SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
585 } else {
586 mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
587 S_IROTH | S_IXOTH | S_IFREG;
588 mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
589 S_IROTH | S_IXOTH | S_IFDIR;
590 if (parse_options(mnt, raw_data))
591 goto out_bad_option;
592 }
593 mnt->mounted_uid = current_uid();
594 smb_setcodepage(server, &mnt->codepage);
595
596 /*
597 * Display the enabled options
598 * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2)
599 */
600 if (mnt->flags & SMB_MOUNT_OLDATTR)
601 printk("SMBFS: Using core getattr (Win 95 speedup)\n");
602 else if (mnt->flags & SMB_MOUNT_DIRATTR)
603 printk("SMBFS: Using dir ff getattr\n");
604
605 if (smbiod_register_server(server) < 0) {
606 printk(KERN_ERR "smbfs: failed to start smbiod\n");
607 goto out_no_smbiod;
608 }
609
610 /*
611 * Keep the super block locked while we get the root inode.
612 */
613 smb_init_root_dirent(server, &root, sb);
614 root_inode = smb_iget(sb, &root);
615 if (!root_inode)
616 goto out_no_root;
617
618 sb->s_root = d_alloc_root(root_inode);
619 if (!sb->s_root)
620 goto out_no_root;
621
622 smb_new_dentry(sb->s_root);
623
624 return 0;
625
626out_no_root:
627 iput(root_inode);
628out_no_smbiod:
629 smb_unload_nls(server);
630out_bad_option:
631 kfree(mem);
632out_no_mem:
633 bdi_destroy(&server->bdi);
634out_bdi:
635 if (!server->mnt)
636 printk(KERN_ERR "smb_fill_super: allocation failure\n");
637 sb->s_fs_info = NULL;
638 kfree(server);
639 goto out_fail;
640out_wrong_data:
641 printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
642 goto out_fail;
643out_no_data:
644 printk(KERN_ERR "smb_fill_super: missing data argument\n");
645out_fail:
646 return -EINVAL;
647out_no_server:
648 printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
649 return -ENOMEM;
650}
651
652static int
653smb_statfs(struct dentry *dentry, struct kstatfs *buf)
654{
655 int result;
656
657 lock_kernel();
658
659 result = smb_proc_dskattr(dentry, buf);
660
661 unlock_kernel();
662
663 buf->f_type = SMB_SUPER_MAGIC;
664 buf->f_namelen = SMB_MAXPATHLEN;
665 return result;
666}
667
668int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
669{
670 int err = smb_revalidate_inode(dentry);
671 if (!err)
672 generic_fillattr(dentry->d_inode, stat);
673 return err;
674}
675
676int
677smb_notify_change(struct dentry *dentry, struct iattr *attr)
678{
679 struct inode *inode = dentry->d_inode;
680 struct smb_sb_info *server = server_from_dentry(dentry);
681 unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO);
682 int error, changed, refresh = 0;
683 struct smb_fattr fattr;
684
685 lock_kernel();
686
687 error = smb_revalidate_inode(dentry);
688 if (error)
689 goto out;
690
691 if ((error = inode_change_ok(inode, attr)) < 0)
692 goto out;
693
694 error = -EPERM;
695 if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid))
696 goto out;
697
698 if ((attr->ia_valid & ATTR_GID) && (attr->ia_gid != server->mnt->gid))
699 goto out;
700
701 if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask))
702 goto out;
703
704 if ((attr->ia_valid & ATTR_SIZE) != 0) {
705 VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n",
706 DENTRY_PATH(dentry),
707 (long) inode->i_size, (long) attr->ia_size);
708
709 filemap_write_and_wait(inode->i_mapping);
710
711 error = smb_open(dentry, O_WRONLY);
712 if (error)
713 goto out;
714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error)
716 goto out;
717 error = simple_setsize(inode, attr->ia_size);
718 if (error)
719 goto out;
720 refresh = 1;
721 }
722
723 if (server->opt.capabilities & SMB_CAP_UNIX) {
724 /* For now we don't want to set the size with setattr_unix */
725 attr->ia_valid &= ~ATTR_SIZE;
726 /* FIXME: only call if we actually want to set something? */
727 error = smb_proc_setattr_unix(dentry, attr, 0, 0);
728 if (!error)
729 refresh = 1;
730
731 goto out;
732 }
733
734 /*
735 * Initialize the fattr and check for changed fields.
736 * Note: CTIME under SMB is creation time rather than
737 * change time, so we don't attempt to change it.
738 */
739 smb_get_inode_attr(inode, &fattr);
740
741 changed = 0;
742 if ((attr->ia_valid & ATTR_MTIME) != 0) {
743 fattr.f_mtime = attr->ia_mtime;
744 changed = 1;
745 }
746 if ((attr->ia_valid & ATTR_ATIME) != 0) {
747 fattr.f_atime = attr->ia_atime;
748 /* Earlier protocols don't have an access time */
749 if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2)
750 changed = 1;
751 }
752 if (changed) {
753 error = smb_proc_settime(dentry, &fattr);
754 if (error)
755 goto out;
756 refresh = 1;
757 }
758
759 /*
760 * Check for mode changes ... we're extremely limited in
761 * what can be set for SMB servers: just the read-only bit.
762 */
763 if ((attr->ia_valid & ATTR_MODE) != 0) {
764 VERBOSE("%s/%s mode change, old=%x, new=%x\n",
765 DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode);
766 changed = 0;
767 if (attr->ia_mode & S_IWUSR) {
768 if (fattr.attr & aRONLY) {
769 fattr.attr &= ~aRONLY;
770 changed = 1;
771 }
772 } else {
773 if (!(fattr.attr & aRONLY)) {
774 fattr.attr |= aRONLY;
775 changed = 1;
776 }
777 }
778 if (changed) {
779 error = smb_proc_setattr(dentry, &fattr);
780 if (error)
781 goto out;
782 refresh = 1;
783 }
784 }
785 error = 0;
786
787out:
788 if (refresh)
789 smb_refresh_inode(dentry);
790 unlock_kernel();
791 return error;
792}
793
794static int smb_get_sb(struct file_system_type *fs_type,
795 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
796{
797 return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
798}
799
800static struct file_system_type smb_fs_type = {
801 .owner = THIS_MODULE,
802 .name = "smbfs",
803 .get_sb = smb_get_sb,
804 .kill_sb = kill_anon_super,
805 .fs_flags = FS_BINARY_MOUNTDATA,
806};
807
808static int __init init_smb_fs(void)
809{
810 int err;
811 DEBUG1("registering ...\n");
812
813 err = init_inodecache();
814 if (err)
815 goto out_inode;
816 err = smb_init_request_cache();
817 if (err)
818 goto out_request;
819 err = register_filesystem(&smb_fs_type);
820 if (err)
821 goto out;
822 return 0;
823out:
824 smb_destroy_request_cache();
825out_request:
826 destroy_inodecache();
827out_inode:
828 return err;
829}
830
831static void __exit exit_smb_fs(void)
832{
833 DEBUG1("unregistering ...\n");
834 unregister_filesystem(&smb_fs_type);
835 smb_destroy_request_cache();
836 destroy_inodecache();
837}
838
839module_init(init_smb_fs)
840module_exit(exit_smb_fs)
841MODULE_LICENSE("GPL");
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
deleted file mode 100644
index 07215312ad3..00000000000
--- a/fs/smbfs/ioctl.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/*
2 * ioctl.c
3 *
4 * Copyright (C) 1995, 1996 by Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/errno.h>
11#include <linux/fs.h>
12#include <linux/ioctl.h>
13#include <linux/time.h>
14#include <linux/mm.h>
15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
17#include <linux/net.h>
18
19#include <linux/smb_fs.h>
20#include <linux/smb_mount.h>
21
22#include <asm/uaccess.h>
23
24#include "proto.h"
25
26long
27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
28{
29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt;
31 int result = -EINVAL;
32
33 lock_kernel();
34 switch (cmd) {
35 uid16_t uid16;
36 uid_t uid32;
37 case SMB_IOC_GETMOUNTUID:
38 SET_UID(uid16, server->mnt->mounted_uid);
39 result = put_user(uid16, (uid16_t __user *) arg);
40 break;
41 case SMB_IOC_GETMOUNTUID32:
42 SET_UID(uid32, server->mnt->mounted_uid);
43 result = put_user(uid32, (uid_t __user *) arg);
44 break;
45
46 case SMB_IOC_NEWCONN:
47 /* arg is smb_conn_opt, or NULL if no connection was made */
48 if (!arg) {
49 result = 0;
50 smb_lock_server(server);
51 server->state = CONN_RETRIED;
52 printk(KERN_ERR "Connection attempt failed! [%d]\n",
53 server->conn_error);
54 smbiod_flush(server);
55 smb_unlock_server(server);
56 break;
57 }
58
59 result = -EFAULT;
60 if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt)))
61 result = smb_newconn(server, &opt);
62 break;
63 default:
64 break;
65 }
66 unlock_kernel();
67
68 return result;
69}
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
deleted file mode 100644
index 71c29b6670b..00000000000
--- a/fs/smbfs/proc.c
+++ /dev/null
@@ -1,3507 +0,0 @@
1/*
2 * proc.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/types.h>
11#include <linux/capability.h>
12#include <linux/errno.h>
13#include <linux/slab.h>
14#include <linux/fs.h>
15#include <linux/file.h>
16#include <linux/stat.h>
17#include <linux/fcntl.h>
18#include <linux/dcache.h>
19#include <linux/nls.h>
20#include <linux/smp_lock.h>
21#include <linux/net.h>
22#include <linux/vfs.h>
23#include <linux/smb_fs.h>
24#include <linux/smbno.h>
25#include <linux/smb_mount.h>
26
27#include <net/sock.h>
28
29#include <asm/string.h>
30#include <asm/div64.h>
31
32#include "smb_debug.h"
33#include "proto.h"
34#include "request.h"
35
36
37/* Features. Undefine them if they cause problems; this should perhaps be a
38 config option. */
39#define SMBFS_POSIX_UNLINK 1
40
41/* Allow smb_retry to be interrupted. */
42#define SMB_RETRY_INTR
43
44#define SMB_VWV(packet) ((packet) + SMB_HEADER_LEN)
45#define SMB_CMD(packet) (*(packet+8))
46#define SMB_WCT(packet) (*(packet+SMB_HEADER_LEN - 1))
47
48#define SMB_DIRINFO_SIZE 43
49#define SMB_STATUS_SIZE 21
50
51#define SMB_ST_BLKSIZE (PAGE_SIZE)
52#define SMB_ST_BLKSHIFT (PAGE_SHIFT)
53
54static struct smb_ops smb_ops_core;
55static struct smb_ops smb_ops_os2;
56static struct smb_ops smb_ops_win95;
57static struct smb_ops smb_ops_winNT;
58static struct smb_ops smb_ops_unix;
59static struct smb_ops smb_ops_null;
60
61static void
62smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
63static void
64smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
65static int
66smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
67 struct smb_fattr *fattr);
68static int
69smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
70 struct smb_fattr *fattr);
71static int
72smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
73 u16 attr);
74static int
75smb_proc_setattr_ext(struct smb_sb_info *server,
76 struct inode *inode, struct smb_fattr *fattr);
77static int
78smb_proc_query_cifsunix(struct smb_sb_info *server);
79static void
80install_ops(struct smb_ops *dst, struct smb_ops *src);
81
82
83static void
84str_upper(char *name, int len)
85{
86 while (len--)
87 {
88 if (*name >= 'a' && *name <= 'z')
89 *name -= ('a' - 'A');
90 name++;
91 }
92}
93
94#if 0
95static void
96str_lower(char *name, int len)
97{
98 while (len--)
99 {
100 if (*name >= 'A' && *name <= 'Z')
101 *name += ('a' - 'A');
102 name++;
103 }
104}
105#endif
106
107/* reverse a string inline. This is used by the dircache walking routines */
108static void reverse_string(char *buf, int len)
109{
110 char c;
111 char *end = buf+len-1;
112
113 while(buf < end) {
114 c = *buf;
115 *(buf++) = *end;
116 *(end--) = c;
117 }
118}
119
120/* no conversion, just a wrapper for memcpy. */
121static int convert_memcpy(unsigned char *output, int olen,
122 const unsigned char *input, int ilen,
123 struct nls_table *nls_from,
124 struct nls_table *nls_to)
125{
126 if (olen < ilen)
127 return -ENAMETOOLONG;
128 memcpy(output, input, ilen);
129 return ilen;
130}
131
132static inline int write_char(unsigned char ch, char *output, int olen)
133{
134 if (olen < 4)
135 return -ENAMETOOLONG;
136 sprintf(output, ":x%02x", ch);
137 return 4;
138}
139
140static inline int write_unichar(wchar_t ch, char *output, int olen)
141{
142 if (olen < 5)
143 return -ENAMETOOLONG;
144 sprintf(output, ":%04x", ch);
145 return 5;
146}
147
148/* convert from one "codepage" to another (possibly being utf8). */
149static int convert_cp(unsigned char *output, int olen,
150 const unsigned char *input, int ilen,
151 struct nls_table *nls_from,
152 struct nls_table *nls_to)
153{
154 int len = 0;
155 int n;
156 wchar_t ch;
157
158 while (ilen > 0) {
159 /* convert by changing to unicode and back to the new cp */
160 n = nls_from->char2uni(input, ilen, &ch);
161 if (n == -EINVAL) {
162 ilen--;
163 n = write_char(*input++, output, olen);
164 if (n < 0)
165 goto fail;
166 output += n;
167 olen -= n;
168 len += n;
169 continue;
170 } else if (n < 0)
171 goto fail;
172 input += n;
173 ilen -= n;
174
175 n = nls_to->uni2char(ch, output, olen);
176 if (n == -EINVAL)
177 n = write_unichar(ch, output, olen);
178 if (n < 0)
179 goto fail;
180 output += n;
181 olen -= n;
182
183 len += n;
184 }
185 return len;
186fail:
187 return n;
188}
189
190/* ----------------------------------------------------------- */
191
192/*
193 * nls_unicode
194 *
195 * This encodes/decodes little endian unicode format
196 */
197
198static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
199{
200 if (boundlen < 2)
201 return -EINVAL;
202 *out++ = uni & 0xff;
203 *out++ = uni >> 8;
204 return 2;
205}
206
207static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
208{
209 if (boundlen < 2)
210 return -EINVAL;
211 *uni = (rawstring[1] << 8) | rawstring[0];
212 return 2;
213}
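/*
 * A tiny standalone round trip through the little-endian encoding that
 * uni2char()/char2uni() implement above: U+00E9 (e-acute) becomes the
 * byte pair e9 00 and decodes back to the same code point.
 */
#include <stdio.h>

int main(void)
{
	unsigned short uni = 0x00e9;
	unsigned char out[2];

	out[0] = uni & 0xff;			/* low byte first */
	out[1] = uni >> 8;

	unsigned short back = (out[1] << 8) | out[0];
	printf("%02x %02x -> %04x\n", out[0], out[1], back);
	return 0;
}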
214
215static struct nls_table unicode_table = {
216 .charset = "unicode",
217 .uni2char = uni2char,
218 .char2uni = char2uni,
219};
220
221/* ----------------------------------------------------------- */
222
223static int setcodepage(struct nls_table **p, char *name)
224{
225 struct nls_table *nls;
226
227 if (!name || !*name) {
228 nls = NULL;
229 } else if ( (nls = load_nls(name)) == NULL) {
230 printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name);
231 return -EINVAL;
232 }
233
234 /* if already set, unload the previous one. */
235 if (*p && *p != &unicode_table)
236 unload_nls(*p);
237 *p = nls;
238
239 return 0;
240}
241
242/* Handles all changes to codepage settings. */
243int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp)
244{
245 int n = 0;
246
247 smb_lock_server(server);
248
249 /* Don't load any nls_* at all if no remote codepage is requested */
250 if (!*cp->remote_name)
251 goto out;
252
253 /* local */
254 n = setcodepage(&server->local_nls, cp->local_name);
255 if (n != 0)
256 goto out;
257
258 /* remote */
259 if (!strcmp(cp->remote_name, "unicode")) {
260 server->remote_nls = &unicode_table;
261 } else {
262 n = setcodepage(&server->remote_nls, cp->remote_name);
263 if (n != 0)
264 setcodepage(&server->local_nls, NULL);
265 }
266
267out:
268 if (server->local_nls != NULL && server->remote_nls != NULL)
269 server->ops->convert = convert_cp;
270 else
271 server->ops->convert = convert_memcpy;
272
273 smb_unlock_server(server);
274 return n;
275}
276
277
278/*****************************************************************************/
279/* */
280/* Encoding/Decoding section */
281/* */
282/*****************************************************************************/
283
284static __u8 *
285smb_encode_smb_length(__u8 * p, __u32 len)
286{
287 *p = 0;
288 *(p+1) = 0;
289 *(p+2) = (len & 0xFF00) >> 8;
290 *(p+3) = (len & 0xFF);
291 if (len > 0xFFFF)
292 {
293 *(p+1) = 1;
294 }
295 return p + 4;
296}
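/*
 * A standalone round trip of the 4-byte length header built above. Byte 1
 * carries bit 16 of the length and bytes 2-3 the low 16 bits, which is
 * exactly what smb_len() in the support section below decodes.
 */
#include <stdio.h>

int main(void)
{
	unsigned long len = 0x12345;		/* a 17-bit length */
	unsigned char p[4];

	p[0] = 0;
	p[1] = len > 0xFFFF ? 1 : 0;
	p[2] = (len & 0xFF00) >> 8;
	p[3] = len & 0xFF;

	unsigned long back = ((p[1] & 0x1) << 16) | (p[2] << 8) | p[3];
	printf("%#lx -> %02x %02x %02x %02x -> %#lx\n",
	       len, p[0], p[1], p[2], p[3], back);
	return 0;
}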
297
298/*
299 * smb_build_path: build the path to entry and name, storing it in buf.
300 * The returned path includes the trailing '\0'.
301 */
302static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
303 int maxlen,
304 struct dentry *entry, struct qstr *name)
305{
306 unsigned char *path = buf;
307 int len;
308 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
309
310 if (maxlen < (2<<unicode))
311 return -ENAMETOOLONG;
312
313 if (maxlen > SMB_MAXPATHLEN + 1)
314 maxlen = SMB_MAXPATHLEN + 1;
315
316 if (entry == NULL)
317 goto test_name_and_out;
318
319 /*
320 * If IS_ROOT, we have to do no walking at all.
321 */
322 if (IS_ROOT(entry) && !name) {
323 *path++ = '\\';
324 if (unicode) *path++ = '\0';
325 *path++ = '\0';
326 if (unicode) *path++ = '\0';
327 return path-buf;
328 }
329
330 /*
331 * Build the path string walking the tree backward from end to ROOT
332 * and store it in reversed order [see reverse_string()]
333 */
334 dget(entry);
335 spin_lock(&entry->d_lock);
336 while (!IS_ROOT(entry)) {
337 struct dentry *parent;
338
339 if (maxlen < (3<<unicode)) {
340 spin_unlock(&entry->d_lock);
341 dput(entry);
342 return -ENAMETOOLONG;
343 }
344
345 len = server->ops->convert(path, maxlen-2,
346 entry->d_name.name, entry->d_name.len,
347 server->local_nls, server->remote_nls);
348 if (len < 0) {
349 spin_unlock(&entry->d_lock);
350 dput(entry);
351 return len;
352 }
353 reverse_string(path, len);
354 path += len;
355 if (unicode) {
356 /* Note: reverse order */
357 *path++ = '\0';
358 maxlen--;
359 }
360 *path++ = '\\';
361 maxlen -= len+1;
362
363 parent = entry->d_parent;
364 dget(parent);
365 spin_unlock(&entry->d_lock);
366 dput(entry);
367 entry = parent;
368 spin_lock(&entry->d_lock);
369 }
370 spin_unlock(&entry->d_lock);
371 dput(entry);
372 reverse_string(buf, path-buf);
373
374 /* maxlen has space for at least one char */
375test_name_and_out:
376 if (name) {
377 if (maxlen < (3<<unicode))
378 return -ENAMETOOLONG;
379 *path++ = '\\';
380 if (unicode) {
381 *path++ = '\0';
382 maxlen--;
383 }
384 len = server->ops->convert(path, maxlen-2,
385 name->name, name->len,
386 server->local_nls, server->remote_nls);
387 if (len < 0)
388 return len;
389 path += len;
390 maxlen -= len+1;
391 }
392 /* maxlen has space for at least one char */
393 *path++ = '\0';
394 if (unicode) *path++ = '\0';
395 return path-buf;
396}
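/*
 * A userspace sketch of the reversed construction used by
 * smb_build_path() above (non-unicode case, no name argument). Each
 * component is written reversed, leaf first, followed by '\'; a single
 * reverse_string() pass over the whole buffer then yields the forward
 * path. The dentry walk is replaced here by a fixed leaf-to-root array.
 */
#include <stdio.h>
#include <string.h>

static void reverse(char *buf, int len)
{
	char c, *end = buf + len - 1;

	while (buf < end) {
		c = *buf;
		*(buf++) = *end;
		*(end--) = c;
	}
}

int main(void)
{
	const char *components[] = { "file", "dir" };	/* leaf to root */
	char buf[64], *path = buf;
	int i, len;

	for (i = 0; i < 2; i++) {
		len = strlen(components[i]);
		memcpy(path, components[i], len);
		reverse(path, len);		/* store component reversed */
		path += len;
		*path++ = '\\';
	}
	reverse(buf, path - buf);		/* one pass fixes it all up */
	*path = '\0';
	printf("%s\n", buf);			/* prints \dir\file */
	return 0;
}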
397
398static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
399 struct dentry *dir, struct qstr *name)
400{
401 int result;
402
403 result = smb_build_path(server, buf, maxlen, dir, name);
404 if (result < 0)
405 goto out;
406 if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
407 str_upper(buf, result);
408out:
409 return result;
410}
411
412/* encode_path for non-trans2 request SMBs */
413static int smb_simple_encode_path(struct smb_request *req, char **p,
414 struct dentry * entry, struct qstr * name)
415{
416 struct smb_sb_info *server = req->rq_server;
417 char *s = *p;
418 int res;
419 int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
420 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
421
422 if (!maxlen)
423 return -ENAMETOOLONG;
424 *s++ = 4; /* ASCII data format */
425
426 /*
427 * SMB Unicode strings must be 16-bit aligned relative to the start of the
428 * packet. If they are not they must be padded with 0.
429 */
430 if (unicode) {
431 int align = s - (char *)req->rq_buffer;
432 if (!(align & 1)) {
433 *s++ = '\0';
434 maxlen--;
435 }
436 }
437
438 res = smb_encode_path(server, s, maxlen-1, entry, name);
439 if (res < 0)
440 return res;
441 *p = s + res;
442 return 0;
443}
444
445/* The following are taken directly from msdos-fs */
446
447/* Linear day numbers of the respective 1sts in non-leap years. */
448
449static int day_n[] =
450{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
451 /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */
452
453
454static time_t
455utc2local(struct smb_sb_info *server, time_t time)
456{
457 return time - server->opt.serverzone*60;
458}
459
460static time_t
461local2utc(struct smb_sb_info *server, time_t time)
462{
463 return time + server->opt.serverzone*60;
464}
465
466/* Convert an MS-DOS time/date pair to a UNIX date (seconds since 1 Jan 1970). */
467
468static time_t
469date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time)
470{
471 int month, year;
472 time_t secs;
473
474 /* first subtract and mask after that... Otherwise, if
475 date == 0, bad things happen */
476 month = ((date >> 5) - 1) & 15;
477 year = date >> 9;
478 secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 *
479 ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 &&
480 month < 2 ? 1 : 0) + 3653);
481 /* days since 1.1.70 plus 80's leap day */
482 return local2utc(server, secs);
483}
484
485
486/* Convert a linear UNIX date to an MS-DOS time/date pair. */
487
488static void
489date_unix2dos(struct smb_sb_info *server,
490 int unix_date, __u16 *date, __u16 *time)
491{
492 int day, year, nl_day, month;
493
494 unix_date = utc2local(server, unix_date);
495 if (unix_date < 315532800)
496 unix_date = 315532800;
497
498 *time = (unix_date % 60) / 2 +
499 (((unix_date / 60) % 60) << 5) +
500 (((unix_date / 3600) % 24) << 11);
501
502 day = unix_date / 86400 - 3652;
503 year = day / 365;
504 if ((year + 3) / 4 + 365 * year > day)
505 year--;
506 day -= (year + 3) / 4 + 365 * year;
507 if (day == 59 && !(year & 3)) {
508 nl_day = day;
509 month = 2;
510 } else {
511 nl_day = (year & 3) || day <= 59 ? day : day - 1;
512 for (month = 1; month < 12; month++)
513 if (day_n[month] > nl_day)
514 break;
515 }
516 *date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9);
517}
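/*
 * A worked instance of the bit layout produced by date_unix2dos() above:
 * the time word packs seconds/2 in bits 0-4, minutes in bits 5-10 and
 * hours in bits 11-15; the date word packs the day in bits 0-4, month in
 * bits 5-8 and the year since 1980 in bits 9-15.
 */
#include <stdio.h>

int main(void)
{
	/* 1995-06-15 12:34:56 */
	unsigned short time = (56 / 2) | (34 << 5) | (12 << 11);
	unsigned short date = 15 | (6 << 5) | ((1995 - 1980) << 9);

	printf("time=%#06x date=%#06x\n", time, date);	/* 0x645c 0x1ecf */
	return 0;
}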
518
519/* The following are taken from fs/ntfs/util.c */
520
521#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
522
523/*
524 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
525 * into Unix UTC (based 1970-01-01, in seconds).
526 */
527static struct timespec
528smb_ntutc2unixutc(u64 ntutc)
529{
530 struct timespec ts;
531 /* FIXME: what about the timezone difference? */
532 /* Subtract the NTFS time offset, then convert to 1s intervals. */
533 u64 t = ntutc - NTFS_TIME_OFFSET;
534 ts.tv_nsec = do_div(t, 10000000) * 100;
535 ts.tv_sec = t;
536 return ts;
537}
538
539/* Convert the Unix UTC into NT time */
540static u64
541smb_unixutc2ntutc(struct timespec ts)
542{
543 /* Note: timezone conversion is probably wrong. */
544 /* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */
545 return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET;
546}
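/*
 * A quick standalone check of the epoch shift used above: 1601..1969
 * spans 369 years containing 89 leap days, so NTFS_TIME_OFFSET is that
 * many days in 100ns units, and a Unix time pushed through both
 * conversions must come back unchanged.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long offset = (369ULL * 365 + 89) * 24 * 3600 * 10000000;
	unsigned long long unix_secs = 1000000000ULL;	/* 2001-09-09 */

	unsigned long long ntutc = unix_secs * 10000000 + offset;
	unsigned long long back = (ntutc - offset) / 10000000;

	printf("offset=%llu back=%llu\n", offset, back);
	return 0;
}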
547
548#define MAX_FILE_MODE 6
549static mode_t file_mode[] = {
550 S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK
551};
552
553static int smb_filetype_to_mode(u32 filetype)
554{
555 if (filetype > MAX_FILE_MODE) {
556 PARANOIA("Filetype out of range: %d\n", filetype);
557 return S_IFREG;
558 }
559 return file_mode[filetype];
560}
561
562static u32 smb_filetype_from_mode(int mode)
563{
564 if (S_ISREG(mode))
565 return UNIX_TYPE_FILE;
566 if (S_ISDIR(mode))
567 return UNIX_TYPE_DIR;
568 if (S_ISLNK(mode))
569 return UNIX_TYPE_SYMLINK;
570 if (S_ISCHR(mode))
571 return UNIX_TYPE_CHARDEV;
572 if (S_ISBLK(mode))
573 return UNIX_TYPE_BLKDEV;
574 if (S_ISFIFO(mode))
575 return UNIX_TYPE_FIFO;
576 if (S_ISSOCK(mode))
577 return UNIX_TYPE_SOCKET;
578 return UNIX_TYPE_UNKNOWN;
579}
580
581
582/*****************************************************************************/
583/* */
584/* Support section. */
585/* */
586/*****************************************************************************/
587
588__u32
589smb_len(__u8 * p)
590{
591 return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3);
592}
593
594static __u16
595smb_bcc(__u8 * packet)
596{
597 int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16);
598 return WVAL(packet, pos);
599}
600
601/* smb_valid_packet: We check whether a packet fulfills the basic
602 requirements of an SMB packet */
603
604static int
605smb_valid_packet(__u8 * packet)
606{
607 return (packet[4] == 0xff
608 && packet[5] == 'S'
609 && packet[6] == 'M'
610 && packet[7] == 'B'
611 && (smb_len(packet) + 4 == SMB_HEADER_LEN
612 + SMB_WCT(packet) * 2 + smb_bcc(packet)));
613}
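/*
 * An illustrative fragment of the validity test above, covering the two
 * cheap parts: the 0xFF 'S' 'M' 'B' magic at bytes 4-7 and the NetBIOS
 * length decode that smb_len() performs. The full check also ties the
 * length to the word count and byte count, which needs a real packet.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char pkt[8] = { 0 };

	pkt[3] = 4;			/* NetBIOS length: low byte */
	pkt[4] = 0xff;			/* SMB magic */
	memcpy(pkt + 5, "SMB", 3);

	int magic_ok = pkt[4] == 0xff && pkt[5] == 'S' &&
		       pkt[6] == 'M' && pkt[7] == 'B';
	unsigned long len = ((pkt[1] & 0x1) << 16) | (pkt[2] << 8) | pkt[3];

	printf("magic=%d len=%lu\n", magic_ok, len);	/* magic=1 len=4 */
	return 0;
}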
614
615/* smb_verify: We check if we got the answer we expected, and if we
616 got enough data. If bcc == -1, we don't care. */
617
618static int
619smb_verify(__u8 * packet, int command, int wct, int bcc)
620{
621 if (SMB_CMD(packet) != command)
622 goto bad_command;
623 if (SMB_WCT(packet) < wct)
624 goto bad_wct;
625 if (bcc != -1 && smb_bcc(packet) < bcc)
626 goto bad_bcc;
627 return 0;
628
629bad_command:
630 printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n",
631 command, SMB_CMD(packet));
632 goto fail;
633bad_wct:
634 printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n",
635 command, wct, SMB_WCT(packet));
636 goto fail;
637bad_bcc:
638 printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n",
639 command, bcc, smb_bcc(packet));
640fail:
641 return -EIO;
642}
643
644/*
645 * Returns the maximum read or write size for the "payload", making the
646 * whole packet fit within the negotiated max_xmit size.
647 *
648 * N.B. Since this value is usually computed before locking the server,
649 * the server's packet size must never be decreased!
650 */
651static inline int
652smb_get_xmitsize(struct smb_sb_info *server, int overhead)
653{
654 return server->opt.max_xmit - overhead;
655}
656
657/*
658 * Calculate the maximum read size
659 */
660int
661smb_get_rsize(struct smb_sb_info *server)
662{
663 /* readX has 12 parameters, read has 5 */
664 int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2;
665 int size = smb_get_xmitsize(server, overhead);
666
667 VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
668
669 return size;
670}
671
672/*
673 * Calculate the maximum write size
674 */
675int
676smb_get_wsize(struct smb_sb_info *server)
677{
678 /* writeX has 14 parameters, write has 5 */
679 int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2;
680 int size = smb_get_xmitsize(server, overhead);
681
682 VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
683
684 return size;
685}
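/*
 * A worked instance of the overhead arithmetic above, assuming
 * SMB_HEADER_LEN is 37 (4-byte NetBIOS length, 32-byte SMB header and
 * the word-count byte) as defined in smbfs's headers. With the NT4-style
 * max_xmit of 4536 mentioned later in this file, writeX overhead is
 * 37 + 14*2 + 2 + 1 + 2 = 70, leaving 4466 bytes of write payload.
 */
#include <stdio.h>

#define SMB_HEADER_LEN 37	/* assumed; see linux/smb.h */

int main(void)
{
	int max_xmit = 4536;
	int overhead = SMB_HEADER_LEN + 14 * (int)sizeof(unsigned short)
		       + 2 + 1 + 2;

	printf("wsize=%d\n", max_xmit - overhead);	/* 4466 */
	return 0;
}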
686
687/*
688 * Convert SMB error codes to -E... errno values.
689 */
690int
691smb_errno(struct smb_request *req)
692{
693 int errcls = req->rq_rcls;
694 int error = req->rq_err;
695 char *class = "Unknown";
696
697 VERBOSE("errcls %d code %d from command 0x%x\n",
698 errcls, error, SMB_CMD(req->rq_header));
699
700 if (errcls == ERRDOS) {
701 switch (error) {
702 case ERRbadfunc:
703 return -EINVAL;
704 case ERRbadfile:
705 case ERRbadpath:
706 return -ENOENT;
707 case ERRnofids:
708 return -EMFILE;
709 case ERRnoaccess:
710 return -EACCES;
711 case ERRbadfid:
712 return -EBADF;
713 case ERRbadmcb:
714 return -EREMOTEIO;
715 case ERRnomem:
716 return -ENOMEM;
717 case ERRbadmem:
718 return -EFAULT;
719 case ERRbadenv:
720 case ERRbadformat:
721 return -EREMOTEIO;
722 case ERRbadaccess:
723 return -EACCES;
724 case ERRbaddata:
725 return -E2BIG;
726 case ERRbaddrive:
727 return -ENXIO;
728 case ERRremcd:
729 return -EREMOTEIO;
730 case ERRdiffdevice:
731 return -EXDEV;
732 case ERRnofiles:
733 return -ENOENT;
734 case ERRbadshare:
735 return -ETXTBSY;
736 case ERRlock:
737 return -EDEADLK;
738 case ERRfilexists:
739 return -EEXIST;
740 case ERROR_INVALID_PARAMETER:
741 return -EINVAL;
742 case ERROR_DISK_FULL:
743 return -ENOSPC;
744 case ERROR_INVALID_NAME:
745 return -ENOENT;
746 case ERROR_DIR_NOT_EMPTY:
747 return -ENOTEMPTY;
748 case ERROR_NOT_LOCKED:
749 return -ENOLCK;
750 case ERROR_ALREADY_EXISTS:
751 return -EEXIST;
752 default:
753 class = "ERRDOS";
754 goto err_unknown;
755 }
756 } else if (errcls == ERRSRV) {
757 switch (error) {
758 /* N.B. This is wrong ... EIO ? */
759 case ERRerror:
760 return -ENFILE;
761 case ERRbadpw:
762 return -EINVAL;
763 case ERRbadtype:
764 case ERRtimeout:
765 return -EIO;
766 case ERRaccess:
767 return -EACCES;
768 /*
769 * This is a fatal error, as it means the "tree ID"
770 * for this connection is no longer valid. We map
771 * to a special error code and get a new connection.
772 */
773 case ERRinvnid:
774 return -EBADSLT;
775 default:
776 class = "ERRSRV";
777 goto err_unknown;
778 }
779 } else if (errcls == ERRHRD) {
780 switch (error) {
781 case ERRnowrite:
782 return -EROFS;
783 case ERRbadunit:
784 return -ENODEV;
785 case ERRnotready:
786 return -EUCLEAN;
787 case ERRbadcmd:
788 case ERRdata:
789 return -EIO;
790 case ERRbadreq:
791 return -ERANGE;
792 case ERRbadshare:
793 return -ETXTBSY;
794 case ERRlock:
795 return -EDEADLK;
796 case ERRdiskfull:
797 return -ENOSPC;
798 default:
799 class = "ERRHRD";
800 goto err_unknown;
801 }
802 } else if (errcls == ERRCMD) {
803 class = "ERRCMD";
804 } else if (errcls == SUCCESS) {
805 return 0; /* This is the only valid 0 return */
806 }
807
808err_unknown:
809 printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n",
810 class, error, SMB_CMD(req->rq_header));
811 return -EIO;
812}
813
814/* smb_request_ok: We expect the server to be locked. Then we do the
815 request and check the answer completely. When smb_request_ok
816 returns 0, you can be quite sure that everything went well. When
817 the answer is < 0, the returned number is a negative unix errno. */
818
819static int
820smb_request_ok(struct smb_request *req, int command, int wct, int bcc)
821{
822 int result;
823
824 req->rq_resp_wct = wct;
825 req->rq_resp_bcc = bcc;
826
827 result = smb_add_request(req);
828 if (result != 0) {
829 DEBUG1("smb_request failed\n");
830 goto out;
831 }
832
833 if (smb_valid_packet(req->rq_header) != 0) {
834 PARANOIA("invalid packet!\n");
 result = -EIO; /* never report success for a garbled reply */
835 goto out;
836 }
837
838 result = smb_verify(req->rq_header, command, wct, bcc);
839
840out:
841 return result;
842}
843
844/*
845 * This implements the NEWCONN ioctl. It installs the server pid,
846 * sets server->state to CONN_VALID, and wakes up the waiting process.
847 */
848int
849smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
850{
851 struct file *filp;
852 struct sock *sk;
853 int error;
854
855 VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid);
856
857 smb_lock_server(server);
858
859 /*
860 * Make sure we don't already have a valid connection ...
861 */
862 error = -EINVAL;
863 if (server->state == CONN_VALID)
864 goto out;
865
866 error = -EACCES;
867 if (current_uid() != server->mnt->mounted_uid &&
868 !capable(CAP_SYS_ADMIN))
869 goto out;
870
871 error = -EBADF;
872 filp = fget(opt->fd);
873 if (!filp)
874 goto out;
875 if (!smb_valid_socket(filp->f_path.dentry->d_inode))
876 goto out_putf;
877
878 server->sock_file = filp;
879 server->conn_pid = get_pid(task_pid(current));
880 server->opt = *opt;
881 server->generation += 1;
882 server->state = CONN_VALID;
883 error = 0;
884
885 if (server->conn_error) {
886 /*
887 * conn_error is the return code we originally decided to
888 * drop the old connection on. This message should be positive
889 * and not make people ask questions about why smbfs is printing
890 * error messages ...
891 */
892 printk(KERN_INFO "SMB connection re-established (%d)\n",
893 server->conn_error);
894 server->conn_error = 0;
895 }
896
897 /*
898 * Store the server in sock user_data (Only used by sunrpc)
899 */
900 sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk;
901 sk->sk_user_data = server;
902
903 /* chain into the data_ready callback */
904 server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready);
905
906 /* check if we have an old smbmount that uses seconds for the
907 serverzone */
908 if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60)
909 server->opt.serverzone /= 60;
910
911 /* now that we have an established connection we can detect the server
912 type and enable bug workarounds */
913 if (server->opt.protocol < SMB_PROTOCOL_LANMAN2)
914 install_ops(server->ops, &smb_ops_core);
915 else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2)
916 install_ops(server->ops, &smb_ops_os2);
917 else if (server->opt.protocol == SMB_PROTOCOL_NT1 &&
918 (server->opt.max_xmit < 0x1000) &&
919 !(server->opt.capabilities & SMB_CAP_NT_SMBS)) {
920 /* FIXME: can we kill the WIN95 flag now? */
921 server->mnt->flags |= SMB_MOUNT_WIN95;
922 VERBOSE("detected WIN95 server\n");
923 install_ops(server->ops, &smb_ops_win95);
924 } else {
925 /*
926 * Samba has max_xmit 65535
927 * NT4spX has max_xmit 4536 (or something like that)
928 * win2k has ...
929 */
930 VERBOSE("detected NT1 (Samba, NT4/5) server\n");
931 install_ops(server->ops, &smb_ops_winNT);
932 }
933
934 /* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */
935 if (server->mnt->flags & SMB_MOUNT_OLDATTR) {
936 server->ops->getattr = smb_proc_getattr_core;
937 } else if (server->mnt->flags & SMB_MOUNT_DIRATTR) {
938 server->ops->getattr = smb_proc_getattr_ff;
939 }
940
941 /* Decode server capabilities */
942 if (server->opt.capabilities & SMB_CAP_LARGE_FILES) {
943 /* Should be ok to set this now, as no one can access the
944 mount until the connection has been established. */
945 SB_of(server)->s_maxbytes = ~0ULL >> 1;
946 VERBOSE("LFS enabled\n");
947 }
948 if (server->opt.capabilities & SMB_CAP_UNICODE) {
949 server->mnt->flags |= SMB_MOUNT_UNICODE;
950 VERBOSE("Unicode enabled\n");
951 } else {
952 server->mnt->flags &= ~SMB_MOUNT_UNICODE;
953 }
954#if 0
955 /* flags we may test for other patches ... */
956 if (server->opt.capabilities & SMB_CAP_LARGE_READX) {
957 VERBOSE("Large reads enabled\n");
958 }
959 if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) {
960 VERBOSE("Large writes enabled\n");
961 }
962#endif
963 if (server->opt.capabilities & SMB_CAP_UNIX) {
964 struct inode *inode;
965 VERBOSE("Using UNIX CIFS extensions\n");
966 install_ops(server->ops, &smb_ops_unix);
967 inode = SB_of(server)->s_root->d_inode;
968 if (inode)
969 inode->i_op = &smb_dir_inode_operations_unix;
970 }
971
972 VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n",
973 server->opt.protocol, server->opt.max_xmit,
974 pid_nr(server->conn_pid), server->opt.capabilities);
975
976 /* FIXME: this really should be done by smbmount. */
977 if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) {
978 server->opt.max_xmit = SMB_MAX_PACKET_SIZE;
979 }
980
981 smb_unlock_server(server);
982 smbiod_wake_up();
983 if (server->opt.capabilities & SMB_CAP_UNIX)
984 smb_proc_query_cifsunix(server);
985
986 server->conn_complete++;
987 wake_up_interruptible_all(&server->conn_wq);
988 return error;
989
990out:
991 smb_unlock_server(server);
992 smbiod_wake_up();
993 return error;
994
995out_putf:
996 fput(filp);
997 goto out;
998}
999
1000/* smb_setup_header: We completely set up the packet. You only have to
1001 insert the command-specific fields. */
1002
1003__u8 *
1004smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc)
1005{
1006 __u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2;
1007 __u8 *p = req->rq_header;
1008 struct smb_sb_info *server = req->rq_server;
1009
1010 p = smb_encode_smb_length(p, xmit_len - 4);
1011
1012 *p++ = 0xff;
1013 *p++ = 'S';
1014 *p++ = 'M';
1015 *p++ = 'B';
1016 *p++ = command;
1017
1018 memset(p, '\0', 19);
1019 p += 19;
1020 p += 8;
1021
1022 if (server->opt.protocol > SMB_PROTOCOL_CORE) {
1023 int flags = SMB_FLAGS_CASELESS_PATHNAMES;
1024 int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS |
1025 SMB_FLAGS2_EXTENDED_ATTRIBUTES; /* EA? not really ... */
1026
1027 *(req->rq_header + smb_flg) = flags;
1028 if (server->mnt->flags & SMB_MOUNT_UNICODE)
1029 flags2 |= SMB_FLAGS2_UNICODE_STRINGS;
1030 WSET(req->rq_header, smb_flg2, flags2);
1031 }
1032 *p++ = wct; /* wct */
1033 p += 2 * wct;
1034 WSET(p, 0, bcc);
1035
1036 /* Include the header in the data to send */
1037 req->rq_iovlen = 1;
1038 req->rq_iov[0].iov_base = req->rq_header;
1039 req->rq_iov[0].iov_len = xmit_len - bcc;
1040
1041 return req->rq_buffer;
1042}
1043
1044static void
1045smb_setup_bcc(struct smb_request *req, __u8 *p)
1046{
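 /* p points one past the last byte stored in rq_buffer; from it we
    compute the byte count and patch both the bcc field and the total
    netbios length back into the header */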
1047 u16 bcc = p - req->rq_buffer;
1048 u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header);
1049
1050 WSET(pbcc, 0, bcc);
1051
1052 smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN +
1053 2*SMB_WCT(req->rq_header) - 2 + bcc);
1054
1055 /* Include the "bytes" in the data to send */
1056 req->rq_iovlen = 2;
1057 req->rq_iov[1].iov_base = req->rq_buffer;
1058 req->rq_iov[1].iov_len = bcc;
1059}
1060
1061static int
1062smb_proc_seek(struct smb_sb_info *server, __u16 fileid,
1063 __u16 mode, off_t offset)
1064{
1065 int result;
1066 struct smb_request *req;
1067
1068 result = -ENOMEM;
1069 if (! (req = smb_alloc_request(server, 0)))
1070 goto out;
1071
1072 smb_setup_header(req, SMBlseek, 4, 0);
1073 WSET(req->rq_header, smb_vwv0, fileid);
1074 WSET(req->rq_header, smb_vwv1, mode);
1075 DSET(req->rq_header, smb_vwv2, offset);
1076 req->rq_flags |= SMB_REQ_NORETRY;
1077
1078 result = smb_request_ok(req, SMBlseek, 2, 0);
1079 if (result < 0) {
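 /* errors are deliberately mapped to offset 0 here; callers such as
    smb_proc_getattr_95 use the result directly as a file size */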
1080 result = 0;
1081 goto out_free;
1082 }
1083
1084 result = DVAL(req->rq_header, smb_vwv0);
1085out_free:
1086 smb_rput(req);
1087out:
1088 return result;
1089}
1090
1091static int
1092smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish)
1093{
1094 struct inode *ino = dentry->d_inode;
1095 struct smb_inode_info *ei = SMB_I(ino);
1096 int mode, read_write = 0x42, read_only = 0x40;
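 /* 0x42 opens read/write and 0x40 read-only, both with DENY_NONE
    sharing (core-protocol open mode: access in the low nibble,
    sharing mode in bits 4-6) */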
1097 int res;
1098 char *p;
1099 struct smb_request *req;
1100
1101 /*
1102 * Attempt to open r/w, unless there are no write privileges.
1103 */
1104 mode = read_write;
1105 if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH)))
1106 mode = read_only;
1107#if 0
1108 /* FIXME: why is this code not in? below we fix it so that a caller
1109 wanting RO doesn't get RW. smb_revalidate_inode does some
1110 optimization based on access mode. tail -f needs it to be correct.
1111
1112 We must open rw since we don't do the open if called a second time
1113 with different 'wish'. Is that not supported by smb servers? */
1114 if (!(wish & (O_WRONLY | O_RDWR)))
1115 mode = read_only;
1116#endif
1117
1118 res = -ENOMEM;
1119 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1120 goto out;
1121
1122 retry:
1123 p = smb_setup_header(req, SMBopen, 2, 0);
1124 WSET(req->rq_header, smb_vwv0, mode);
1125 WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR);
1126 res = smb_simple_encode_path(req, &p, dentry, NULL);
1127 if (res < 0)
1128 goto out_free;
1129 smb_setup_bcc(req, p);
1130
1131 res = smb_request_ok(req, SMBopen, 7, 0);
1132 if (res != 0) {
1133 if (mode == read_write &&
1134 (res == -EACCES || res == -ETXTBSY || res == -EROFS))
1135 {
1136 VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n",
1137 DENTRY_PATH(dentry), res);
1138 mode = read_only;
1139 req->rq_flags = 0;
1140 goto retry;
1141 }
1142 goto out_free;
1143 }
1144 /* We should now have data in vwv[0..6]. */
1145
1146 ei->fileid = WVAL(req->rq_header, smb_vwv0);
1147 ei->attr = WVAL(req->rq_header, smb_vwv1);
1148 /* smb_vwv2 has mtime */
1149 /* smb_vwv4 has size */
1150 ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK);
1151 ei->open = server->generation;
1152
1153out_free:
1154 smb_rput(req);
1155out:
1156 return res;
1157}
1158
1159/*
1160 * Make sure the file is open, and check that the access
1161 * is compatible with the desired access.
1162 */
1163int
1164smb_open(struct dentry *dentry, int wish)
1165{
1166 struct inode *inode = dentry->d_inode;
1167 int result;
1168 __u16 access;
1169
1170 result = -ENOENT;
1171 if (!inode) {
1172 printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n",
1173 DENTRY_PATH(dentry));
1174 goto out;
1175 }
1176
1177 if (!smb_is_open(inode)) {
1178 struct smb_sb_info *server = server_from_inode(inode);
1179 result = 0;
1180 if (!smb_is_open(inode))
1181 result = smb_proc_open(server, dentry, wish);
1182 if (result)
1183 goto out;
1184 /*
1185 * A successful open means the path is still valid ...
1186 */
1187 smb_renew_times(dentry);
1188 }
1189
1190 /*
1191 * Check whether the access is compatible with the desired mode.
1192 */
1193 result = 0;
1194 access = SMB_I(inode)->access;
1195 if (access != wish && access != SMB_O_RDWR) {
1196 PARANOIA("%s/%s access denied, access=%x, wish=%x\n",
1197 DENTRY_PATH(dentry), access, wish);
1198 result = -EACCES;
1199 }
1200out:
1201 return result;
1202}
1203
1204static int
1205smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime)
1206{
1207 struct smb_request *req;
1208 int result = -ENOMEM;
1209
1210 if (! (req = smb_alloc_request(server, 0)))
1211 goto out;
1212
1213 smb_setup_header(req, SMBclose, 3, 0);
1214 WSET(req->rq_header, smb_vwv0, fileid);
1215 DSET(req->rq_header, smb_vwv1, utc2local(server, mtime));
1216 req->rq_flags |= SMB_REQ_NORETRY;
1217 result = smb_request_ok(req, SMBclose, 0, 0);
1218
1219 smb_rput(req);
1220out:
1221 return result;
1222}
1223
1224/*
1225 * Win NT 4.0 has an apparent bug in that it fails to update the
1226 * modify time when writing to a file. As a workaround, we update
1227 * both modify and access time locally, and post the times to the
1228 * server when closing the file.
1229 */
1230static int
1231smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino)
1232{
1233 struct smb_inode_info *ei = SMB_I(ino);
1234 int result = 0;
1235 if (smb_is_open(ino))
1236 {
1237 /*
1238 * We clear the open flag in advance, in case another
1239 * process observes the value while we block below.
1240 */
1241 ei->open = 0;
1242
1243 /*
1244 * Kludge alert: SMB timestamps are accurate only to
1245 * two seconds ... round the times to avoid needless
1246 * cache invalidations!
1247 */
1248 if (ino->i_mtime.tv_sec & 1) {
1249 ino->i_mtime.tv_sec--;
1250 ino->i_mtime.tv_nsec = 0;
1251 }
1252 if (ino->i_atime.tv_sec & 1) {
1253 ino->i_atime.tv_sec--;
1254 ino->i_atime.tv_nsec = 0;
1255 }
1256 /*
1257 * If the file is open with write permissions,
1258 * update the time stamps to sync mtime and atime.
1259 */
1260 if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 &&
1261 (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) &&
1262 !(ei->access == SMB_O_RDONLY))
1263 {
1264 struct smb_fattr fattr;
1265 smb_get_inode_attr(ino, &fattr);
1266 smb_proc_setattr_ext(server, ino, &fattr);
1267 }
1268
1269 result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec);
1270 /*
1271 * Force a revalidation after closing ... some servers
1272 * don't post the size until the file has been closed.
1273 */
1274 if (server->opt.protocol < SMB_PROTOCOL_NT1)
1275 ei->oldmtime = 0;
1276 ei->closed = jiffies;
1277 }
1278 return result;
1279}
1280
1281int
1282smb_close(struct inode *ino)
1283{
1284 int result = 0;
1285
1286 if (smb_is_open(ino)) {
1287 struct smb_sb_info *server = server_from_inode(ino);
1288 result = smb_proc_close_inode(server, ino);
1289 }
1290 return result;
1291}
1292
1293/*
1294 * This is used to close a file following a failed instantiate.
1295 * Since we don't have an inode, we can't use any of the above.
1296 */
1297int
1298smb_close_fileid(struct dentry *dentry, __u16 fileid)
1299{
1300 struct smb_sb_info *server = server_from_dentry(dentry);
1301 int result;
1302
1303 result = smb_proc_close(server, fileid, get_seconds());
1304 return result;
1305}
1306
1307/* In smb_proc_read and smb_proc_write we do not retry, because the
1308 file-id would not be valid after a reconnection. */
1309
1310static void
1311smb_proc_read_data(struct smb_request *req)
1312{
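 /* the reply carries a 3-byte data block header (type byte plus
    16-bit length) ahead of the payload; receive it into rq_buffer
    so that the page gets file data only */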
1313 req->rq_iov[0].iov_base = req->rq_buffer;
1314 req->rq_iov[0].iov_len = 3;
1315
1316 req->rq_iov[1].iov_base = req->rq_page;
1317 req->rq_iov[1].iov_len = req->rq_rsize;
1318 req->rq_iovlen = 2;
1319
1320 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
1321}
1322
1323static int
1324smb_proc_read(struct inode *inode, loff_t offset, int count, char *data)
1325{
1326 struct smb_sb_info *server = server_from_inode(inode);
1327 __u16 returned_count, data_len;
1328 unsigned char *buf;
1329 int result;
1330 struct smb_request *req;
1331 u8 rbuf[4];
1332
1333 result = -ENOMEM;
1334 if (! (req = smb_alloc_request(server, 0)))
1335 goto out;
1336
1337 smb_setup_header(req, SMBread, 5, 0);
1338 buf = req->rq_header;
1339 WSET(buf, smb_vwv0, SMB_I(inode)->fileid);
1340 WSET(buf, smb_vwv1, count);
1341 DSET(buf, smb_vwv2, offset);
1342 WSET(buf, smb_vwv4, 0);
1343
1344 req->rq_page = data;
1345 req->rq_rsize = count;
1346 req->rq_callback = smb_proc_read_data;
1347 req->rq_buffer = rbuf;
1348 req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC;
1349
1350 result = smb_request_ok(req, SMBread, 5, -1);
1351 if (result < 0)
1352 goto out_free;
1353 returned_count = WVAL(req->rq_header, smb_vwv0);
1354
1355 data_len = WVAL(rbuf, 1);
1356
1357 if (returned_count != data_len) {
1358 printk(KERN_NOTICE "smb_proc_read: returned != data_len\n");
1359 printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n",
1360 returned_count, data_len);
1361 }
1362 result = data_len;
1363
1364out_free:
1365 smb_rput(req);
1366out:
1367 VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
1368 inode->i_ino, SMB_I(inode)->fileid, count, result);
1369 return result;
1370}
1371
1372static int
1373smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data)
1374{
1375 struct smb_sb_info *server = server_from_inode(inode);
1376 int result;
1377 u16 fileid = SMB_I(inode)->fileid;
1378 u8 buf[4];
1379 struct smb_request *req;
1380
1381 result = -ENOMEM;
1382 if (! (req = smb_alloc_request(server, 0)))
1383 goto out;
1384
1385 VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
1386 inode->i_ino, fileid, count, offset);
1387
1388 smb_setup_header(req, SMBwrite, 5, count + 3);
1389 WSET(req->rq_header, smb_vwv0, fileid);
1390 WSET(req->rq_header, smb_vwv1, count);
1391 DSET(req->rq_header, smb_vwv2, offset);
1392 WSET(req->rq_header, smb_vwv4, 0);
1393
1394 buf[0] = 1;
1395 WSET(buf, 1, count); /* yes, again ... */
1396 req->rq_iov[1].iov_base = buf;
1397 req->rq_iov[1].iov_len = 3;
1398 req->rq_iov[2].iov_base = (char *) data;
1399 req->rq_iov[2].iov_len = count;
1400 req->rq_iovlen = 3;
1401 req->rq_flags |= SMB_REQ_NORETRY;
1402
1403 result = smb_request_ok(req, SMBwrite, 1, 0);
1404 if (result >= 0)
1405 result = WVAL(req->rq_header, smb_vwv0);
1406
1407 smb_rput(req);
1408out:
1409 return result;
1410}
1411
1412/*
1413 * In smb_proc_readX and smb_proc_writeX we do not retry, because the
1414 * file-id would not be valid after a reconnection.
1415 */
1416
1417#define SMB_READX_MAX_PAD 64
1418static void
1419smb_proc_readX_data(struct smb_request *req)
1420{
1421 /* header length, excluding the netbios length (-4) */
1422 int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
1423 int data_off = WVAL(req->rq_header, smb_vwv6);
1424
1425 /*
1426 * Some genius made the padding to the data bytes arbitrary.
1427 * So we must first calculate the amount of padding used by the server.
1428 */
1429 data_off -= hdrlen;
1430 if (data_off > SMB_READX_MAX_PAD || data_off < 0) {
1431 PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n");
1432 PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off);
1433 req->rq_rlen = req->rq_bufsize + 1;
1434 return;
1435 }
1436 req->rq_iov[0].iov_base = req->rq_buffer;
1437 req->rq_iov[0].iov_len = data_off;
1438
1439 req->rq_iov[1].iov_base = req->rq_page;
1440 req->rq_iov[1].iov_len = req->rq_rsize;
1441 req->rq_iovlen = 2;
1442
1443 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
1444}
1445
1446static int
1447smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data)
1448{
1449 struct smb_sb_info *server = server_from_inode(inode);
1450 unsigned char *buf;
1451 int result;
1452 struct smb_request *req;
1453 static char pad[SMB_READX_MAX_PAD];
1454
1455 result = -ENOMEM;
1456 if (! (req = smb_alloc_request(server, 0)))
1457 goto out;
1458
1459 smb_setup_header(req, SMBreadX, 12, 0);
1460 buf = req->rq_header;
1461 WSET(buf, smb_vwv0, 0x00ff);
1462 WSET(buf, smb_vwv1, 0);
1463 WSET(buf, smb_vwv2, SMB_I(inode)->fileid);
1464 DSET(buf, smb_vwv3, (u32)offset); /* low 32 bits */
1465 WSET(buf, smb_vwv5, count);
1466 WSET(buf, smb_vwv6, 0);
1467 DSET(buf, smb_vwv7, 0);
1468 WSET(buf, smb_vwv9, 0);
1469 DSET(buf, smb_vwv10, (u32)(offset >> 32)); /* high 32 bits */
1470 WSET(buf, smb_vwv11, 0);
1471
1472 req->rq_page = data;
1473 req->rq_rsize = count;
1474 req->rq_callback = smb_proc_readX_data;
1475 req->rq_buffer = pad;
1476 req->rq_bufsize = SMB_READX_MAX_PAD;
1477 req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY;
1478
1479 result = smb_request_ok(req, SMBreadX, 12, -1);
1480 if (result < 0)
1481 goto out_free;
1482 result = WVAL(req->rq_header, smb_vwv5);
1483
1484out_free:
1485 smb_rput(req);
1486out:
1487 VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
1488 inode->i_ino, SMB_I(inode)->fileid, count, result);
1489 return result;
1490}
1491
1492static int
1493smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data)
1494{
1495 struct smb_sb_info *server = server_from_inode(inode);
1496 int result;
1497 u8 *p;
1498 static u8 pad[4];
1499 struct smb_request *req;
1500
1501 result = -ENOMEM;
1502 if (! (req = smb_alloc_request(server, 0)))
1503 goto out;
1504
1505 VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
1506 inode->i_ino, SMB_I(inode)->fileid, count, offset);
1507
1508 p = smb_setup_header(req, SMBwriteX, 14, count + 1);
1509 WSET(req->rq_header, smb_vwv0, 0x00ff);
1510 WSET(req->rq_header, smb_vwv1, 0);
1511 WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid);
1512 DSET(req->rq_header, smb_vwv3, (u32)offset); /* low 32 bits */
1513 DSET(req->rq_header, smb_vwv5, 0);
1514 WSET(req->rq_header, smb_vwv7, 0); /* write mode */
1515 WSET(req->rq_header, smb_vwv8, 0);
1516 WSET(req->rq_header, smb_vwv9, 0);
1517 WSET(req->rq_header, smb_vwv10, count); /* data length */
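 /* vwv11 is the offset of the data from the start of the SMB packet
    (netbios length not counted); the single pad byte sits between
    the byte count field and the data */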
1518 WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1);
1519 DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32));
1520
1521 req->rq_iov[1].iov_base = pad;
1522 req->rq_iov[1].iov_len = 1;
1523 req->rq_iov[2].iov_base = (char *) data;
1524 req->rq_iov[2].iov_len = count;
1525 req->rq_iovlen = 3;
1526 req->rq_flags |= SMB_REQ_NORETRY;
1527
1528 result = smb_request_ok(req, SMBwriteX, 6, 0);
1529 if (result >= 0)
1530 result = WVAL(req->rq_header, smb_vwv2);
1531
1532 smb_rput(req);
1533out:
1534 return result;
1535}
1536
1537int
1538smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid)
1539{
1540 struct smb_sb_info *server = server_from_dentry(dentry);
1541 char *p;
1542 int result;
1543 struct smb_request *req;
1544
1545 result = -ENOMEM;
1546 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1547 goto out;
1548
1549 p = smb_setup_header(req, SMBcreate, 3, 0);
1550 WSET(req->rq_header, smb_vwv0, attr);
1551 DSET(req->rq_header, smb_vwv1, utc2local(server, ctime));
1552 result = smb_simple_encode_path(req, &p, dentry, NULL);
1553 if (result < 0)
1554 goto out_free;
1555 smb_setup_bcc(req, p);
1556
1557 result = smb_request_ok(req, SMBcreate, 1, 0);
1558 if (result < 0)
1559 goto out_free;
1560
1561 *fileid = WVAL(req->rq_header, smb_vwv0);
1562 result = 0;
1563
1564out_free:
1565 smb_rput(req);
1566out:
1567 return result;
1568}
1569
1570int
1571smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry)
1572{
1573 struct smb_sb_info *server = server_from_dentry(old_dentry);
1574 char *p;
1575 int result;
1576 struct smb_request *req;
1577
1578 result = -ENOMEM;
1579 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1580 goto out;
1581
1582 p = smb_setup_header(req, SMBmv, 1, 0);
1583 WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR);
1584 result = smb_simple_encode_path(req, &p, old_dentry, NULL);
1585 if (result < 0)
1586 goto out_free;
1587 result = smb_simple_encode_path(req, &p, new_dentry, NULL);
1588 if (result < 0)
1589 goto out_free;
1590 smb_setup_bcc(req, p);
1591
1592 if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0)
1593 goto out_free;
1594 result = 0;
1595
1596out_free:
1597 smb_rput(req);
1598out:
1599 return result;
1600}
1601
1602/*
1603 * Code common to mkdir and rmdir.
1604 */
1605static int
1606smb_proc_generic_command(struct dentry *dentry, __u8 command)
1607{
1608 struct smb_sb_info *server = server_from_dentry(dentry);
1609 char *p;
1610 int result;
1611 struct smb_request *req;
1612
1613 result = -ENOMEM;
1614 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1615 goto out;
1616
1617 p = smb_setup_header(req, command, 0, 0);
1618 result = smb_simple_encode_path(req, &p, dentry, NULL);
1619 if (result < 0)
1620 goto out_free;
1621 smb_setup_bcc(req, p);
1622
1623 result = smb_request_ok(req, command, 0, 0);
1624 if (result < 0)
1625 goto out_free;
1626 result = 0;
1627
1628out_free:
1629 smb_rput(req);
1630out:
1631 return result;
1632}
1633
1634int
1635smb_proc_mkdir(struct dentry *dentry)
1636{
1637 return smb_proc_generic_command(dentry, SMBmkdir);
1638}
1639
1640int
1641smb_proc_rmdir(struct dentry *dentry)
1642{
1643 return smb_proc_generic_command(dentry, SMBrmdir);
1644}
1645
1646#if SMBFS_POSIX_UNLINK
1647/*
1648 * Removes the read-only attribute from a file. Used by unlink to give
1649 * POSIX semantics.
1650 */
1651static int
1652smb_set_rw(struct dentry *dentry,struct smb_sb_info *server)
1653{
1654 int result;
1655 struct smb_fattr fattr;
1656
1657 /* FIXME: cifsUE should allow removing a readonly file. */
1658
1659 /* first get current attribute */
1660 smb_init_dirent(server, &fattr);
1661 result = server->ops->getattr(server, dentry, &fattr);
1662 smb_finish_dirent(server, &fattr);
1663 if (result < 0)
1664 return result;
1665
1666 /* if RONLY attribute is set, remove it */
1667 if (fattr.attr & aRONLY) { /* read only attribute is set */
1668 fattr.attr &= ~aRONLY;
1669 result = smb_proc_setattr_core(server, dentry, fattr.attr);
1670 }
1671 return result;
1672}
1673#endif
1674
1675int
1676smb_proc_unlink(struct dentry *dentry)
1677{
1678 struct smb_sb_info *server = server_from_dentry(dentry);
1679 int flag = 0;
1680 char *p;
1681 int result;
1682 struct smb_request *req;
1683
1684 result = -ENOMEM;
1685 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1686 goto out;
1687
1688 retry:
1689 p = smb_setup_header(req, SMBunlink, 1, 0);
1690 WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN);
1691 result = smb_simple_encode_path(req, &p, dentry, NULL);
1692 if (result < 0)
1693 goto out_free;
1694 smb_setup_bcc(req, p);
1695
1696 if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) {
1697#if SMBFS_POSIX_UNLINK
1698 if (result == -EACCES && !flag) {
1699 /* POSIX semantics are for the read-only state
1700 of a file to be ignored in unlink(). In the
1701 SMB world an unlink() is refused on a
1702 read-only file. To make things easier for
1703 unix users we try to override the file's
1704 permissions if the unlink fails with the
1705 right error.
1706 This introduces a race condition that could
1707 lead to a file being written by someone who
1708 shouldn't have access, but as far as I can
1709 tell that is unavoidable. */
1710
1711 /* remove RONLY attribute and try again */
1712 result = smb_set_rw(dentry,server);
1713 if (result == 0) {
1714 flag = 1;
1715 req->rq_flags = 0;
1716 goto retry;
1717 }
1718 }
1719#endif
1720 goto out_free;
1721 }
1722 result = 0;
1723
1724out_free:
1725 smb_rput(req);
1726out:
1727 return result;
1728}
1729
1730int
1731smb_proc_flush(struct smb_sb_info *server, __u16 fileid)
1732{
1733 int result;
1734 struct smb_request *req;
1735
1736 result = -ENOMEM;
1737 if (! (req = smb_alloc_request(server, 0)))
1738 goto out;
1739
1740 smb_setup_header(req, SMBflush, 1, 0);
1741 WSET(req->rq_header, smb_vwv0, fileid);
1742 req->rq_flags |= SMB_REQ_NORETRY;
1743 result = smb_request_ok(req, SMBflush, 0, 0);
1744
1745 smb_rput(req);
1746out:
1747 return result;
1748}
1749
1750static int
1751smb_proc_trunc32(struct inode *inode, loff_t length)
1752{
1753 /*
1754 * Writing 0 bytes is old-SMB magic for truncating files.
1755 * MAX_NON_LFS should prevent this from being called with too
1756 * large an offset.
1757 */
1758 return smb_proc_write(inode, length, 0, NULL);
1759}
1760
1761static int
1762smb_proc_trunc64(struct inode *inode, loff_t length)
1763{
1764 struct smb_sb_info *server = server_from_inode(inode);
1765 int result;
1766 char *param;
1767 char *data;
1768 struct smb_request *req;
1769
1770 result = -ENOMEM;
1771 if (! (req = smb_alloc_request(server, 14)))
1772 goto out;
1773
1774 param = req->rq_buffer;
1775 data = req->rq_buffer + 6;
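 /* TRANSACT2_SETFILEINFO: 6 parameter bytes (fid, info level,
    reserved word) plus 8 data bytes with the new end-of-file offset */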
1776
1777 /* FIXME: must we also set allocation size? winNT seems to do that */
1778 WSET(param, 0, SMB_I(inode)->fileid);
1779 WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO);
1780 WSET(param, 4, 0);
1781 LSET(data, 0, length);
1782
1783 req->rq_trans2_command = TRANSACT2_SETFILEINFO;
1784 req->rq_ldata = 8;
1785 req->rq_data = data;
1786 req->rq_lparm = 6;
1787 req->rq_parm = param;
1788 req->rq_flags |= SMB_REQ_NORETRY;
1789 result = smb_add_request(req);
1790 if (result < 0)
1791 goto out_free;
1792
1793 result = 0;
1794 if (req->rq_rcls != 0)
1795 result = smb_errno(req);
1796
1797out_free:
1798 smb_rput(req);
1799out:
1800 return result;
1801}
1802
1803static int
1804smb_proc_trunc95(struct inode *inode, loff_t length)
1805{
1806 struct smb_sb_info *server = server_from_inode(inode);
1807 int result = smb_proc_trunc32(inode, length);
1808
1809 /*
1810 * win9x doesn't appear to update the size immediately.
1811 * It will return the old file size after the truncate,
1812 * confusing smbfs. So we force an update.
1813 *
1814 * FIXME: is this still necessary?
1815 */
1816 smb_proc_flush(server, SMB_I(inode)->fileid);
1817 return result;
1818}
1819
1820static void
1821smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
1822{
1823 memset(fattr, 0, sizeof(*fattr));
1824
1825 fattr->f_nlink = 1;
1826 fattr->f_uid = server->mnt->uid;
1827 fattr->f_gid = server->mnt->gid;
1828 fattr->f_unix = 0;
1829}
1830
1831static void
1832smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
1833{
1834 if (fattr->f_unix)
1835 return;
1836
1837 fattr->f_mode = server->mnt->file_mode;
1838 if (fattr->attr & aDIR) {
1839 fattr->f_mode = server->mnt->dir_mode;
1840 fattr->f_size = SMB_ST_BLKSIZE;
1841 }
1842 /* Check the read-only flag */
1843 if (fattr->attr & aRONLY)
1844 fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
1845
1846 /* How many 512 byte blocks do we need for this file? */
1847 fattr->f_blocks = 0;
1848 if (fattr->f_size != 0)
1849 fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9);
1850 return;
1851}
1852
1853void
1854smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
1855 struct super_block *sb)
1856{
1857 smb_init_dirent(server, fattr);
1858 fattr->attr = aDIR;
1859 fattr->f_ino = 2; /* traditional root inode number */
1860 fattr->f_mtime = current_fs_time(sb);
1861 smb_finish_dirent(server, fattr);
1862}
1863
1864/*
1865 * Decode a dirent for old protocols
1866 *
1867 * qname is filled with the decoded, and possibly translated, name.
1868 * fattr receives decoded attributes
1869 *
1870 * Bugs Noted:
1871 * (1) Pathworks servers may pad the name with extra spaces.
1872 */
1873static char *
1874smb_decode_short_dirent(struct smb_sb_info *server, char *p,
1875 struct qstr *qname, struct smb_fattr *fattr,
1876 unsigned char *name_buf)
1877{
1878 int len;
1879
1880 /*
1881 * SMB doesn't have a concept of inode numbers ...
1882 */
1883 smb_init_dirent(server, fattr);
1884 fattr->f_ino = 0; /* FIXME: do we need this? */
1885
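 /* entry layout after the resume key: attr (1), mtime time and date
    words (4), size (4), 8.3 name (13) */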
1886 p += SMB_STATUS_SIZE; /* reserved (search_status) */
1887 fattr->attr = *p;
1888 fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1));
1889 fattr->f_mtime.tv_nsec = 0;
1890 fattr->f_size = DVAL(p, 5);
1891 fattr->f_ctime = fattr->f_mtime;
1892 fattr->f_atime = fattr->f_mtime;
1893 qname->name = p + 9;
1894 len = strnlen(qname->name, 12);
1895
1896 /*
1897 * Trim trailing blanks for Pathworks servers
1898 */
1899 while (len > 2 && qname->name[len-1] == ' ')
1900 len--;
1901
1902 smb_finish_dirent(server, fattr);
1903
1904#if 0
1905 /* FIXME: These only work for ascii chars, and recent smbmount doesn't
1906 allow the flag to be set anyway. It kills const. Remove? */
1907 switch (server->opt.case_handling) {
1908 case SMB_CASE_UPPER:
1909 str_upper(entry->name, len);
1910 break;
1911 case SMB_CASE_LOWER:
1912 str_lower(entry->name, len);
1913 break;
1914 default:
1915 break;
1916 }
1917#endif
1918
1919 qname->len = 0;
1920 len = server->ops->convert(name_buf, SMB_MAXNAMELEN,
1921 qname->name, len,
1922 server->remote_nls, server->local_nls);
1923 if (len > 0) {
1924 qname->len = len;
1925 qname->name = name_buf;
1926 DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name);
1927 }
1928
1929 return p + 22;
1930}
1931
1932/*
1933 * This routine is used to read in directory entries from the network.
1934 * Note that it is for short directory name seeks, i.e.: protocol <
1935 * SMB_PROTOCOL_LANMAN2
1936 */
1937static int
1938smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir,
1939 struct smb_cache_control *ctl)
1940{
1941 struct dentry *dir = filp->f_path.dentry;
1942 struct smb_sb_info *server = server_from_dentry(dir);
1943 struct qstr qname;
1944 struct smb_fattr fattr;
1945 char *p;
1946 int result;
1947 int i, first, entries_seen, entries;
1948 int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE;
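 /* reserve roughly 100 bytes for the response header; each returned
    entry is a fixed SMB_DIRINFO_SIZE bytes */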
1949 __u16 bcc;
1950 __u16 count;
1951 char status[SMB_STATUS_SIZE];
1952 static struct qstr mask = {
1953 .name = "*.*",
1954 .len = 3,
1955 };
1956 unsigned char *last_status;
1957 struct smb_request *req;
1958 unsigned char *name_buf;
1959
1960 VERBOSE("%s/%s\n", DENTRY_PATH(dir));
1961
1962 lock_kernel();
1963
1964 result = -ENOMEM;
1965 if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL)))
1966 goto out;
1967
1968 first = 1;
1969 entries = 0;
1970 entries_seen = 2; /* implicit . and .. */
1971
1972 result = -ENOMEM;
1973 if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
1974 goto out_name;
1975
1976 while (1) {
1977 p = smb_setup_header(req, SMBsearch, 2, 0);
1978 WSET(req->rq_header, smb_vwv0, entries_asked);
1979 WSET(req->rq_header, smb_vwv1, aDIR);
1980 if (first == 1) {
1981 result = smb_simple_encode_path(req, &p, dir, &mask);
1982 if (result < 0)
1983 goto out_free;
1984 if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) {
1985 result = -ENAMETOOLONG;
1986 goto out_free;
1987 }
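 /* first request: append an empty variable block (type 5, length 0)
    as the resume key */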
1988 *p++ = 5;
1989 WSET(p, 0, 0);
1990 p += 2;
1991 first = 0;
1992 } else {
1993 if (p + 5 + SMB_STATUS_SIZE >
1994 (char *)req->rq_buffer + req->rq_bufsize) {
1995 result = -ENAMETOOLONG;
1996 goto out_free;
1997 }
1998
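 /* continuation: an empty ASCII block (type 4) followed by a
    variable block (type 5) carrying the resume key saved from the
    previous reply */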
1999 *p++ = 4;
2000 *p++ = 0;
2001 *p++ = 5;
2002 WSET(p, 0, SMB_STATUS_SIZE);
2003 p += 2;
2004 memcpy(p, status, SMB_STATUS_SIZE);
2005 p += SMB_STATUS_SIZE;
2006 }
2007
2008 smb_setup_bcc(req, p);
2009
2010 result = smb_request_ok(req, SMBsearch, 1, -1);
2011 if (result < 0) {
2012 if ((req->rq_rcls == ERRDOS) &&
2013 (req->rq_err == ERRnofiles))
2014 break;
2015 goto out_free;
2016 }
2017 count = WVAL(req->rq_header, smb_vwv0);
2018 if (count <= 0)
2019 break;
2020
2021 result = -EIO;
2022 bcc = smb_bcc(req->rq_header);
2023 if (bcc != count * SMB_DIRINFO_SIZE + 3)
2024 goto out_free;
2025 p = req->rq_buffer + 3;
2026
2027
2028 /* Make sure the response fits in the buffer. Fixed sized
2029 entries means we don't have to check in the decode loop. */
2030
2031 last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE;
2032
2033 if (last_status + SMB_DIRINFO_SIZE >=
2034 req->rq_buffer + req->rq_bufsize) {
2035 printk(KERN_ERR "smb_proc_readdir_short: "
2036 "last dir entry outside buffer! "
2037 "%d@%p %d@%p\n", SMB_DIRINFO_SIZE, last_status,
2038 req->rq_bufsize, req->rq_buffer);
2039 goto out_free;
2040 }
2041
2042 /* Read the last entry into the status field. */
2043 memcpy(status, last_status, SMB_STATUS_SIZE);
2044
2045
2046 /* Now we are ready to parse smb directory entries. */
2047
2048 for (i = 0; i < count; i++) {
2049 p = smb_decode_short_dirent(server, p,
2050 &qname, &fattr, name_buf);
2051 if (qname.len == 0)
2052 continue;
2053
2054 if (entries_seen == 2 && qname.name[0] == '.') {
2055 if (qname.len == 1)
2056 continue;
2057 if (qname.name[1] == '.' && qname.len == 2)
2058 continue;
2059 }
2060 if (!smb_fill_cache(filp, dirent, filldir, ctl,
2061 &qname, &fattr))
2062 ; /* stop reading? */
2063 entries_seen++;
2064 }
2065 }
2066 result = entries;
2067
2068out_free:
2069 smb_rput(req);
2070out_name:
2071 kfree(name_buf);
2072out:
2073 unlock_kernel();
2074 return result;
2075}
2076
2077static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
2078{
2079 u64 size, disk_bytes;
2080
2081 /* FIXME: verify nls support. all is sent as utf8? */
2082
2083 fattr->f_unix = 1;
2084 fattr->f_mode = 0;
2085
2086 /* FIXME: use the uniqueID from the remote instead? */
2087 /* 0 L file size in bytes */
2088 /* 8 L file size on disk in bytes (block count) */
2089 /* 40 L uid */
2090 /* 48 L gid */
2091 /* 56 W file type */
2092 /* 60 L devmajor */
2093 /* 68 L devminor */
2094 /* 76 L unique ID (inode) */
2095 /* 84 L permissions */
2096 /* 92 L link count */
2097
2098 size = LVAL(p, 0);
2099 disk_bytes = LVAL(p, 8);
2100
2101 /*
2102 * Some samba versions round up on-disk byte usage
2103 * to 1MB boundaries, making it useless. When seeing
2104 * that, use the size instead.
2105 */
2106 if (!(disk_bytes & 0xfffff))
2107 disk_bytes = size+511;
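 /* the +511 makes the shift below round up to whole 512-byte blocks */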
2108
2109 fattr->f_size = size;
2110 fattr->f_blocks = disk_bytes >> 9;
2111 fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
2112 fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
2113 fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
2114
2115 if (server->mnt->flags & SMB_MOUNT_UID)
2116 fattr->f_uid = server->mnt->uid;
2117 else
2118 fattr->f_uid = LVAL(p, 40);
2119
2120 if (server->mnt->flags & SMB_MOUNT_GID)
2121 fattr->f_gid = server->mnt->gid;
2122 else
2123 fattr->f_gid = LVAL(p, 48);
2124
2125 fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
2126
2127 if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
2128 __u64 major = LVAL(p, 60);
2129 __u64 minor = LVAL(p, 68);
2130
2131 fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
2132 if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
2133 MINOR(fattr->f_rdev) != (minor & 0xffffffff))
2134 fattr->f_rdev = 0;
2135 }
2136
2137 fattr->f_mode |= LVAL(p, 84);
2138
2139 if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
2140 (S_ISDIR(fattr->f_mode)) )
2141 fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR;
2142 else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
2143 !(S_ISDIR(fattr->f_mode)) )
2144 fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) |
2145 (fattr->f_mode & S_IFMT);
2146
2147}
2148
2149/*
2150 * Interpret a long filename structure using the specified info level:
2151 * level 1 for anything below NT1 protocol
2152 * level 260 for NT1 protocol
2153 *
2154 * qname is filled with the decoded, and possibly translated, name
2155 * fattr receives decoded attributes.
2156 *
2157 * Bugs Noted:
2158 * (1) Win NT 4.0 appends a null byte to names and counts it in the length!
2159 */
2160static char *
2161smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
2162 struct qstr *qname, struct smb_fattr *fattr,
2163 unsigned char *name_buf)
2164{
2165 char *result;
2166 unsigned int len = 0;
2167 int n;
2168 __u16 date, time;
2169 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
2170
2171 /*
2172 * SMB doesn't have a concept of inode numbers ...
2173 */
2174 smb_init_dirent(server, fattr);
2175 fattr->f_ino = 0; /* FIXME: do we need this? */
2176
2177 switch (level) {
2178 case 1:
2179 len = *((unsigned char *) p + 22);
2180 qname->name = p + 23;
2181 result = p + 24 + len;
2182
2183 date = WVAL(p, 0);
2184 time = WVAL(p, 2);
2185 fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2186 fattr->f_ctime.tv_nsec = 0;
2187
2188 date = WVAL(p, 4);
2189 time = WVAL(p, 6);
2190 fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
2191 fattr->f_atime.tv_nsec = 0;
2192
2193 date = WVAL(p, 8);
2194 time = WVAL(p, 10);
2195 fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2196 fattr->f_mtime.tv_nsec = 0;
2197 fattr->f_size = DVAL(p, 12);
2198 /* ULONG allocation size */
2199 fattr->attr = WVAL(p, 20);
2200
2201 VERBOSE("info 1 at %p, len=%d, name=%.*s\n",
2202 p, len, len, qname->name);
2203 break;
2204 case 260:
2205 result = p + WVAL(p, 0);
2206 len = DVAL(p, 60);
2207 if (len > 255) len = 255;
2208 /* NT4 null terminates, unless we are using unicode ... */
2209 qname->name = p + 94;
2210 if (!unicode && len && qname->name[len-1] == '\0')
2211 len--;
2212
2213 fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8));
2214 fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16));
2215 fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24));
2216 /* change time (32) */
2217 fattr->f_size = LVAL(p, 40);
2218 /* alloc size (48) */
2219 fattr->attr = DVAL(p, 56);
2220
2221 VERBOSE("info 260 at %p, len=%d, name=%.*s\n",
2222 p, len, len, qname->name);
2223 break;
2224 case SMB_FIND_FILE_UNIX:
2225 result = p + WVAL(p, 0);
2226 qname->name = p + 108;
2227
2228 len = strlen(qname->name);
2229 /* FIXME: should we check the length?? */
2230
2231 p += 8;
2232 smb_decode_unix_basic(fattr, server, p);
2233 VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
2234 p, len, len, qname->name);
2235 break;
2236 default:
2237 PARANOIA("Unknown info level %d\n", level);
2238 result = p + WVAL(p, 0);
2239 goto out;
2240 }
2241
2242 smb_finish_dirent(server, fattr);
2243
2244#if 0
2245 /* FIXME: These only work for ascii chars, and recent smbmount doesn't
2246 allow the flag to be set anyway. Remove? */
2247 switch (server->opt.case_handling) {
2248 case SMB_CASE_UPPER:
2249 str_upper(qname->name, len);
2250 break;
2251 case SMB_CASE_LOWER:
2252 str_lower(qname->name, len);
2253 break;
2254 default:
2255 break;
2256 }
2257#endif
2258
2259 qname->len = 0;
2260 n = server->ops->convert(name_buf, SMB_MAXNAMELEN,
2261 qname->name, len,
2262 server->remote_nls, server->local_nls);
2263 if (n > 0) {
2264 qname->len = n;
2265 qname->name = name_buf;
2266 }
2267
2268out:
2269 return result;
2270}
2271
2272/* findfirst/findnext flags */
2273#define SMB_CLOSE_AFTER_FIRST (1<<0)
2274#define SMB_CLOSE_IF_END (1<<1)
2275#define SMB_REQUIRE_RESUME_KEY (1<<2)
2276#define SMB_CONTINUE_BIT (1<<3)
2277
2278/*
2279 * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in
2280 * source/libsmb/clilist.c. When looking for smb bugs in the readdir code,
2281 * go there for advice.
2282 *
2283 * Bugs Noted:
2284 * (1) When using Info Level 1 Win NT 4.0 truncates directory listings
2285 * for certain patterns of names and/or lengths. The breakage pattern
2286 * is completely reproducible and can be toggled by the creation of a
2287 * single file. (E.g. echo hi >foo breaks, rm -f foo works.)
2288 */
2289static int
2290smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
2291 struct smb_cache_control *ctl)
2292{
2293 struct dentry *dir = filp->f_path.dentry;
2294 struct smb_sb_info *server = server_from_dentry(dir);
2295 struct qstr qname;
2296 struct smb_fattr fattr;
2297
2298 unsigned char *p, *lastname;
2299 char *mask, *param;
2300 __u16 command;
2301 int first, entries_seen;
2302
2303 /* Both NT and OS/2 accept info level 1 (but see note below). */
2304 int info_level = 260;
2305 const int max_matches = 512;
2306
2307 unsigned int ff_searchcount = 0;
2308 unsigned int ff_eos = 0;
2309 unsigned int ff_lastname = 0;
2310 unsigned int ff_dir_handle = 0;
2311 unsigned int loop_count = 0;
2312 unsigned int mask_len, i;
2313 int result;
2314 struct smb_request *req;
2315 unsigned char *name_buf;
2316 static struct qstr star = {
2317 .name = "*",
2318 .len = 1,
2319 };
2320
2321 lock_kernel();
2322
2323 /*
2324 * We always prefer unix style. Use info level 1 for older
2325 * servers that don't do 260.
2326 */
2327 if (server->opt.capabilities & SMB_CAP_UNIX)
2328 info_level = SMB_FIND_FILE_UNIX;
2329 else if (server->opt.protocol < SMB_PROTOCOL_NT1)
2330 info_level = 1;
2331
2332 result = -ENOMEM;
2333 if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL)))
2334 goto out;
2335 if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
2336 goto out_name;
2337 param = req->rq_buffer;
2338
2339 /*
2340 * Encode the initial path
2341 */
2342 mask = param + 12;
2343
2344 result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star);
2345 if (result <= 0)
2346 goto out_free;
2347 mask_len = result - 1; /* mask_len is strlen, not #bytes */
2348 result = 0;
2349 first = 1;
2350 VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask);
2351
2352 entries_seen = 2;
2353 ff_eos = 0;
2354
2355 while (ff_eos == 0) {
2356 loop_count += 1;
2357 if (loop_count > 10) {
2358 printk(KERN_WARNING "smb_proc_readdir_long: "
2359 "Looping in FIND_NEXT??\n");
2360 result = -EIO;
2361 break;
2362 }
2363
2364 if (first != 0) {
2365 command = TRANSACT2_FINDFIRST;
2366 WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
2367 WSET(param, 2, max_matches); /* max count */
2368 WSET(param, 4, SMB_CLOSE_IF_END);
2369 WSET(param, 6, info_level);
2370 DSET(param, 8, 0);
2371 } else {
2372 command = TRANSACT2_FINDNEXT;
2373
2374 VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n",
2375 ff_dir_handle, ff_lastname, mask_len, mask);
2376
2377 WSET(param, 0, ff_dir_handle); /* search handle */
2378 WSET(param, 2, max_matches); /* max count */
2379 WSET(param, 4, info_level);
2380 DSET(param, 6, 0);
2381 WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END);
2382 }
2383
2384 req->rq_trans2_command = command;
2385 req->rq_ldata = 0;
2386 req->rq_data = NULL;
2387 req->rq_lparm = 12 + mask_len + 1;
2388 req->rq_parm = param;
2389 req->rq_flags = 0;
2390 result = smb_add_request(req);
2391 if (result < 0) {
2392 PARANOIA("error=%d, breaking\n", result);
2393 break;
2394 }
2395
2396 if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
2397 /* a damn Win95 bug - sometimes it clags if you
2398 ask it too fast */
2399 schedule_timeout_interruptible(msecs_to_jiffies(200));
2400 continue;
2401 }
2402
2403 if (req->rq_rcls != 0) {
2404 result = smb_errno(req);
2405 PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n",
2406 mask, result, req->rq_rcls, req->rq_err);
2407 break;
2408 }
2409
2410 /* parse out some important return info */
2411 if (first != 0) {
2412 ff_dir_handle = WVAL(req->rq_parm, 0);
2413 ff_searchcount = WVAL(req->rq_parm, 2);
2414 ff_eos = WVAL(req->rq_parm, 4);
2415 ff_lastname = WVAL(req->rq_parm, 8);
2416 } else {
2417 ff_searchcount = WVAL(req->rq_parm, 0);
2418 ff_eos = WVAL(req->rq_parm, 2);
2419 ff_lastname = WVAL(req->rq_parm, 6);
2420 }
2421
2422 if (ff_searchcount == 0)
2423 break;
2424
2425 /* Now we are ready to parse smb directory entries. */
2426
2427 /* point to the data bytes */
2428 p = req->rq_data;
2429 for (i = 0; i < ff_searchcount; i++) {
2430 /* make sure we stay within the buffer */
2431 if (p >= req->rq_data + req->rq_ldata) {
2432 printk(KERN_ERR "smb_proc_readdir_long: "
2433 "dirent pointer outside buffer! "
2434 "%p %d@%p\n",
2435 p, req->rq_ldata, req->rq_data);
2436 result = -EIO; /* always a comm. error? */
2437 goto out_free;
2438 }
2439
2440 p = smb_decode_long_dirent(server, p, info_level,
2441 &qname, &fattr, name_buf);
2442
2443 /* ignore . and .. from the server */
2444 if (entries_seen == 2 && qname.name[0] == '.') {
2445 if (qname.len == 1)
2446 continue;
2447 if (qname.name[1] == '.' && qname.len == 2)
2448 continue;
2449 }
2450
2451 if (!smb_fill_cache(filp, dirent, filldir, ctl,
2452 &qname, &fattr))
2453 ; /* stop reading? */
2454 entries_seen++;
2455 }
2456
2457 VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos);
2458
2459 /*
2460 * We might need the lastname for continuations.
2461 *
2462 * Note that some servers (win95?) point to the filename and
2463 * others (NT4, Samba using NT1) to the dir entry. We assume
2464 * here that those who do not point to a filename do not need
2465 * this info to continue the listing.
2466 *
2467 * OS/2 needs this and talks infolevel 1.
2468 * NetApps want lastname with infolevel 260.
2469 * win2k want lastname with infolevel 260, and points to
2470 * the record not to the name.
2471 * Samba+CifsUnixExt doesn't need lastname.
2472 *
2473 * Both are happy if we return the data they point to. So we do.
2474 * (FIXME: above is not true with win2k)
2475 */
2476 mask_len = 0;
2477 if (info_level != SMB_FIND_FILE_UNIX &&
2478 ff_lastname > 0 && ff_lastname < req->rq_ldata) {
2479 lastname = req->rq_data + ff_lastname;
2480
2481 switch (info_level) {
2482 case 260:
2483 mask_len = req->rq_ldata - ff_lastname;
2484 break;
2485 case 1:
2486 /* lastname points to a length byte */
2487 mask_len = *lastname++;
2488 if (ff_lastname + 1 + mask_len > req->rq_ldata)
2489 mask_len = req->rq_ldata - ff_lastname - 1;
2490 break;
2491 }
2492
2493 /*
2494 * Update the mask string for the next message.
2495 */
2496 if (mask_len > 255)
2497 mask_len = 255;
2498 if (mask_len)
2499 strncpy(mask, lastname, mask_len);
2500 }
2501 mask_len = strnlen(mask, mask_len);
2502 VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n",
2503 mask_len, ff_lastname, req->rq_ldata, mask_len, mask);
2504
2505 first = 0;
2506 loop_count = 0;
2507 }
2508
2509out_free:
2510 smb_rput(req);
2511out_name:
2512 kfree(name_buf);
2513out:
2514 unlock_kernel();
2515 return result;
2516}
2517
2518/*
2519 * This version uses the trans2 TRANSACT2_FINDFIRST message
2520 * to get the attribute data.
2521 *
2522 * Bugs Noted:
2523 */
2524static int
2525smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
2526 struct smb_fattr *fattr)
2527{
2528 char *param, *mask;
2529 __u16 date, time;
2530 int mask_len, result;
2531 struct smb_request *req;
2532
2533 result = -ENOMEM;
2534 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2535 goto out;
2536 param = req->rq_buffer;
2537 mask = param + 12;
2538
2539 mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL);
2540 if (mask_len < 0) {
2541 result = mask_len;
2542 goto out_free;
2543 }
2544 VERBOSE("name=%s, len=%d\n", mask, mask_len);
2545 WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
2546 WSET(param, 2, 1); /* max count */
2547 WSET(param, 4, 1); /* close after this call */
2548 WSET(param, 6, 1); /* info_level */
2549 DSET(param, 8, 0);
2550
2551 req->rq_trans2_command = TRANSACT2_FINDFIRST;
2552 req->rq_ldata = 0;
2553 req->rq_data = NULL;
2554 req->rq_lparm = 12 + mask_len;
2555 req->rq_parm = param;
2556 req->rq_flags = 0;
2557 result = smb_add_request(req);
2558 if (result < 0)
2559 goto out_free;
2560 if (req->rq_rcls != 0) {
2561 result = smb_errno(req);
2562#ifdef SMBFS_PARANOIA
2563 if (result != -ENOENT)
2564 PARANOIA("error for %s, rcls=%d, err=%d\n",
2565 mask, req->rq_rcls, req->rq_err);
2566#endif
2567 goto out_free;
2568 }
2569 /* Make sure we got enough data ... */
2570 result = -EINVAL;
2571 if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) {
2572 PARANOIA("bad result for %s, len=%d, count=%d\n",
2573 mask, req->rq_ldata, WVAL(req->rq_parm, 2));
2574 goto out_free;
2575 }
2576
2577 /*
2578 * Decode the response into the fattr ...
2579 */
2580 date = WVAL(req->rq_data, 0);
2581 time = WVAL(req->rq_data, 2);
2582 fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2583 fattr->f_ctime.tv_nsec = 0;
2584
2585 date = WVAL(req->rq_data, 4);
2586 time = WVAL(req->rq_data, 6);
2587 fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
2588 fattr->f_atime.tv_nsec = 0;
2589
2590 date = WVAL(req->rq_data, 8);
2591 time = WVAL(req->rq_data, 10);
2592 fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2593 fattr->f_mtime.tv_nsec = 0;
2594 VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
2595 mask, date, time, fattr->f_mtime.tv_sec);
2596 fattr->f_size = DVAL(req->rq_data, 12);
2597 /* ULONG allocation size */
2598 fattr->attr = WVAL(req->rq_data, 20);
2599 result = 0;
2600
2601out_free:
2602 smb_rput(req);
2603out:
2604 return result;
2605}
2606
2607static int
2608smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
2609 struct smb_fattr *fattr)
2610{
2611 int result;
2612 char *p;
2613 struct smb_request *req;
2614
2615 result = -ENOMEM;
2616 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2617 goto out;
2618
2619 p = smb_setup_header(req, SMBgetatr, 0, 0);
2620 result = smb_simple_encode_path(req, &p, dir, NULL);
2621 if (result < 0)
2622 goto out_free;
2623 smb_setup_bcc(req, p);
2624
2625 if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0)
2626 goto out_free;
2627 fattr->attr = WVAL(req->rq_header, smb_vwv0);
2628 fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1));
2629 fattr->f_mtime.tv_nsec = 0;
2630 fattr->f_size = DVAL(req->rq_header, smb_vwv3);
2631 fattr->f_ctime = fattr->f_mtime;
2632 fattr->f_atime = fattr->f_mtime;
2633#ifdef SMBFS_DEBUG_TIMESTAMP
2634 printk("getattr_core: %s/%s, mtime=%ld\n",
2635 DENTRY_PATH(dir), fattr->f_mtime);
2636#endif
2637 result = 0;
2638
2639out_free:
2640 smb_rput(req);
2641out:
2642 return result;
2643}
2644
2645/*
2646 * Bugs Noted:
2647 * (1) Win 95 swaps the date and time fields in the standard info level.
2648 */
2649static int
2650smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir,
2651 struct smb_request *req, int infolevel)
2652{
2653 char *p, *param;
2654 int result;
2655
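 /* TRANSACT2_QPATHINFO parameter block: info level word, reserved
    dword, then the encoded path */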
2656 param = req->rq_buffer;
2657 WSET(param, 0, infolevel);
2658 DSET(param, 2, 0);
2659 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
2660 if (result < 0)
2661 goto out;
2662 p = param + 6 + result;
2663
2664 req->rq_trans2_command = TRANSACT2_QPATHINFO;
2665 req->rq_ldata = 0;
2666 req->rq_data = NULL;
2667 req->rq_lparm = p - param;
2668 req->rq_parm = param;
2669 req->rq_flags = 0;
2670 result = smb_add_request(req);
2671 if (result < 0)
2672 goto out;
2673 if (req->rq_rcls != 0) {
2674 VERBOSE("for %s: result=%d, rcls=%d, err=%d\n",
2675 &param[6], result, req->rq_rcls, req->rq_err);
2676 result = smb_errno(req);
2677 goto out;
2678 }
2679 result = -ENOENT;
2680 if (req->rq_ldata < 22) {
2681 PARANOIA("not enough data for %s, len=%d\n",
2682 &param[6], req->rq_ldata);
2683 goto out;
2684 }
2685
2686 result = 0;
2687out:
2688 return result;
2689}
2690
2691static int
2692smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir,
2693 struct smb_fattr *attr)
2694{
2695 u16 date, time;
2696 int off_date = 0, off_time = 2;
2697 int result;
2698 struct smb_request *req;
2699
2700 result = -ENOMEM;
2701 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2702 goto out;
2703
2704 result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD);
2705 if (result < 0)
2706 goto out_free;
2707
2708 /*
2709 * Kludge alert: Win 95 swaps the date and time field,
2710 * contrary to the CIFS docs and Win NT practice.
2711 */
2712 if (server->mnt->flags & SMB_MOUNT_WIN95) {
2713 off_date = 2;
2714 off_time = 0;
2715 }
2716 date = WVAL(req->rq_data, off_date);
2717 time = WVAL(req->rq_data, off_time);
2718 attr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2719 attr->f_ctime.tv_nsec = 0;
2720
2721 date = WVAL(req->rq_data, 4 + off_date);
2722 time = WVAL(req->rq_data, 4 + off_time);
2723 attr->f_atime.tv_sec = date_dos2unix(server, date, time);
2724 attr->f_atime.tv_nsec = 0;
2725
2726 date = WVAL(req->rq_data, 8 + off_date);
2727 time = WVAL(req->rq_data, 8 + off_time);
2728 attr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2729 attr->f_mtime.tv_nsec = 0;
2730#ifdef SMBFS_DEBUG_TIMESTAMP
2731 printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
2732 DENTRY_PATH(dir), date, time, attr->f_mtime.tv_sec);
2733#endif
2734 attr->f_size = DVAL(req->rq_data, 12);
2735 attr->attr = WVAL(req->rq_data, 20);
2736
2737out_free:
2738 smb_rput(req);
2739out:
2740 return result;
2741}
2742
2743static int
2744smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir,
2745 struct smb_fattr *attr)
2746{
2747 struct smb_request *req;
2748 int result;
2749
2750 result = -ENOMEM;
2751 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2752 goto out;
2753
2754 result = smb_proc_getattr_trans2(server, dir, req,
2755 SMB_QUERY_FILE_ALL_INFO);
2756 if (result < 0)
2757 goto out_free;
2758
2759 attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0));
2760 attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8));
2761 attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16));
2762 /* change (24) */
2763 attr->attr = WVAL(req->rq_data, 32);
2764 /* pad? (34) */
2765 /* allocated size (40) */
2766 attr->f_size = LVAL(req->rq_data, 48);
2767
2768out_free:
2769 smb_rput(req);
2770out:
2771 return result;
2772}
2773
2774static int
2775smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
2776 struct smb_fattr *attr)
2777{
2778 struct smb_request *req;
2779 int result;
2780
2781 result = -ENOMEM;
2782 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2783 goto out;
2784
2785 result = smb_proc_getattr_trans2(server, dir, req,
2786 SMB_QUERY_FILE_UNIX_BASIC);
2787 if (result < 0)
2788 goto out_free;
2789
2790 smb_decode_unix_basic(attr, server, req->rq_data);
2791
2792out_free:
2793 smb_rput(req);
2794out:
2795 return result;
2796}
2797
2798static int
2799smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir,
2800 struct smb_fattr *attr)
2801{
2802 struct inode *inode = dir->d_inode;
2803 int result;
2804
2805 /* FIXME: why not use the "all" version? */
2806 result = smb_proc_getattr_trans2_std(server, dir, attr);
2807 if (result < 0)
2808 goto out;
2809
2810 /*
2811 * None of the getattr versions here can make win9x return the right
2812 * filesize if there are changes made to an open file.
2813 * A seek-to-end does return the right size, but we only need to do
2814 * that on files we have written.
2815 */
2816 if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE &&
2817 smb_is_open(inode))
2818 {
2819 __u16 fileid = SMB_I(inode)->fileid;
2820 attr->f_size = smb_proc_seek(server, fileid, 2, 0);
2821 }
2822
2823out:
2824 return result;
2825}
2826
2827static int
2828smb_proc_ops_wait(struct smb_sb_info *server)
2829{
2830 int result;
2831
2832 result = wait_event_interruptible_timeout(server->conn_wq,
2833 server->conn_complete, 30*HZ);
2834
2835 if (!result || signal_pending(current))
2836 return -EIO;
2837
2838 return 0;
2839}
2840
2841static int
2842smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir,
2843 struct smb_fattr *fattr)
2844{
2845 int result;
2846
2847 if (smb_proc_ops_wait(server) < 0)
2848 return -EIO;
2849
2850 smb_init_dirent(server, fattr);
2851 result = server->ops->getattr(server, dir, fattr);
2852 smb_finish_dirent(server, fattr);
2853
2854 return result;
2855}
2856
2857static int
2858smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir,
2859 struct smb_cache_control *ctl)
2860{
2861 struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry);
2862
2863 if (smb_proc_ops_wait(server) < 0)
2864 return -EIO;
2865
2866 return server->ops->readdir(filp, dirent, filldir, ctl);
2867}
2868
2869int
2870smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr)
2871{
2872 struct smb_sb_info *server = server_from_dentry(dir);
2873 int result;
2874
2875 smb_init_dirent(server, fattr);
2876 result = server->ops->getattr(server, dir, fattr);
2877 smb_finish_dirent(server, fattr);
2878
2879 return result;
2880}
2881
2882
2883/*
2884 * Because of bugs in the core protocol, we use this only to set
2885 * attributes. See smb_proc_settime() below for timestamp handling.
2886 *
2887 * Bugs Noted:
2888 * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail
2889 * with an undocumented error (ERRDOS code 50). Setting
2890 * mtime to 0 allows the attributes to be set.
2891 * (2) The extra parameters following the name string aren't
2892 * in the CIFS docs, but seem to be necessary for operation.
2893 */
2894static int
2895smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
2896 __u16 attr)
2897{
2898 char *p;
2899 int result;
2900 struct smb_request *req;
2901
2902 result = -ENOMEM;
2903 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2904 goto out;
2905
2906 p = smb_setup_header(req, SMBsetatr, 8, 0);
2907 WSET(req->rq_header, smb_vwv0, attr);
2908 DSET(req->rq_header, smb_vwv1, 0); /* mtime */
2909 WSET(req->rq_header, smb_vwv3, 0); /* reserved values */
2910 WSET(req->rq_header, smb_vwv4, 0);
2911 WSET(req->rq_header, smb_vwv5, 0);
2912 WSET(req->rq_header, smb_vwv6, 0);
2913 WSET(req->rq_header, smb_vwv7, 0);
2914 result = smb_simple_encode_path(req, &p, dentry, NULL);
2915 if (result < 0)
2916 goto out_free;
2917 if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) {
2918 result = -ENAMETOOLONG;
2919 goto out_free;
2920 }
2921 *p++ = 4;
2922 *p++ = 0;
2923 smb_setup_bcc(req, p);
2924
2925 result = smb_request_ok(req, SMBsetatr, 0, 0);
2926 if (result < 0)
2927 goto out_free;
2928 result = 0;
2929
2930out_free:
2931 smb_rput(req);
2932out:
2933 return result;
2934}
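
smb_setup_header() above reserves eight parameter words (vwv0..vwv7) that the
WSET/DSET calls then fill in. A toy sketch of the general SMB word-block shape
(a one-byte word count, wct little-endian parameter words, then a 16-bit byte
count) with simplified offsets; this is not the real smbfs header geometry,
which also carries the full SMB header in front:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void put_le16(uint8_t *p, uint16_t v)
{
    p[0] = (uint8_t)(v & 0xff);
    p[1] = (uint8_t)(v >> 8);
}

int main(void)
{
    uint8_t pkt[64];
    const int wct = 8;            /* parameter word count, as for SMBsetatr */
    uint8_t *p = pkt;

    memset(pkt, 0, sizeof(pkt));
    *p++ = (uint8_t)wct;          /* word count byte */
    put_le16(p, 0x0020);          /* vwv0: attribute bits (example value) */
    p += 2 * wct;                 /* remaining vwv words stay zero */
    put_le16(p, 0);               /* byte count (bcc); data would follow */
    p += 2;

    printf("fixed part is %d bytes\n", (int)(p - pkt));
    return 0;
}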
2935
2936/*
2937 * Because of bugs in the trans2 setattr messages, we must set
2938 * attributes and timestamps separately. The core SMBsetatr
2939 * message seems to be the only reliable way to set attributes.
2940 */
2941int
2942smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr)
2943{
2944 struct smb_sb_info *server = server_from_dentry(dir);
2945 int result;
2946
2947 VERBOSE("setting %s/%s, open=%d\n",
2948 DENTRY_PATH(dir), smb_is_open(dir->d_inode));
2949 result = smb_proc_setattr_core(server, dir, fattr->attr);
2950 return result;
2951}
2952
2953/*
2954 * Sets the timestamps for a file opened with write permissions.
2955 */
2956static int
2957smb_proc_setattr_ext(struct smb_sb_info *server,
2958 struct inode *inode, struct smb_fattr *fattr)
2959{
2960 __u16 date, time;
2961 int result;
2962 struct smb_request *req;
2963
2964 result = -ENOMEM;
2965 if (! (req = smb_alloc_request(server, 0)))
2966 goto out;
2967
2968 smb_setup_header(req, SMBsetattrE, 7, 0);
2969 WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
2970 /* We don't change the creation time */
2971 WSET(req->rq_header, smb_vwv1, 0);
2972 WSET(req->rq_header, smb_vwv2, 0);
2973 date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
2974 WSET(req->rq_header, smb_vwv3, date);
2975 WSET(req->rq_header, smb_vwv4, time);
2976 date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
2977 WSET(req->rq_header, smb_vwv5, date);
2978 WSET(req->rq_header, smb_vwv6, time);
2979#ifdef SMBFS_DEBUG_TIMESTAMP
2980 printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n",
2981 date, time, fattr->f_mtime.tv_sec);
2982#endif
2983
2984 req->rq_flags |= SMB_REQ_NORETRY;
2985 result = smb_request_ok(req, SMBsetattrE, 0, 0);
2986 if (result < 0)
2987 goto out_free;
2988 result = 0;
2989out_free:
2990 smb_rput(req);
2991out:
2992 return result;
2993}
2994
2995/*
2996 * Bugs Noted:
2997 * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't
2998 * set the file's attribute flags.
2999 */
3000static int
3001smb_proc_setattr_trans2(struct smb_sb_info *server,
3002 struct dentry *dir, struct smb_fattr *fattr)
3003{
3004 __u16 date, time;
3005 char *p, *param;
3006 int result;
3007 char data[26];
3008 struct smb_request *req;
3009
3010 result = -ENOMEM;
3011 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3012 goto out;
3013 param = req->rq_buffer;
3014
3015 WSET(param, 0, 1); /* Info level SMB_INFO_STANDARD */
3016 DSET(param, 2, 0);
3017 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
3018 if (result < 0)
3019 goto out_free;
3020 p = param + 6 + result;
3021
3022 WSET(data, 0, 0); /* creation time */
3023 WSET(data, 2, 0);
3024 date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
3025 WSET(data, 4, date);
3026 WSET(data, 6, time);
3027 date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
3028 WSET(data, 8, date);
3029 WSET(data, 10, time);
3030#ifdef SMBFS_DEBUG_TIMESTAMP
3031 printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
3032 DENTRY_PATH(dir), date, time, fattr->f_mtime.tv_sec);
3033#endif
3034 DSET(data, 12, 0); /* size */
3035 DSET(data, 16, 0); /* blksize */
3036 WSET(data, 20, 0); /* attr */
3037 DSET(data, 22, 0); /* ULONG EA size */
3038
3039 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3040 req->rq_ldata = 26;
3041 req->rq_data = data;
3042 req->rq_lparm = p - param;
3043 req->rq_parm = param;
3044 req->rq_flags = 0;
3045 result = smb_add_request(req);
3046 if (result < 0)
3047 goto out_free;
3048 result = 0;
3049 if (req->rq_rcls != 0)
3050 result = smb_errno(req);
3051
3052out_free:
3053 smb_rput(req);
3054out:
3055 return result;
3056}
3057
3058/*
3059 * ATTR_MODE 0x001
3060 * ATTR_UID 0x002
3061 * ATTR_GID 0x004
3062 * ATTR_SIZE 0x008
3063 * ATTR_ATIME 0x010
3064 * ATTR_MTIME 0x020
3065 * ATTR_CTIME 0x040
3066 * ATTR_ATIME_SET 0x080
3067 * ATTR_MTIME_SET 0x100
3068 * ATTR_FORCE 0x200
3069 * ATTR_ATTR_FLAG 0x400
3070 *
3071 * major/minor should only be set by mknod.
3072 */
3073int
3074smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
3075 unsigned int major, unsigned int minor)
3076{
3077 struct smb_sb_info *server = server_from_dentry(d);
3078 u64 nttime;
3079 char *p, *param;
3080 int result;
3081 char data[100];
3082 struct smb_request *req;
3083
3084 result = -ENOMEM;
3085 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3086 goto out;
3087 param = req->rq_buffer;
3088
3089 DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
3090
3091 WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
3092 DSET(param, 2, 0);
3093 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
3094 if (result < 0)
3095 goto out_free;
3096 p = param + 6 + result;
3097
3098 /* 0 L file size in bytes */
3099 /* 8 L file size on disk in bytes (block count) */
3100 /* 40 L uid */
3101 /* 48 L gid */
3102 /* 56 W file type enum */
3103 /* 60 L devmajor */
3104 /* 68 L devminor */
3105 /* 76 L unique ID (inode) */
3106 /* 84 L permissions */
3107 /* 92 L link count */
3108 LSET(data, 0, SMB_SIZE_NO_CHANGE);
3109 LSET(data, 8, SMB_SIZE_NO_CHANGE);
3110 LSET(data, 16, SMB_TIME_NO_CHANGE);
3111 LSET(data, 24, SMB_TIME_NO_CHANGE);
3112 LSET(data, 32, SMB_TIME_NO_CHANGE);
3113 LSET(data, 40, SMB_UID_NO_CHANGE);
3114 LSET(data, 48, SMB_GID_NO_CHANGE);
3115 DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
3116 LSET(data, 60, major);
3117 LSET(data, 68, minor);
3118 LSET(data, 76, 0);
3119 LSET(data, 84, SMB_MODE_NO_CHANGE);
3120 LSET(data, 92, 0);
3121
3122 if (attr->ia_valid & ATTR_SIZE) {
3123 LSET(data, 0, attr->ia_size);
3124 LSET(data, 8, 0); /* can't set anyway */
3125 }
3126
3127 /*
3128 * FIXME: check that the conversion function is the correct one.
3129 *
3130 * We can't set ctime, but we might as well pass it to the server
3131 * and let the server ignore it.
3132 */
3133 if (attr->ia_valid & ATTR_CTIME) {
3134 nttime = smb_unixutc2ntutc(attr->ia_ctime);
3135 LSET(data, 16, nttime);
3136 }
3137 if (attr->ia_valid & ATTR_ATIME) {
3138 nttime = smb_unixutc2ntutc(attr->ia_atime);
3139 LSET(data, 24, nttime);
3140 }
3141 if (attr->ia_valid & ATTR_MTIME) {
3142 nttime = smb_unixutc2ntutc(attr->ia_mtime);
3143 LSET(data, 32, nttime);
3144 }
3145
3146 if (attr->ia_valid & ATTR_UID) {
3147 LSET(data, 40, attr->ia_uid);
3148 }
3149 if (attr->ia_valid & ATTR_GID) {
3150 LSET(data, 48, attr->ia_gid);
3151 }
3152
3153 if (attr->ia_valid & ATTR_MODE) {
3154 LSET(data, 84, attr->ia_mode);
3155 }
3156
3157 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3158 req->rq_ldata = 100;
3159 req->rq_data = data;
3160 req->rq_lparm = p - param;
3161 req->rq_parm = param;
3162 req->rq_flags = 0;
3163 result = smb_add_request(req);
3164
3165out_free:
3166 smb_rput(req);
3167out:
3168 return result;
3169}
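
The WSET/DSET/LSET calls above store 16/32/64-bit values little-endian at
fixed offsets inside the 100-byte SMB_SET_FILE_UNIX_BASIC block. A
self-contained sketch of equivalent byte-order helpers that work regardless
of host endianness (the put_le* names are ours, not the kernel macros):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void put_le16(uint8_t *buf, int off, uint16_t v)
{
    buf[off]     = (uint8_t)(v & 0xff);
    buf[off + 1] = (uint8_t)(v >> 8);
}

static void put_le32(uint8_t *buf, int off, uint32_t v)
{
    put_le16(buf, off, (uint16_t)(v & 0xffff));
    put_le16(buf, off + 2, (uint16_t)(v >> 16));
}

static void put_le64(uint8_t *buf, int off, uint64_t v)
{
    put_le32(buf, off, (uint32_t)v);
    put_le32(buf, off + 4, (uint32_t)(v >> 32));
}

int main(void)
{
    uint8_t data[100];

    memset(data, 0, sizeof(data));
    put_le64(data, 0, 0x123456789abcdef0ULL);   /* e.g. file size */
    put_le32(data, 56, 1);                      /* e.g. file type enum */
    printf("%02x %02x\n", data[0], data[7]);    /* prints: f0 12 */
    return 0;
}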
3170
3171
3172/*
3173 * Set the modify and access timestamps for a file.
3174 *
3175 * Incredibly enough, in all of SMB there is no message to allow
3176 * setting both attributes and timestamps at once.
3177 *
3178 * Bugs Noted:
3179 * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message
3180 * with info level 1 (INFO_STANDARD).
3181 * (2) Win 95 seems not to support setting directory timestamps.
3182 * (3) Under the core protocol apparently the only way to set the
3183 * timestamp is to open and close the file.
3184 */
3185int
3186smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
3187{
3188 struct smb_sb_info *server = server_from_dentry(dentry);
3189 struct inode *inode = dentry->d_inode;
3190 int result;
3191
3192 VERBOSE("setting %s/%s, open=%d\n",
3193 DENTRY_PATH(dentry), smb_is_open(inode));
3194
3195 /* setting the time on a Win95 server fails (tridge) */
3196 if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 &&
3197 !(server->mnt->flags & SMB_MOUNT_WIN95)) {
3198 if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
3199 result = smb_proc_setattr_ext(server, inode, fattr);
3200 else
3201 result = smb_proc_setattr_trans2(server, dentry, fattr);
3202 } else {
3203 /*
3204 * Fail silently on directories ... the timestamp apparently can't be set.
3205 */
3206 result = 0;
3207 if (S_ISREG(inode->i_mode)) {
3208 /*
3209 * Set the mtime by opening and closing the file.
3210 * Note that the file is opened read-only, but this
3211 * still allows us to set the date (tridge)
3212 */
3213 result = -EACCES;
3214 if (!smb_is_open(inode))
3215 smb_proc_open(server, dentry, SMB_O_RDONLY);
3216 if (smb_is_open(inode)) {
3217 inode->i_mtime = fattr->f_mtime;
3218 result = smb_proc_close_inode(server, inode);
3219 }
3220 }
3221 }
3222
3223 return result;
3224}
3225
3226int
3227smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
3228{
3229 struct smb_sb_info *server = SMB_SB(dentry->d_sb);
3230 int result;
3231 char *p;
3232 long unit;
3233 struct smb_request *req;
3234
3235 result = -ENOMEM;
3236 if (! (req = smb_alloc_request(server, 0)))
3237 goto out;
3238
3239 smb_setup_header(req, SMBdskattr, 0, 0);
3240 if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
3241 goto out_free;
3242 p = SMB_VWV(req->rq_header);
3243 unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
3244 attr->f_blocks = WVAL(p, 0) * unit;
3245 attr->f_bsize = SMB_ST_BLKSIZE;
3246 attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
3247 result = 0;
3248
3249out_free:
3250 smb_rput(req);
3251out:
3252 return result;
3253}
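
The unit computation above converts the server's allocation-unit geometry
(sectors per unit times bytes per sector) into the fixed block size that
statfs reports. A stand-alone rerun of that arithmetic, assuming the
512-byte reporting blocks implied by a block shift of 9:

#include <stdio.h>

int main(void)
{
    /* Example SMBdskattr reply values. */
    unsigned long total_units = 1000, free_units = 250;
    unsigned long sectors_per_unit = 64, bytes_per_sector = 512;

    /* One allocation unit expressed in 512-byte blocks. */
    const int blkshift = 9;
    unsigned long unit = (sectors_per_unit * bytes_per_sector) >> blkshift;

    printf("total=%lu free=%lu (512-byte blocks)\n",
           total_units * unit, free_units * unit);
    return 0;
}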
3254
3255int
3256smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
3257 char *buffer, int len)
3258{
3259 char *p, *param;
3260 int result;
3261 struct smb_request *req;
3262
3263 DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
3264
3265 result = -ENOMEM;
3266 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3267 goto out;
3268 param = req->rq_buffer;
3269
3270 WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
3271 DSET(param, 2, 0);
3272 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
3273 if (result < 0)
3274 goto out_free;
3275 p = param + 6 + result;
3276
3277 req->rq_trans2_command = TRANSACT2_QPATHINFO;
3278 req->rq_ldata = 0;
3279 req->rq_data = NULL;
3280 req->rq_lparm = p - param;
3281 req->rq_parm = param;
3282 req->rq_flags = 0;
3283 result = smb_add_request(req);
3284 if (result < 0)
3285 goto out_free;
3286 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3287 &param[6], result, req->rq_rcls, req->rq_err);
3288
3289 /* copy data up to the \0 or buffer length */
3290 result = len;
3291 if (req->rq_ldata < len)
3292 result = req->rq_ldata;
3293 strncpy(buffer, req->rq_data, result);
3294
3295out_free:
3296 smb_rput(req);
3297out:
3298 return result;
3299}
3300
3301
3302/*
3303 * Create a symlink object called dentry which points to oldpath.
3304 * Samba does not permit dangling links but returns a suitable error message.
3305 */
3306int
3307smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
3308 const char *oldpath)
3309{
3310 char *p, *param;
3311 int result;
3312 struct smb_request *req;
3313
3314 result = -ENOMEM;
3315 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3316 goto out;
3317 param = req->rq_buffer;
3318
3319 WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
3320 DSET(param, 2, 0);
3321 result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
3322 if (result < 0)
3323 goto out_free;
3324 p = param + 6 + result;
3325
3326 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3327 req->rq_ldata = strlen(oldpath) + 1;
3328 req->rq_data = (char *) oldpath;
3329 req->rq_lparm = p - param;
3330 req->rq_parm = param;
3331 req->rq_flags = 0;
3332 result = smb_add_request(req);
3333 if (result < 0)
3334 goto out_free;
3335
3336 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3337 &param[6], result, req->rq_rcls, req->rq_err);
3338 result = 0;
3339
3340out_free:
3341 smb_rput(req);
3342out:
3343 return result;
3344}
3345
3346/*
3347 * Create a hard link object called new_dentry which points to dentry.
3348 */
3349int
3350smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
3351 struct dentry *new_dentry)
3352{
3353 char *p, *param;
3354 int result;
3355 struct smb_request *req;
3356
3357 result = -ENOMEM;
3358 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3359 goto out;
3360 param = req->rq_buffer;
3361
3362 WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
3363 DSET(param, 2, 0);
3364 result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
3365 new_dentry, NULL);
3366 if (result < 0)
3367 goto out_free;
3368 p = param + 6 + result;
3369
3370 /* Grr, pointless separation of parameters and data ... */
3371 req->rq_data = p;
3372 req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
3373 dentry, NULL);
3374
3375 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3376 req->rq_lparm = p - param;
3377 req->rq_parm = param;
3378 req->rq_flags = 0;
3379 result = smb_add_request(req);
3380 if (result < 0)
3381 goto out_free;
3382
3383 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3384 &param[6], result, req->rq_rcls, req->rq_err);
3385 result = 0;
3386
3387out_free:
3388 smb_rput(req);
3389out:
3390 return result;
3391}
3392
3393static int
3394smb_proc_query_cifsunix(struct smb_sb_info *server)
3395{
3396 int result;
3397 int major, minor;
3398 u64 caps;
3399 char param[2];
3400 struct smb_request *req;
3401
3402 result = -ENOMEM;
3403 if (! (req = smb_alloc_request(server, 100)))
3404 goto out;
3405
3406 WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
3407
3408 req->rq_trans2_command = TRANSACT2_QFSINFO;
3409 req->rq_ldata = 0;
3410 req->rq_data = NULL;
3411 req->rq_lparm = 2;
3412 req->rq_parm = param;
3413 req->rq_flags = 0;
3414 result = smb_add_request(req);
3415 if (result < 0)
3416 goto out_free;
3417
3418 if (req->rq_ldata < 12) {
3419 PARANOIA("Not enough data\n");
3420 goto out_free;
3421 }
3422 major = WVAL(req->rq_data, 0);
3423 minor = WVAL(req->rq_data, 2);
3424
3425 DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
3426 major, minor);
3427 /* FIXME: verify that we are ok with this major/minor? */
3428
3429 caps = LVAL(req->rq_data, 4);
3430 DEBUG1("Server capabilities 0x%016llx\n", caps);
3431
3432out_free:
3433 smb_rput(req);
3434out:
3435 return result;
3436}
3437
3438
3439static void
3440install_ops(struct smb_ops *dst, struct smb_ops *src)
3441{
3442 memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
3443}
3444
3445/* < LANMAN2 */
3446static struct smb_ops smb_ops_core =
3447{
3448 .read = smb_proc_read,
3449 .write = smb_proc_write,
3450 .readdir = smb_proc_readdir_short,
3451 .getattr = smb_proc_getattr_core,
3452 .truncate = smb_proc_trunc32,
3453};
3454
3455/* LANMAN2, OS/2, others? */
3456static struct smb_ops smb_ops_os2 =
3457{
3458 .read = smb_proc_read,
3459 .write = smb_proc_write,
3460 .readdir = smb_proc_readdir_long,
3461 .getattr = smb_proc_getattr_trans2_std,
3462 .truncate = smb_proc_trunc32,
3463};
3464
3465/* Win95, and possibly some NetApp versions too */
3466static struct smb_ops smb_ops_win95 =
3467{
3468 .read = smb_proc_read, /* does not support 12word readX */
3469 .write = smb_proc_write,
3470 .readdir = smb_proc_readdir_long,
3471 .getattr = smb_proc_getattr_95,
3472 .truncate = smb_proc_trunc95,
3473};
3474
3475/* Samba, NT4 and NT5 */
3476static struct smb_ops smb_ops_winNT =
3477{
3478 .read = smb_proc_readX,
3479 .write = smb_proc_writeX,
3480 .readdir = smb_proc_readdir_long,
3481 .getattr = smb_proc_getattr_trans2_all,
3482 .truncate = smb_proc_trunc64,
3483};
3484
3485/* Samba w/ unix extensions. Others? */
3486static struct smb_ops smb_ops_unix =
3487{
3488 .read = smb_proc_readX,
3489 .write = smb_proc_writeX,
3490 .readdir = smb_proc_readdir_long,
3491 .getattr = smb_proc_getattr_unix,
3492 /* FIXME: core/ext/time setattr needs to be cleaned up! */
3493 /* .setattr = smb_proc_setattr_unix, */
3494 .truncate = smb_proc_trunc64,
3495};
3496
3497/* Placeholder until real ops are in place */
3498static struct smb_ops smb_ops_null =
3499{
3500 .readdir = smb_proc_readdir_null,
3501 .getattr = smb_proc_getattr_null,
3502};
3503
3504void smb_install_null_ops(struct smb_ops *ops)
3505{
3506 install_ops(ops, &smb_ops_null);
3507}
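
These tables are the whole protocol-dialect dispatch: each negotiated dialect
gets one struct of function pointers, and install_ops() simply copies the
chosen table over the live one. A miniature sketch of the pattern (struct and
function names invented for illustration):

#include <stdio.h>
#include <string.h>

struct mini_ops {
    int (*getattr)(const char *path);
};

static int getattr_core(const char *path)
{
    printf("core getattr: %s\n", path);
    return 0;
}

static int getattr_trans2(const char *path)
{
    printf("trans2 getattr: %s\n", path);
    return 0;
}

static const struct mini_ops ops_core    = { .getattr = getattr_core };
static const struct mini_ops ops_lanman2 = { .getattr = getattr_trans2 };

int main(void)
{
    struct mini_ops ops;
    int protocol_level = 2;       /* pretend LANMAN2 was negotiated */

    /* install_ops() in smbfs is essentially this memcpy of pointers. */
    memcpy(&ops, protocol_level >= 2 ? &ops_lanman2 : &ops_core,
           sizeof(ops));
    return ops.getattr("/share/file");
}
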
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
deleted file mode 100644
index 05939a6f43e..00000000000
--- a/fs/smbfs/proto.h
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * Autogenerated with cproto on: Sat Sep 13 17:18:51 CEST 2003
3 */
4
5struct smb_request;
6struct sock;
7struct statfs;
8
9/* proc.c */
10extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp);
11extern __u32 smb_len(__u8 *p);
12extern int smb_get_rsize(struct smb_sb_info *server);
13extern int smb_get_wsize(struct smb_sb_info *server);
14extern int smb_errno(struct smb_request *req);
15extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt);
16extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc);
17extern int smb_open(struct dentry *dentry, int wish);
18extern int smb_close(struct inode *ino);
19extern int smb_close_fileid(struct dentry *dentry, __u16 fileid);
20extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid);
21extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry);
22extern int smb_proc_mkdir(struct dentry *dentry);
23extern int smb_proc_rmdir(struct dentry *dentry);
24extern int smb_proc_unlink(struct dentry *dentry);
25extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
26extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
27 struct super_block *sb);
28extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
29extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
30extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
31extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
32extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
33extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
34extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
35extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
36extern void smb_install_null_ops(struct smb_ops *ops);
37/* dir.c */
38extern const struct file_operations smb_dir_operations;
39extern const struct inode_operations smb_dir_inode_operations;
40extern const struct inode_operations smb_dir_inode_operations_unix;
41extern void smb_new_dentry(struct dentry *dentry);
42extern void smb_renew_times(struct dentry *dentry);
43/* cache.c */
44extern void smb_invalid_dir_cache(struct inode *dir);
45extern void smb_invalidate_dircache_entries(struct dentry *parent);
46extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos);
47extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry);
48/* sock.c */
49extern void smb_data_ready(struct sock *sk, int len);
50extern int smb_valid_socket(struct inode *inode);
51extern void smb_close_socket(struct smb_sb_info *server);
52extern int smb_recv_available(struct smb_sb_info *server);
53extern int smb_receive_header(struct smb_sb_info *server);
54extern int smb_receive_drop(struct smb_sb_info *server);
55extern int smb_receive(struct smb_sb_info *server, struct smb_request *req);
56extern int smb_send_request(struct smb_request *req);
57/* inode.c */
58extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr);
59extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr);
60extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr);
61extern void smb_invalidate_inodes(struct smb_sb_info *server);
62extern int smb_revalidate_inode(struct dentry *dentry);
63extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
64extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
65/* file.c */
66extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */
70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */
72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server);
74extern void smbiod_unregister_server(struct smb_sb_info *server);
75extern void smbiod_flush(struct smb_sb_info *server);
76extern int smbiod_retry(struct smb_sb_info *server);
77/* request.c */
78extern int smb_init_request_cache(void);
79extern void smb_destroy_request_cache(void);
80extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize);
81extern void smb_rput(struct smb_request *req);
82extern int smb_add_request(struct smb_request *req);
83extern int smb_request_send_server(struct smb_sb_info *server);
84extern int smb_request_recv(struct smb_sb_info *server);
85/* symlink.c */
86extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname);
87extern const struct inode_operations smb_link_inode_operations;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
deleted file mode 100644
index 45f45933e86..00000000000
--- a/fs/smbfs/request.c
+++ /dev/null
@@ -1,818 +0,0 @@
1/*
2 * request.c
3 *
4 * Copyright (C) 2001 by Urban Widmark
5 *
6 * Please add a note about your changes to smbfs in the ChangeLog file.
7 */
8
9#include <linux/kernel.h>
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/slab.h>
13#include <linux/net.h>
14#include <linux/sched.h>
15
16#include <linux/smb_fs.h>
17#include <linux/smbno.h>
18#include <linux/smb_mount.h>
19
20#include "smb_debug.h"
21#include "request.h"
22#include "proto.h"
23
24/* #define SMB_SLAB_DEBUG (SLAB_RED_ZONE | SLAB_POISON) */
25#define SMB_SLAB_DEBUG 0
26
27/* cache for request structures */
28static struct kmem_cache *req_cachep;
29
30static int smb_request_send_req(struct smb_request *req);
31
32/*
33 /proc/slabinfo:
34 name, active, num, objsize, active_slabs, num_slabs, #pages
35*/
36
37
38int smb_init_request_cache(void)
39{
40 req_cachep = kmem_cache_create("smb_request",
41 sizeof(struct smb_request), 0,
42 SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
43 NULL);
44 if (req_cachep == NULL)
45 return -ENOMEM;
46
47 return 0;
48}
49
50void smb_destroy_request_cache(void)
51{
52 kmem_cache_destroy(req_cachep);
53}
54
55/*
56 * Allocate and initialise a request structure
57 */
58static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
59 int bufsize)
60{
61 struct smb_request *req;
62 unsigned char *buf = NULL;
63
64 req = kmem_cache_zalloc(req_cachep, GFP_KERNEL);
65 VERBOSE("allocating request: %p\n", req);
66 if (!req)
67 goto out;
68
69 if (bufsize > 0) {
70 buf = kmalloc(bufsize, GFP_NOFS);
71 if (!buf) {
72 kmem_cache_free(req_cachep, req);
73 return NULL;
74 }
75 }
76
77 req->rq_buffer = buf;
78 req->rq_bufsize = bufsize;
79 req->rq_server = server;
80 init_waitqueue_head(&req->rq_wait);
81 INIT_LIST_HEAD(&req->rq_queue);
82 atomic_set(&req->rq_count, 1);
83
84out:
85 return req;
86}
87
88struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
89{
90 struct smb_request *req = NULL;
91
92 for (;;) {
93 atomic_inc(&server->nr_requests);
94 if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) {
95 req = smb_do_alloc_request(server, bufsize);
96 if (req != NULL)
97 break;
98 }
99
100#if 0
101 /*
102 * Try to free up at least one request in order to stay
103 * below the hard limit
104 */
105 if (nfs_try_to_free_pages(server))
106 continue;
107
108 if (fatal_signal_pending(current))
109 return ERR_PTR(-ERESTARTSYS);
110 current->policy = SCHED_YIELD;
111 schedule();
112#else
113 /* FIXME: we want something like nfs does above, but that
114 requires changes to all callers and can wait. */
115 break;
116#endif
117 }
118 return req;
119}
120
121static void smb_free_request(struct smb_request *req)
122{
123 atomic_dec(&req->rq_server->nr_requests);
124 if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
125 kfree(req->rq_buffer);
126 kfree(req->rq_trans2buffer);
127 kmem_cache_free(req_cachep, req);
128}
129
130/*
131 * What prevents an rget from racing with an rput? The count must never drop
132 * to zero while the request is in use; only rput when it may safely be freed.
133 */
134static void smb_rget(struct smb_request *req)
135{
136 atomic_inc(&req->rq_count);
137}
138void smb_rput(struct smb_request *req)
139{
140 if (atomic_dec_and_test(&req->rq_count)) {
141 list_del_init(&req->rq_queue);
142 smb_free_request(req);
143 }
144}
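
A user-space rendering of the same reference-counting idiom with C11 atomics
may make the rget/rput contract clearer; the type and names here are
illustrative, not smbfs's:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
    atomic_int refcount;
    /* ... payload ... */
};

static struct obj *obj_new(void)
{
    struct obj *o = calloc(1, sizeof(*o));

    if (o)
        atomic_init(&o->refcount, 1);   /* creator holds one reference */
    return o;
}

static void obj_get(struct obj *o)
{
    /* Only legal while the caller already holds a reference, so the
     * count can never be observed at zero here. */
    atomic_fetch_add(&o->refcount, 1);
}

static void obj_put(struct obj *o)
{
    /* fetch_sub returns the old value: 1 means we dropped the last ref. */
    if (atomic_fetch_sub(&o->refcount, 1) == 1) {
        printf("last reference dropped, freeing\n");
        free(o);
    }
}

int main(void)
{
    struct obj *o = obj_new();

    if (!o)
        return 1;
    obj_get(o);     /* e.g. the request is queued on a list */
    obj_put(o);     /* the list drops its reference */
    obj_put(o);     /* creator drops the last reference: freed */
    return 0;
}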
145
146/* setup to receive the data part of the SMB */
147static int smb_setup_bcc(struct smb_request *req)
148{
149 int result = 0;
150 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
151
152 if (req->rq_rlen > req->rq_bufsize) {
153 PARANOIA("Packet too large %d > %d\n",
154 req->rq_rlen, req->rq_bufsize);
155 return -ENOBUFS;
156 }
157
158 req->rq_iov[0].iov_base = req->rq_buffer;
159 req->rq_iov[0].iov_len = req->rq_rlen;
160 req->rq_iovlen = 1;
161
162 return result;
163}
164
165/*
166 * Prepare a "normal" request structure.
167 */
168static int smb_setup_request(struct smb_request *req)
169{
170 int len = smb_len(req->rq_header) + 4;
171 req->rq_slen = len;
172
173 /* if we expect a data part in the reply we set the iov's to read it */
174 if (req->rq_resp_bcc)
175 req->rq_setup_read = smb_setup_bcc;
176
177 /* This tries to support re-using the same request */
178 req->rq_bytes_sent = 0;
179 req->rq_rcls = 0;
180 req->rq_err = 0;
181 req->rq_errno = 0;
182 req->rq_fragment = 0;
183 kfree(req->rq_trans2buffer);
184 req->rq_trans2buffer = NULL;
185
186 return 0;
187}
188
189/*
190 * Prepare a transaction2 request structure
191 */
192static int smb_setup_trans2request(struct smb_request *req)
193{
194 struct smb_sb_info *server = req->rq_server;
195 int mparam, mdata;
196 static unsigned char padding[4];
197
198 /* I know the following is very ugly, but I want to build the
199 smb packet as efficiently as possible. */
200
201 const int smb_parameters = 15;
202 const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
203 const int oparam = ALIGN(header + 3, sizeof(u32));
204 const int odata = ALIGN(oparam + req->rq_lparm, sizeof(u32));
205 const int bcc = (req->rq_data ? odata + req->rq_ldata :
206 oparam + req->rq_lparm) - header;
207
208 if ((bcc + oparam) > server->opt.max_xmit)
209 return -ENOMEM;
210 smb_setup_header(req, SMBtrans2, smb_parameters, bcc);
211
212 /*
213 * max parameters + max data + max setup == bufsize to make NT4 happy
214 * and not abort the transfer or split into multiple responses. It also
215 * makes smbfs happy as handling packets larger than the buffer size
216 * is extra work.
217 *
218 * OS/2 is probably going to hate me for this ...
219 */
220 mparam = SMB_TRANS2_MAX_PARAM;
221 mdata = req->rq_bufsize - mparam;
222
223 mdata = server->opt.max_xmit - mparam - 100;
224 if (mdata < 1024) {
225 mdata = 1024;
226 mparam = 20;
227 }
228
229#if 0
230 /* NT/win2k has ~4k max_xmit, so with this we request more than it wants
231 to return as one SMB. Useful for testing the fragmented trans2
232 handling. */
233 mdata = 8192;
234#endif
235
236 WSET(req->rq_header, smb_tpscnt, req->rq_lparm);
237 WSET(req->rq_header, smb_tdscnt, req->rq_ldata);
238 WSET(req->rq_header, smb_mprcnt, mparam);
239 WSET(req->rq_header, smb_mdrcnt, mdata);
240 WSET(req->rq_header, smb_msrcnt, 0); /* max setup always 0 ? */
241 WSET(req->rq_header, smb_flags, 0);
242 DSET(req->rq_header, smb_timeout, 0);
243 WSET(req->rq_header, smb_pscnt, req->rq_lparm);
244 WSET(req->rq_header, smb_psoff, oparam - 4);
245 WSET(req->rq_header, smb_dscnt, req->rq_ldata);
246 WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0);
247 *(req->rq_header + smb_suwcnt) = 0x01; /* setup count */
248 *(req->rq_header + smb_suwcnt + 1) = 0x00; /* reserved */
249 WSET(req->rq_header, smb_setup0, req->rq_trans2_command);
250
251 req->rq_iovlen = 2;
252 req->rq_iov[0].iov_base = (void *) req->rq_header;
253 req->rq_iov[0].iov_len = oparam;
254 req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm;
255 req->rq_iov[1].iov_len = req->rq_lparm;
256 req->rq_slen = oparam + req->rq_lparm;
257
258 if (req->rq_data) {
259 req->rq_iovlen += 2;
260 req->rq_iov[2].iov_base = padding;
261 req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm;
262 req->rq_iov[3].iov_base = req->rq_data;
263 req->rq_iov[3].iov_len = req->rq_ldata;
264 req->rq_slen = odata + req->rq_ldata;
265 }
266
267 /* always a data part for trans2 replies */
268 req->rq_setup_read = smb_setup_bcc;
269
270 return 0;
271}
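
The offset arithmetic above is the heart of the trans2 layout: the parameter
and data sections are each rounded up to a 32-bit boundary past the fixed
header, and the byte count covers everything after it. A stand-alone rerun of
that arithmetic (the header length is an assumed example here; the real
SMB_HEADER_LEN is defined elsewhere):

#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
    const int header_len = 37;    /* assumption for illustration only */
    const int smb_parameters = 15;
    const int lparm = 40;         /* example parameter byte count */
    const int ldata = 26;         /* example data byte count */

    /* Header, word count, parameter words, byte count ... */
    const int header = header_len + 2 * smb_parameters + 2;
    /* ... then each section aligned to a 32-bit boundary. */
    const int oparam = ALIGN_UP(header + 3, 4);
    const int odata  = ALIGN_UP(oparam + lparm, 4);
    const int bcc    = odata + ldata - header;

    printf("header=%d oparam=%d odata=%d bcc=%d\n",
           header, oparam, odata, bcc);
    return 0;
}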
272
273/*
274 * Add a request and tell smbiod to process it
275 */
276int smb_add_request(struct smb_request *req)
277{
278 long timeleft;
279 struct smb_sb_info *server = req->rq_server;
280 int result = 0;
281
282 smb_setup_request(req);
283 if (req->rq_trans2_command) {
284 if (req->rq_buffer == NULL) {
285 PARANOIA("trans2 attempted without response buffer!\n");
286 return -EIO;
287 }
288 result = smb_setup_trans2request(req);
289 }
290 if (result < 0)
291 return result;
292
293#ifdef SMB_DEBUG_PACKET_SIZE
294 add_xmit_stats(req);
295#endif
296
297 /* add 'req' to the queue of requests */
298 if (smb_lock_server_interruptible(server))
299 return -EINTR;
300
301 /*
302 * Try to send the request as the process. If that fails we queue the
303 * request and let smbiod send it later.
304 */
305
306 /* FIXME: each server has a number on the maximum number of parallel
307 requests. 10, 50 or so. We should not allow more requests to be
308 active. */
309 if (server->mid > 0xf000)
310 server->mid = 0;
311 req->rq_mid = server->mid++;
312 WSET(req->rq_header, smb_mid, req->rq_mid);
313
314 result = 0;
315 if (server->state == CONN_VALID) {
316 if (list_empty(&server->xmitq))
317 result = smb_request_send_req(req);
318 if (result < 0) {
319 /* Connection lost? */
320 server->conn_error = result;
321 server->state = CONN_INVALID;
322 }
323 }
324 if (result != 1)
325 list_add_tail(&req->rq_queue, &server->xmitq);
326 smb_rget(req);
327
328 if (server->state != CONN_VALID)
329 smbiod_retry(server);
330
331 smb_unlock_server(server);
332
333 smbiod_wake_up();
334
335 timeleft = wait_event_interruptible_timeout(req->rq_wait,
336 req->rq_flags & SMB_REQ_RECEIVED, 30*HZ);
337 if (!timeleft || signal_pending(current)) {
338 /*
339 * On timeout or on interrupt we want to try and remove the
340 * request from the recvq/xmitq.
341 * First check if the request is still part of a queue. (May
342 * have been removed by some error condition)
343 */
344 smb_lock_server(server);
345 if (!list_empty(&req->rq_queue)) {
346 list_del_init(&req->rq_queue);
347 smb_rput(req);
348 }
349 smb_unlock_server(server);
350 }
351
352 if (!timeleft) {
353 PARANOIA("request [%p, mid=%d] timed out!\n",
354 req, req->rq_mid);
355 VERBOSE("smb_com: %02x\n", *(req->rq_header + smb_com));
356 VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls));
357 VERBOSE("smb_flg: %02x\n", *(req->rq_header + smb_flg));
358 VERBOSE("smb_tid: %04x\n", WVAL(req->rq_header, smb_tid));
359 VERBOSE("smb_pid: %04x\n", WVAL(req->rq_header, smb_pid));
360 VERBOSE("smb_uid: %04x\n", WVAL(req->rq_header, smb_uid));
361 VERBOSE("smb_mid: %04x\n", WVAL(req->rq_header, smb_mid));
362 VERBOSE("smb_wct: %02x\n", *(req->rq_header + smb_wct));
363
364 req->rq_rcls = ERRSRV;
365 req->rq_err = ERRtimeout;
366
367 /* Just in case it was "stuck" */
368 smbiod_wake_up();
369 }
370 VERBOSE("woke up, rcls=%d\n", req->rq_rcls);
371
372 if (req->rq_rcls != 0)
373 req->rq_errno = smb_errno(req);
374 if (signal_pending(current))
375 req->rq_errno = -ERESTARTSYS;
376 return req->rq_errno;
377}
378
379/*
380 * Send a request and place it on the recvq if successfully sent.
381 * Must be called with the server lock held.
382 */
383static int smb_request_send_req(struct smb_request *req)
384{
385 struct smb_sb_info *server = req->rq_server;
386 int result;
387
388 if (req->rq_bytes_sent == 0) {
389 WSET(req->rq_header, smb_tid, server->opt.tid);
390 WSET(req->rq_header, smb_pid, 1);
391 WSET(req->rq_header, smb_uid, server->opt.server_uid);
392 }
393
394 result = smb_send_request(req);
395 if (result < 0 && result != -EAGAIN)
396 goto out;
397
398 result = 0;
399 if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
400 goto out;
401
402 list_move_tail(&req->rq_queue, &server->recvq);
403 result = 1;
404out:
405 return result;
406}
407
408/*
409 * Sends one request for this server. (smbiod)
410 * Must be called with the server lock held.
411 * Returns: <0 on error
412 * 0 if no request could be completely sent
413 * 1 if all data for one request was sent
414 */
415int smb_request_send_server(struct smb_sb_info *server)
416{
417 struct list_head *head;
418 struct smb_request *req;
419 int result;
420
421 if (server->state != CONN_VALID)
422 return 0;
423
424 /* dequeue first request, if any */
425 req = NULL;
426 head = server->xmitq.next;
427 if (head != &server->xmitq) {
428 req = list_entry(head, struct smb_request, rq_queue);
429 }
430 if (!req)
431 return 0;
432
433 result = smb_request_send_req(req);
434 if (result < 0) {
435 server->conn_error = result;
436 list_move(&req->rq_queue, &server->xmitq);
437 result = -EIO;
438 goto out;
439 }
440
441out:
442 return result;
443}
444
445/*
446 * Try to find a request matching this "mid". Typically the first entry will
447 * be the matching one.
448 */
449static struct smb_request *find_request(struct smb_sb_info *server, int mid)
450{
451 struct list_head *tmp;
452 struct smb_request *req = NULL;
453
454 list_for_each(tmp, &server->recvq) {
455 req = list_entry(tmp, struct smb_request, rq_queue);
456 if (req->rq_mid == mid) {
457 break;
458 }
459 req = NULL;
460 }
461
462 if (!req) {
463 VERBOSE("received reply with mid %d but no request!\n",
464 WVAL(server->header, smb_mid));
465 server->rstate = SMB_RECV_DROP;
466 }
467
468 return req;
469}
470
471/*
472 * Called when we have read the smb header and believe this is a response.
473 */
474static int smb_init_request(struct smb_sb_info *server, struct smb_request *req)
475{
476 int hdrlen, wct;
477
478 memcpy(req->rq_header, server->header, SMB_HEADER_LEN);
479
480 wct = *(req->rq_header + smb_wct);
481 if (wct > 20) {
482 PARANOIA("wct too large, %d > 20\n", wct);
483 server->rstate = SMB_RECV_DROP;
484 return 0;
485 }
486
487 req->rq_resp_wct = wct;
488 hdrlen = SMB_HEADER_LEN + wct*2 + 2;
489 VERBOSE("header length: %d smb_wct: %2d\n", hdrlen, wct);
490
491 req->rq_bytes_recvd = SMB_HEADER_LEN;
492 req->rq_rlen = hdrlen;
493 req->rq_iov[0].iov_base = req->rq_header;
494 req->rq_iov[0].iov_len = hdrlen;
495 req->rq_iovlen = 1;
496 server->rstate = SMB_RECV_PARAM;
497
498#ifdef SMB_DEBUG_PACKET_SIZE
499 add_recv_stats(smb_len(server->header));
500#endif
501 return 0;
502}
503
504/*
505 * Reads the SMB parameters
506 */
507static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req)
508{
509 int result;
510
511 result = smb_receive(server, req);
512 if (result < 0)
513 return result;
514 if (req->rq_bytes_recvd < req->rq_rlen)
515 return 0;
516
517 VERBOSE("result: %d smb_bcc: %04x\n", result,
518 WVAL(req->rq_header, SMB_HEADER_LEN +
519 (*(req->rq_header + smb_wct) * 2)));
520
521 result = 0;
522 req->rq_iov[0].iov_base = NULL;
523 req->rq_rlen = 0;
524 if (req->rq_callback)
525 req->rq_callback(req);
526 else if (req->rq_setup_read)
527 result = req->rq_setup_read(req);
528 if (result < 0) {
529 server->rstate = SMB_RECV_DROP;
530 return result;
531 }
532
533 server->rstate = req->rq_rlen > 0 ? SMB_RECV_DATA : SMB_RECV_END;
534
535 req->rq_bytes_recvd = 0; /* recvd out of the iov */
536
537 VERBOSE("rlen: %d\n", req->rq_rlen);
538 if (req->rq_rlen < 0) {
539 PARANOIA("Parameters read beyond end of packet!\n");
540 server->rstate = SMB_RECV_END;
541 return -EIO;
542 }
543 return 0;
544}
545
546/*
547 * Reads the SMB data
548 */
549static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req)
550{
551 int result;
552
553 result = smb_receive(server, req);
554 if (result < 0)
555 goto out;
556 if (req->rq_bytes_recvd < req->rq_rlen)
557 goto out;
558 server->rstate = SMB_RECV_END;
559out:
560 VERBOSE("result: %d\n", result);
561 return result;
562}
563
564/*
565 * Receive a transaction2 response
566 * Return: 0 if the response has been fully read
567 * 1 if there are further "fragments" to read
568 * <0 if there is an error
569 */
570static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
571{
572 unsigned char *inbuf;
573 unsigned int parm_disp, parm_offset, parm_count, parm_tot;
574 unsigned int data_disp, data_offset, data_count, data_tot;
575 int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
576
577 VERBOSE("handling trans2\n");
578
579 inbuf = req->rq_header;
580 data_tot = WVAL(inbuf, smb_tdrcnt);
581 parm_tot = WVAL(inbuf, smb_tprcnt);
582 parm_disp = WVAL(inbuf, smb_prdisp);
583 parm_offset = WVAL(inbuf, smb_proff);
584 parm_count = WVAL(inbuf, smb_prcnt);
585 data_disp = WVAL(inbuf, smb_drdisp);
586 data_offset = WVAL(inbuf, smb_droff);
587 data_count = WVAL(inbuf, smb_drcnt);
588
589 /* Modify offset for the split header/buffer we use */
590 if (data_count || data_offset) {
591 if (unlikely(data_offset < hdrlen))
592 goto out_bad_data;
593 else
594 data_offset -= hdrlen;
595 }
596 if (parm_count || parm_offset) {
597 if (unlikely(parm_offset < hdrlen))
598 goto out_bad_parm;
599 else
600 parm_offset -= hdrlen;
601 }
602
603 if (parm_count == parm_tot && data_count == data_tot) {
604 /*
605 * This packet has all the trans2 data.
606 *
607 * We setup the request so that this will be the common
608 * case. It may be a server error to not return a
609 * response that fits.
610 */
611 VERBOSE("single trans2 response "
612 "dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
613 data_count, parm_count,
614 data_offset, parm_offset);
615 req->rq_ldata = data_count;
616 req->rq_lparm = parm_count;
617 req->rq_data = req->rq_buffer + data_offset;
618 req->rq_parm = req->rq_buffer + parm_offset;
619 if (unlikely(parm_offset + parm_count > req->rq_rlen))
620 goto out_bad_parm;
621 if (unlikely(data_offset + data_count > req->rq_rlen))
622 goto out_bad_data;
623 return 0;
624 }
625
626 VERBOSE("multi trans2 response "
627 "frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
628 req->rq_fragment,
629 data_count, parm_count,
630 data_offset, parm_offset);
631
632 if (!req->rq_fragment) {
633 int buf_len;
634
635 /* We got the first trans2 fragment */
636 req->rq_fragment = 1;
637 req->rq_total_data = data_tot;
638 req->rq_total_parm = parm_tot;
639 req->rq_ldata = 0;
640 req->rq_lparm = 0;
641
642 buf_len = data_tot + parm_tot;
643 if (buf_len > SMB_MAX_PACKET_SIZE)
644 goto out_too_long;
645
646 req->rq_trans2bufsize = buf_len;
647 req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
648 if (!req->rq_trans2buffer)
649 goto out_no_mem;
650
651 req->rq_parm = req->rq_trans2buffer;
652 req->rq_data = req->rq_trans2buffer + parm_tot;
653 } else if (unlikely(req->rq_total_data < data_tot ||
654 req->rq_total_parm < parm_tot))
655 goto out_data_grew;
656
657 if (unlikely(parm_disp + parm_count > req->rq_total_parm ||
658 parm_offset + parm_count > req->rq_rlen))
659 goto out_bad_parm;
660 if (unlikely(data_disp + data_count > req->rq_total_data ||
661 data_offset + data_count > req->rq_rlen))
662 goto out_bad_data;
663
664 inbuf = req->rq_buffer;
665 memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count);
666 memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count);
667
668 req->rq_ldata += data_count;
669 req->rq_lparm += parm_count;
670
671 /*
672 * Check whether we've received all of the data. Note that
673 * we use the packet totals -- total lengths might shrink!
674 */
675 if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) {
676 req->rq_ldata = data_tot;
677 req->rq_lparm = parm_tot;
678 return 0;
679 }
680 return 1;
681
682out_too_long:
683 printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n",
684 data_tot, parm_tot);
685 goto out_EIO;
686out_no_mem:
687 printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n",
688 req->rq_trans2bufsize);
689 req->rq_errno = -ENOMEM;
690 goto out;
691out_data_grew:
692 printk(KERN_ERR "smb_trans2: data/params grew!\n");
693 goto out_EIO;
694out_bad_parm:
695 printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
696 parm_disp, parm_count, parm_tot, parm_offset);
697 goto out_EIO;
698out_bad_data:
699 printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
700 data_disp, data_count, data_tot, data_offset);
701out_EIO:
702 req->rq_errno = -EIO;
703out:
704 return req->rq_errno;
705}
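
The reassembly above boils down to a bounds-checked scatter copy: each
fragment lands at its displacement inside a buffer sized from the advertised
totals, and any fragment whose displacement plus count would overflow it is
rejected. A compact sketch of that check-then-copy step (names are ours):

#include <stdio.h>
#include <string.h>

/* Copy one fragment into a reassembly buffer of total_len bytes.
 * Returns 0 on success, -1 if the fragment would overflow the buffer. */
static int copy_fragment(unsigned char *out, unsigned int total_len,
                         const unsigned char *frag,
                         unsigned int disp, unsigned int count)
{
    /* Phrased to avoid wrap-around in disp + count itself. */
    if (disp > total_len || count > total_len - disp)
        return -1;
    memcpy(out + disp, frag, count);
    return 0;
}

int main(void)
{
    unsigned char buf[16] = { 0 };
    const unsigned char f1[] = "abcd", f2[] = "efgh";

    if (copy_fragment(buf, sizeof(buf), f1, 0, 4) < 0 ||
        copy_fragment(buf, sizeof(buf), f2, 4, 4) < 0)
        return 1;
    printf("%s\n", buf);                                    /* abcdefgh */
    return copy_fragment(buf, sizeof(buf), f1, 14, 4) == 0; /* must fail */
}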
706
707/*
708 * State machine for receiving responses. We handle the fact that we can't
709 * read the full response in one try by having states that record how much
710 * we have read.
711 *
712 * Must be called with the server lock held (only called from smbiod).
713 *
714 * Return: <0 on error
715 */
716int smb_request_recv(struct smb_sb_info *server)
717{
718 struct smb_request *req = NULL;
719 int result = 0;
720
721 if (smb_recv_available(server) <= 0)
722 return 0;
723
724 VERBOSE("state: %d\n", server->rstate);
725 switch (server->rstate) {
726 case SMB_RECV_DROP:
727 result = smb_receive_drop(server);
728 if (result < 0)
729 break;
730 if (server->rstate == SMB_RECV_DROP)
731 break;
732 server->rstate = SMB_RECV_START;
733 /* fallthrough */
734 case SMB_RECV_START:
735 server->smb_read = 0;
736 server->rstate = SMB_RECV_HEADER;
737 /* fallthrough */
738 case SMB_RECV_HEADER:
739 result = smb_receive_header(server);
740 if (result < 0)
741 break;
742 if (server->rstate == SMB_RECV_HEADER)
743 break;
744 if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) {
745 server->rstate = SMB_RECV_REQUEST;
746 break;
747 }
748 if (server->rstate != SMB_RECV_HCOMPLETE)
749 break;
750 /* fallthrough */
751 case SMB_RECV_HCOMPLETE:
752 req = find_request(server, WVAL(server->header, smb_mid));
753 if (!req)
754 break;
755 smb_init_request(server, req);
756 req->rq_rcls = *(req->rq_header + smb_rcls);
757 req->rq_err = WVAL(req->rq_header, smb_err);
758 if (server->rstate != SMB_RECV_PARAM)
759 break;
760 /* fallthrough */
761 case SMB_RECV_PARAM:
762 if (!req)
763 req = find_request(server,WVAL(server->header,smb_mid));
764 if (!req)
765 break;
766 result = smb_recv_param(server, req);
767 if (result < 0)
768 break;
769 if (server->rstate != SMB_RECV_DATA)
770 break;
771 /* fallthrough */
772 case SMB_RECV_DATA:
773 if (!req)
774 req = find_request(server,WVAL(server->header,smb_mid));
775 if (!req)
776 break;
777 result = smb_recv_data(server, req);
778 if (result < 0)
779 break;
780 break;
781
782 /* We should never be called with any of these states */
783 case SMB_RECV_END:
784 case SMB_RECV_REQUEST:
785 BUG();
786 }
787
788 if (result < 0) {
789 /* We saw an error */
790 return result;
791 }
792
793 if (server->rstate != SMB_RECV_END)
794 return 0;
795
796 result = 0;
797 if (req->rq_trans2_command && req->rq_rcls == SUCCESS)
798 result = smb_recv_trans2(server, req);
799
800 /*
801 * Response completely read. Drop any extra bytes sent by the server.
802 * (Yes, servers sometimes add extra bytes to responses)
803 */
804 VERBOSE("smb_len: %d smb_read: %d\n",
805 server->smb_len, server->smb_read);
806 if (server->smb_read < server->smb_len)
807 smb_receive_drop(server);
808
809 server->rstate = SMB_RECV_START;
810
811 if (!result) {
812 list_del_init(&req->rq_queue);
813 req->rq_flags |= SMB_REQ_RECEIVED;
814 smb_rput(req);
815 wake_up_interruptible(&req->rq_wait);
816 }
817 return 0;
818}
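
The deliberate fallthroughs above let one call advance through several states
whenever enough bytes have already arrived, and drop out of the switch as
soon as a read comes up short. A minimal sketch of the same pattern (states
and byte thresholds invented for illustration):

#include <stdio.h>

enum rstate { RECV_START, RECV_HEADER, RECV_PARAM, RECV_DATA, RECV_END };

/* Pretend read: move 'got' toward 'need', limited by 'avail'.
 * Returns 1 once 'need' bytes have been accumulated. */
static int fill(int *got, int need, int *avail)
{
    int take = need - *got;

    if (take > *avail)
        take = *avail;
    *got += take;
    *avail -= take;
    return *got == need;
}

int main(void)
{
    enum rstate st = RECV_START;
    int got = 0, avail = 10;      /* only 10 bytes readable right now */

    switch (st) {
    case RECV_START:
        got = 0;
        st = RECV_HEADER;
        /* fallthrough */
    case RECV_HEADER:
        if (!fill(&got, 4, &avail))
            break;
        st = RECV_PARAM;
        /* fallthrough */
    case RECV_PARAM:
        if (!fill(&got, 8, &avail))
            break;
        st = RECV_DATA;
        /* fallthrough */
    case RECV_DATA:
        if (fill(&got, 12, &avail))
            st = RECV_END;
        break;
    case RECV_END:
        break;
    }
    printf("stopped in state %d with %d bytes\n", st, got);
    return 0;
}
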
diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h
deleted file mode 100644
index efb21451e7c..00000000000
--- a/fs/smbfs/request.h
+++ /dev/null
@@ -1,70 +0,0 @@
1#include <linux/list.h>
2#include <linux/types.h>
3#include <linux/uio.h>
4#include <linux/wait.h>
5
6struct smb_request {
7 struct list_head rq_queue; /* recvq or xmitq for the server */
8
9 atomic_t rq_count;
10
11 wait_queue_head_t rq_wait;
12 int rq_flags;
13 int rq_mid; /* multiplex ID, set by request.c */
14
15 struct smb_sb_info *rq_server;
16
17 /* header + word count + parameter words + byte count */
18 unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2];
19
20 int rq_bufsize;
21 unsigned char *rq_buffer;
22
23 /* FIXME: this is not good enough for merging IO requests. */
24 unsigned char *rq_page;
25 int rq_rsize;
26
27 int rq_resp_wct;
28 int rq_resp_bcc;
29
30 int rq_rlen;
31 int rq_bytes_recvd;
32
33 int rq_slen;
34 int rq_bytes_sent;
35
36 int rq_iovlen;
37 struct kvec rq_iov[4];
38
39 int (*rq_setup_read) (struct smb_request *);
40 void (*rq_callback) (struct smb_request *);
41
42 /* ------ trans2 stuff ------ */
43
44 u16 rq_trans2_command; /* 0 if not a trans2 request */
45 unsigned int rq_ldata;
46 unsigned char *rq_data;
47 unsigned int rq_lparm;
48 unsigned char *rq_parm;
49
50 int rq_fragment;
51 u32 rq_total_data;
52 u32 rq_total_parm;
53 int rq_trans2bufsize;
54 unsigned char *rq_trans2buffer;
55
56 /* ------ response ------ */
57
58 unsigned short rq_rcls;
59 unsigned short rq_err;
60 int rq_errno;
61};
62
63#define SMB_REQ_STATIC 0x0001 /* rq_buffer is static */
64#define SMB_REQ_NORETRY 0x0002 /* request is invalid after retry */
65
66#define SMB_REQ_TRANSMITTED 0x4000 /* all data has been sent */
67#define SMB_REQ_RECEIVED 0x8000 /* reply received, smbiod is done */
68
69#define xSMB_REQ_NOREPLY 0x0004 /* we don't want the reply (if any) */
70#define xSMB_REQ_NORECEIVER 0x0008 /* caller doesn't wait for response */
diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h
deleted file mode 100644
index fc4b1a5dd75..00000000000
--- a/fs/smbfs/smb_debug.h
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * Defines some debug macros for smbfs.
3 */
4
5/* This makes a dentry parent/child name pair. Useful for debugging printk's */
6#define DENTRY_PATH(dentry) \
7 (dentry)->d_parent->d_name.name,(dentry)->d_name.name
8
9/*
10 * Safety checks for conditions that should never happen.
11 * These are normally enabled.
12 */
13#ifdef SMBFS_PARANOIA
14# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a)
15#else
16# define PARANOIA(f, a...) do { ; } while(0)
17#endif
18
19/* lots of debug messages */
20#ifdef SMBFS_DEBUG_VERBOSE
21# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
22#else
23# define VERBOSE(f, a...) do { ; } while(0)
24#endif
25
26/*
27 * "normal" debug messages, but not with a normal DEBUG define ... way
28 * too common name.
29 */
30#ifdef SMBFS_DEBUG
31#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
32#else
33#define DEBUG1(f, a...) do { ; } while(0)
34#endif
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
deleted file mode 100644
index 0e39a924f10..00000000000
--- a/fs/smbfs/smbiod.c
+++ /dev/null
@@ -1,344 +0,0 @@
1/*
2 * smbiod.c
3 *
4 * Copyright (C) 2000, Charles Loep / Corel Corp.
5 * Copyright (C) 2001, Urban Widmark
6 */
7
8
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/mm.h>
12#include <linux/string.h>
13#include <linux/stat.h>
14#include <linux/errno.h>
15#include <linux/init.h>
16#include <linux/file.h>
17#include <linux/dcache.h>
18#include <linux/module.h>
19#include <linux/net.h>
20#include <linux/kthread.h>
21#include <net/ip.h>
22
23#include <linux/smb_fs.h>
24#include <linux/smbno.h>
25#include <linux/smb_mount.h>
26
27#include <asm/system.h>
28#include <asm/uaccess.h>
29
30#include "smb_debug.h"
31#include "request.h"
32#include "proto.h"
33
34enum smbiod_state {
35 SMBIOD_DEAD,
36 SMBIOD_STARTING,
37 SMBIOD_RUNNING,
38};
39
40static enum smbiod_state smbiod_state = SMBIOD_DEAD;
41static struct task_struct *smbiod_thread;
42static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
43static LIST_HEAD(smb_servers);
44static DEFINE_SPINLOCK(servers_lock);
45
46#define SMBIOD_DATA_READY (1<<0)
47static unsigned long smbiod_flags;
48
49static int smbiod(void *);
50static int smbiod_start(void);
51
52/*
53 * called when there's work for us to do
54 */
55void smbiod_wake_up(void)
56{
57 if (smbiod_state == SMBIOD_DEAD)
58 return;
59 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
60 wake_up_interruptible(&smbiod_wait);
61}
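
smbiod_wake_up() pairs with the wait_event_interruptible() in the smbiod
loop below: set a flag, wake the sleeper, and have the sleeper re-check the
flag before doing work. A rough user-space analogue built on a pthread
condition variable (a sketch of the idiom, not of the kernel primitives):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int data_ready;

static void wake_up(void)
{
    pthread_mutex_lock(&lock);
    data_ready = 1;               /* like set_bit(SMBIOD_DATA_READY, ...) */
    pthread_cond_signal(&cond);   /* like wake_up_interruptible() */
    pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    while (!data_ready)           /* re-check: wakeups can be spurious */
        pthread_cond_wait(&cond, &lock);
    data_ready = 0;               /* like clear_bit() before doing IO */
    pthread_mutex_unlock(&lock);
    printf("worker: doing IO\n");
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, worker, NULL);
    wake_up();
    pthread_join(t, NULL);
    return 0;
}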
62
63/*
64 * start smbiod if none is running
65 */
66static int smbiod_start(void)
67{
68 struct task_struct *tsk;
69 int err = 0;
70
71 if (smbiod_state != SMBIOD_DEAD)
72 return 0;
73 smbiod_state = SMBIOD_STARTING;
74 __module_get(THIS_MODULE);
75 spin_unlock(&servers_lock);
76 tsk = kthread_run(smbiod, NULL, "smbiod");
77 if (IS_ERR(tsk)) {
78 err = PTR_ERR(tsk);
79 module_put(THIS_MODULE);
80 }
81
82 spin_lock(&servers_lock);
83 if (err < 0) {
84 smbiod_state = SMBIOD_DEAD;
85 smbiod_thread = NULL;
86 } else {
87 smbiod_state = SMBIOD_RUNNING;
88 smbiod_thread = tsk;
89 }
90 return err;
91}
92
93/*
94 * register a server & start smbiod if necessary
95 */
96int smbiod_register_server(struct smb_sb_info *server)
97{
98 int ret;
99 spin_lock(&servers_lock);
100 list_add(&server->entry, &smb_servers);
101 VERBOSE("%p\n", server);
102 ret = smbiod_start();
103 spin_unlock(&servers_lock);
104 return ret;
105}
106
107/*
108 * Unregister a server
109 * Must be called with the server lock held.
110 */
111void smbiod_unregister_server(struct smb_sb_info *server)
112{
113 spin_lock(&servers_lock);
114 list_del_init(&server->entry);
115 VERBOSE("%p\n", server);
116 spin_unlock(&servers_lock);
117
118 smbiod_wake_up();
119 smbiod_flush(server);
120}
121
122void smbiod_flush(struct smb_sb_info *server)
123{
124 struct list_head *tmp, *n;
125 struct smb_request *req;
126
127 list_for_each_safe(tmp, n, &server->xmitq) {
128 req = list_entry(tmp, struct smb_request, rq_queue);
129 req->rq_errno = -EIO;
130 list_del_init(&req->rq_queue);
131 smb_rput(req);
132 wake_up_interruptible(&req->rq_wait);
133 }
134 list_for_each_safe(tmp, n, &server->recvq) {
135 req = list_entry(tmp, struct smb_request, rq_queue);
136 req->rq_errno = -EIO;
137 list_del_init(&req->rq_queue);
138 smb_rput(req);
139 wake_up_interruptible(&req->rq_wait);
140 }
141}
142
143/*
144 * Wake up smbmount and make it reconnect to the server.
145 * This must be called with the server locked.
146 *
147 * FIXME: add smbconnect version to this
148 */
149int smbiod_retry(struct smb_sb_info *server)
150{
151 struct list_head *head;
152 struct smb_request *req;
153 struct pid *pid = get_pid(server->conn_pid);
154 int result = 0;
155
156 VERBOSE("state: %d\n", server->state);
157 if (server->state == CONN_VALID || server->state == CONN_RETRYING)
158 goto out;
159
160 smb_invalidate_inodes(server);
161
162 /*
163 * Some requests are meaningless after a retry, so we abort them.
164 * One example is any request using 'fileid', since the files are
165 * closed on retry.
166 */
167 head = server->xmitq.next;
168 while (head != &server->xmitq) {
169 req = list_entry(head, struct smb_request, rq_queue);
170 head = head->next;
171
172 req->rq_bytes_sent = 0;
173 if (req->rq_flags & SMB_REQ_NORETRY) {
174 VERBOSE("aborting request %p on xmitq\n", req);
175 req->rq_errno = -EIO;
176 list_del_init(&req->rq_queue);
177 smb_rput(req);
178 wake_up_interruptible(&req->rq_wait);
179 }
180 }
181
182 /*
183 * FIXME: test the code for retrying request we already sent
184 */
185 head = server->recvq.next;
186 while (head != &server->recvq) {
187 req = list_entry(head, struct smb_request, rq_queue);
188 head = head->next;
189#if 0
190 if (req->rq_flags & SMB_REQ_RETRY) {
191 /* must move the request to the xmitq */
192 VERBOSE("retrying request %p on recvq\n", req);
193 list_move(&req->rq_queue, &server->xmitq);
194 continue;
195 }
196#endif
197
198 VERBOSE("aborting request %p on recvq\n", req);
199 /* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */
200 req->rq_errno = -EIO;
201 list_del_init(&req->rq_queue);
202 smb_rput(req);
203 wake_up_interruptible(&req->rq_wait);
204 }
205
206 smb_close_socket(server);
207
208 if (!pid) {
209 /* FIXME: this is fatal, umount? */
210 printk(KERN_ERR "smb_retry: no connection process\n");
211 server->state = CONN_RETRIED;
212 goto out;
213 }
214
215 /*
216 * Change state so that only one retry per server will be started.
217 */
218 server->state = CONN_RETRYING;
219
220 /*
221 * Note: use the "priv" flag, as a user process may need to reconnect.
222 */
223 result = kill_pid(pid, SIGUSR1, 1);
224 if (result) {
225 /* FIXME: this is most likely fatal, umount? */
226 printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
227 goto out;
228 }
229 VERBOSE("signalled pid %d\n", pid_nr(pid));
230
231 /* FIXME: The retried requests should perhaps get a "time boost". */
232
233out:
234 put_pid(pid);
235 return result;
236}
237
238/*
239 * Currently handles lockingX packets.
240 */
241static void smbiod_handle_request(struct smb_sb_info *server)
242{
243 PARANOIA("smbiod got a request ... and we don't implement oplocks!\n");
244 server->rstate = SMB_RECV_DROP;
245}
246
247/*
248 * Do some IO for one server.
249 */
250static void smbiod_doio(struct smb_sb_info *server)
251{
252 int result;
253 int maxwork = 7;
254
255 if (server->state != CONN_VALID)
256 goto out;
257
258 do {
259 result = smb_request_recv(server);
260 if (result < 0) {
261 server->state = CONN_INVALID;
262 smbiod_retry(server);
263 goto out; /* reconnecting is slow */
264 } else if (server->rstate == SMB_RECV_REQUEST)
265 smbiod_handle_request(server);
266 } while (result > 0 && maxwork-- > 0);
267
268 /*
269 * If there is more to read then we want to be sure to wake up again.
270 */
271 if (server->state != CONN_VALID)
272 goto out;
273 if (smb_recv_available(server) > 0)
274 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
275
276 do {
277 result = smb_request_send_server(server);
278 if (result < 0) {
279 server->state = CONN_INVALID;
280 smbiod_retry(server);
281 goto out; /* reconnecting is slow */
282 }
283 } while (result > 0);
284
285 /*
286 * If the last request was not sent out we want to wake up again.
287 */
288 if (!list_empty(&server->xmitq))
289 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
290
291out:
292 return;
293}
294
295/*
296 * smbiod kernel thread
297 */
298static int smbiod(void *unused)
299{
300 VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
301
302 for (;;) {
303 struct smb_sb_info *server;
304 struct list_head *pos, *n;
305
306 /* FIXME: Use poll? */
307 wait_event_interruptible(smbiod_wait,
308 test_bit(SMBIOD_DATA_READY, &smbiod_flags));
309 if (signal_pending(current)) {
310 spin_lock(&servers_lock);
311 smbiod_state = SMBIOD_DEAD;
312 spin_unlock(&servers_lock);
313 break;
314 }
315
316 clear_bit(SMBIOD_DATA_READY, &smbiod_flags);
317
318 spin_lock(&servers_lock);
319 if (list_empty(&smb_servers)) {
320 smbiod_state = SMBIOD_DEAD;
321 spin_unlock(&servers_lock);
322 break;
323 }
324
325 list_for_each_safe(pos, n, &smb_servers) {
326 server = list_entry(pos, struct smb_sb_info, entry);
327 VERBOSE("checking server %p\n", server);
328
329 if (server->state == CONN_VALID) {
330 spin_unlock(&servers_lock);
331
332 smb_lock_server(server);
333 smbiod_doio(server);
334 smb_unlock_server(server);
335
336 spin_lock(&servers_lock);
337 }
338 }
339 spin_unlock(&servers_lock);
340 }
341
342 VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid);
343 module_put_and_exit(0);
344}
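
The loop above relies on a "data ready" bit that producers set before waking the thread and that the thread clears before scanning, so a wakeup arriving mid-scan is never lost. A userspace pthread analogue of that handshake (a sketch; wake_worker() and worker_loop() are invented names):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool data_ready;

/* Producer side: like smbiod_wake_up() setting SMBIOD_DATA_READY. */
void wake_worker(void)
{
	pthread_mutex_lock(&lock);
	data_ready = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

/* Worker side: clear the flag *before* doing the work, so a wakeup
 * that races with the scan re-runs the loop instead of being lost. */
void worker_loop(void (*do_io)(void))
{
	for (;;) {
		pthread_mutex_lock(&lock);
		while (!data_ready)	/* cf. wait_event_interruptible() */
			pthread_cond_wait(&cond, &lock);
		data_ready = false;	/* cf. clear_bit(SMBIOD_DATA_READY) */
		pthread_mutex_unlock(&lock);
		do_io();		/* may set data_ready again */
	}
}
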
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
deleted file mode 100644
index e37fe4deebd..00000000000
--- a/fs/smbfs/sock.c
+++ /dev/null
@@ -1,386 +0,0 @@
1/*
2 * sock.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/errno.h>
13#include <linux/socket.h>
14#include <linux/fcntl.h>
15#include <linux/file.h>
16#include <linux/in.h>
17#include <linux/net.h>
18#include <linux/mm.h>
19#include <linux/netdevice.h>
20#include <linux/workqueue.h>
21#include <net/scm.h>
22#include <net/tcp_states.h>
23#include <net/ip.h>
24
25#include <linux/smb_fs.h>
26#include <linux/smb.h>
27#include <linux/smbno.h>
28
29#include <asm/uaccess.h>
30#include <asm/ioctls.h>
31
32#include "smb_debug.h"
33#include "proto.h"
34#include "request.h"
35
36
37static int
38_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags)
39{
40 struct kvec iov = {ubuf, size};
41 struct msghdr msg = {.msg_flags = flags};
42 msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL;
43 return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags);
44}
45
46/*
47 * Return the server this socket belongs to
48 */
49static struct smb_sb_info *
50server_from_socket(struct socket *socket)
51{
52 return socket->sk->sk_user_data;
53}
54
55/*
56 * Called when there is data on the socket.
57 */
58void
59smb_data_ready(struct sock *sk, int len)
60{
61 struct smb_sb_info *server = server_from_socket(sk->sk_socket);
62 void (*data_ready)(struct sock *, int) = server->data_ready;
63
64 data_ready(sk, len);
65 VERBOSE("(%p, %d)\n", sk, len);
66 smbiod_wake_up();
67}
68
69int
70smb_valid_socket(struct inode * inode)
71{
72 return (inode && S_ISSOCK(inode->i_mode) &&
73 SOCKET_I(inode)->type == SOCK_STREAM);
74}
75
76static struct socket *
77server_sock(struct smb_sb_info *server)
78{
79 struct file *file;
80
81 if (server && (file = server->sock_file))
82 {
83#ifdef SMBFS_PARANOIA
84 if (!smb_valid_socket(file->f_path.dentry->d_inode))
85 PARANOIA("bad socket!\n");
86#endif
87 return SOCKET_I(file->f_path.dentry->d_inode);
88 }
89 return NULL;
90}
91
92void
93smb_close_socket(struct smb_sb_info *server)
94{
95 struct file * file = server->sock_file;
96
97 if (file) {
98 struct socket *sock = server_sock(server);
99
100 VERBOSE("closing socket %p\n", sock);
101 sock->sk->sk_data_ready = server->data_ready;
102 server->sock_file = NULL;
103 fput(file);
104 }
105}
106
107static int
108smb_get_length(struct socket *socket, unsigned char *header)
109{
110 int result;
111
112 result = _recvfrom(socket, header, 4, MSG_PEEK);
113 if (result == -EAGAIN)
114 return -ENODATA;
115 if (result < 0) {
116 PARANOIA("recv error = %d\n", -result);
117 return result;
118 }
119 if (result < 4)
120 return -ENODATA;
121
122 switch (header[0]) {
123 case 0x00:
124 case 0x82:
125 break;
126
127 case 0x85:
128 DEBUG1("Got SESSION KEEP ALIVE\n");
129 _recvfrom(socket, header, 4, 0); /* read away */
130 return -ENODATA;
131
132 default:
133 PARANOIA("Invalid NBT packet, code=%x\n", header[0]);
134 return -EIO;
135 }
136
137 /* The length in the RFC NB header is the raw data length */
138 return smb_len(header);
139}
140
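smb_get_length() peeks at the 4-byte NetBIOS session header: byte 0 is the packet type (0x00/0x82 carry data, 0x85 is a keepalive) and, as the kernel's smb_len() helper extracts it, the low 17 bits of the remaining three bytes give the payload length. A minimal standalone decoder under those assumptions (nbt_payload_len() is an invented name):

#include <stdio.h>

static int nbt_payload_len(const unsigned char hdr[4])
{
	switch (hdr[0]) {
	case 0x00:			/* session message */
	case 0x82:			/* positive session response */
		break;
	case 0x85:			/* SESSION KEEP ALIVE: no payload */
		return 0;
	default:
		return -1;		/* invalid NBT packet */
	}
	/* 17-bit length, as in smb_len() */
	return ((hdr[1] & 0x01) << 16) | (hdr[2] << 8) | hdr[3];
}

int main(void)
{
	const unsigned char hdr[4] = { 0x00, 0x00, 0x00, 0x23 };

	printf("payload length: %d\n", nbt_payload_len(hdr));	/* 35 */
	return 0;
}
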
141int
142smb_recv_available(struct smb_sb_info *server)
143{
144 mm_segment_t oldfs;
145 int avail, err;
146 struct socket *sock = server_sock(server);
147
148 oldfs = get_fs();
149 set_fs(get_ds());
150 err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail);
151 set_fs(oldfs);
152 return (err >= 0) ? avail : err;
153}
154
155/*
156 * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc)
157 */
158static int
159smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount)
160{
161 struct kvec *iv = *data;
162 int i;
163 int len;
164
165 /*
166 * Eat any sent kvecs
167 */
168 while (iv->iov_len <= amount) {
169 amount -= iv->iov_len;
170 iv++;
171 (*num)--;
172 }
173
174 /*
175 * And chew down the partial one
176 */
177 vec[0].iov_len = iv->iov_len-amount;
178 vec[0].iov_base =((unsigned char *)iv->iov_base)+amount;
179 iv++;
180
181 len = vec[0].iov_len;
182
183 /*
184 * And copy any others
185 */
186 for (i = 1; i < *num; i++) {
187 vec[i] = *iv++;
188 len += vec[i].iov_len;
189 }
190
191 *data = vec;
192 return len;
193}
194
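The same skip-already-sent-bytes logic is easy to exercise in userspace. A self-contained analogue using struct iovec (illustrative; iov_advance() is an invented name for the same algorithm):

#include <stdio.h>
#include <sys/uio.h>

/* Skip 'sent' bytes of an iovec array, rewriting the survivors into
 * 'out'; returns the number of bytes left, like smb_move_iov(). */
static int iov_advance(struct iovec **data, size_t *num,
		       struct iovec *out, size_t sent)
{
	struct iovec *iv = *data;
	size_t i, len;

	while (iv->iov_len <= sent) {		/* eat fully-sent entries */
		sent -= iv->iov_len;
		iv++;
		(*num)--;
	}
	out[0].iov_len = iv->iov_len - sent;	/* trim the partial one */
	out[0].iov_base = (char *)iv->iov_base + sent;
	iv++;
	len = out[0].iov_len;
	for (i = 1; i < *num; i++) {		/* copy the rest verbatim */
		out[i] = *iv++;
		len += out[i].iov_len;
	}
	*data = out;
	return (int)len;
}

int main(void)
{
	char a[] = "hello", b[] = "world";
	struct iovec src[2] = { { a, 5 }, { b, 5 } };
	struct iovec *p = src, out[2];
	size_t num = 2;
	int left = iov_advance(&p, &num, out, 7);	/* 5 + 2 bytes sent */

	printf("%d bytes left, first chunk \"%.*s\"\n",
	       left, (int)p[0].iov_len, (char *)p[0].iov_base);
	return 0;
}
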
195/*
196 * smb_receive_header
197 * Only called by the smbiod thread.
198 */
199int
200smb_receive_header(struct smb_sb_info *server)
201{
202 struct socket *sock;
203 int result = 0;
204 unsigned char peek_buf[4];
205
206 result = -EIO;
207 sock = server_sock(server);
208 if (!sock)
209 goto out;
210 if (sock->sk->sk_state != TCP_ESTABLISHED)
211 goto out;
212
213 if (!server->smb_read) {
214 result = smb_get_length(sock, peek_buf);
215 if (result < 0) {
216 if (result == -ENODATA)
217 result = 0;
218 goto out;
219 }
220 server->smb_len = result + 4;
221
222 if (server->smb_len < SMB_HEADER_LEN) {
223 PARANOIA("short packet: %d\n", result);
224 server->rstate = SMB_RECV_DROP;
225 result = -EIO;
226 goto out;
227 }
228 if (server->smb_len > SMB_MAX_PACKET_SIZE) {
229 PARANOIA("long packet: %d\n", result);
230 server->rstate = SMB_RECV_DROP;
231 result = -EIO;
232 goto out;
233 }
234 }
235
236 result = _recvfrom(sock, server->header + server->smb_read,
237 SMB_HEADER_LEN - server->smb_read, 0);
238 VERBOSE("_recvfrom: %d\n", result);
239 if (result < 0) {
240 VERBOSE("receive error: %d\n", result);
241 goto out;
242 }
243 server->smb_read += result;
244
245 if (server->smb_read == SMB_HEADER_LEN)
246 server->rstate = SMB_RECV_HCOMPLETE;
247out:
248 return result;
249}
250
251static char drop_buffer[PAGE_SIZE];
252
253/*
254 * smb_receive_drop - read and throw away the data
255 * Only called by the smbiod thread.
256 *
257 * FIXME: we are in the kernel, could we just tell the socket that we want
258 * to drop stuff from the buffer?
259 */
260int
261smb_receive_drop(struct smb_sb_info *server)
262{
263 struct socket *sock;
264 unsigned int flags;
265 struct kvec iov;
266 struct msghdr msg;
267 int rlen = smb_len(server->header) - server->smb_read + 4;
268 int result = -EIO;
269
270 if (rlen > PAGE_SIZE)
271 rlen = PAGE_SIZE;
272
273 sock = server_sock(server);
274 if (!sock)
275 goto out;
276 if (sock->sk->sk_state != TCP_ESTABLISHED)
277 goto out;
278
279 flags = MSG_DONTWAIT | MSG_NOSIGNAL;
280 iov.iov_base = drop_buffer;
281 iov.iov_len = PAGE_SIZE;
282 msg.msg_flags = flags;
283 msg.msg_name = NULL;
284 msg.msg_namelen = 0;
285 msg.msg_control = NULL;
286
287 result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags);
288
289 VERBOSE("read: %d\n", result);
290 if (result < 0) {
291 VERBOSE("receive error: %d\n", result);
292 goto out;
293 }
294 server->smb_read += result;
295
296 if (server->smb_read >= server->smb_len)
297 server->rstate = SMB_RECV_END;
298
299out:
300 return result;
301}
302
303/*
304 * smb_receive
305 * Only called by the smbiod thread.
306 */
307int
308smb_receive(struct smb_sb_info *server, struct smb_request *req)
309{
310 struct socket *sock;
311 unsigned int flags;
312 struct kvec iov[4];
313 struct kvec *p = req->rq_iov;
314 size_t num = req->rq_iovlen;
315 struct msghdr msg;
316 int rlen;
317 int result = -EIO;
318
319 sock = server_sock(server);
320 if (!sock)
321 goto out;
322 if (sock->sk->sk_state != TCP_ESTABLISHED)
323 goto out;
324
325 flags = MSG_DONTWAIT | MSG_NOSIGNAL;
326 msg.msg_flags = flags;
327 msg.msg_name = NULL;
328 msg.msg_namelen = 0;
329 msg.msg_control = NULL;
330
 331	/* Don't repeat bytes; count available buffer space */
332 rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd),
333 (req->rq_rlen - req->rq_bytes_recvd));
334
335 result = kernel_recvmsg(sock, &msg, p, num, rlen, flags);
336
337 VERBOSE("read: %d\n", result);
338 if (result < 0) {
339 VERBOSE("receive error: %d\n", result);
340 goto out;
341 }
342 req->rq_bytes_recvd += result;
343 server->smb_read += result;
344
345out:
346 return result;
347}
348
349/*
 350 * Try to send an SMB request. This may return after sending only part of
 351 * the request. SMB_REQ_TRANSMITTED will be set if a request was fully sent.
 352 *
 353 * Parts of this were taken from xprt_sendmsg in net/sunrpc/xprt.c
354 */
355int
356smb_send_request(struct smb_request *req)
357{
358 struct smb_sb_info *server = req->rq_server;
359 struct socket *sock;
360 struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
361 int slen = req->rq_slen - req->rq_bytes_sent;
362 int result = -EIO;
363 struct kvec iov[4];
364 struct kvec *p = req->rq_iov;
365 size_t num = req->rq_iovlen;
366
367 sock = server_sock(server);
368 if (!sock)
369 goto out;
370 if (sock->sk->sk_state != TCP_ESTABLISHED)
371 goto out;
372
 373	/* Don't repeat bytes */
374 if (req->rq_bytes_sent)
375 smb_move_iov(&p, &num, iov, req->rq_bytes_sent);
376
377 result = kernel_sendmsg(sock, &msg, p, num, slen);
378
379 if (result >= 0) {
380 req->rq_bytes_sent += result;
381 if (req->rq_bytes_sent >= req->rq_slen)
382 req->rq_flags |= SMB_REQ_TRANSMITTED;
383 }
384out:
385 return result;
386}
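
smb_send_request() may transmit only part of the request and relies on rq_bytes_sent to resume later. The classic userspace counterpart is a short-write loop like the following sketch (send_all() is an invented name):

#include <errno.h>
#include <sys/socket.h>
#include <sys/types.h>

/* Keep a running 'sent' offset and retry from it until the whole
 * buffer is out or a hard error occurs.  Illustrative only. */
static ssize_t send_all(int fd, const char *buf, size_t len)
{
	size_t sent = 0;

	while (sent < len) {
		ssize_t n = send(fd, buf + sent, len - sent, MSG_NOSIGNAL);

		if (n < 0) {
			if (errno == EINTR)
				continue;	/* interrupted: retry */
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				break;		/* caller retries later */
			return -1;		/* hard error */
		}
		sent += n;
	}
	return (ssize_t)sent;
}
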
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
deleted file mode 100644
index 00b2909bd46..00000000000
--- a/fs/smbfs/symlink.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * symlink.c
3 *
4 * Copyright (C) 2002 by John Newbigin
5 *
6 * Please add a note about your changes to smbfs in the ChangeLog file.
7 */
8
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/fcntl.h>
12#include <linux/stat.h>
13#include <linux/mm.h>
14#include <linux/slab.h>
15#include <linux/pagemap.h>
16#include <linux/net.h>
17#include <linux/namei.h>
18
19#include <asm/uaccess.h>
20#include <asm/system.h>
21
22#include <linux/smbno.h>
23#include <linux/smb_fs.h>
24
25#include "smb_debug.h"
26#include "proto.h"
27
28int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname)
29{
30 DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry));
31
32 return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname);
33}
34
35static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd)
36{
37 char *link = __getname();
38 DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry));
39
40 if (!link) {
41 link = ERR_PTR(-ENOMEM);
42 } else {
43 int len = smb_proc_read_link(server_from_dentry(dentry),
44 dentry, link, PATH_MAX - 1);
45 if (len < 0) {
46 __putname(link);
47 link = ERR_PTR(len);
48 } else {
49 link[len] = 0;
50 }
51 }
52 nd_set_link(nd, link);
53 return NULL;
54}
55
56static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
57{
58 char *s = nd_get_link(nd);
59 if (!IS_ERR(s))
60 __putname(s);
61}
62
63const struct inode_operations smb_link_inode_operations =
64{
65 .readlink = generic_readlink,
66 .follow_link = smb_follow_link,
67 .put_link = smb_put_link,
68};
diff --git a/fs/splice.c b/fs/splice.c
index efdbfece993..8f1dfaecc8f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -399,17 +399,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * If the page isn't uptodate, we may need to start io on it
 		 */
 		if (!PageUptodate(page)) {
-			/*
-			 * If in nonblock mode then dont block on waiting
-			 * for an in-flight io page
-			 */
-			if (flags & SPLICE_F_NONBLOCK) {
-				if (!trylock_page(page)) {
-					error = -EAGAIN;
-					break;
-				}
-			} else
-				lock_page(page);
+			lock_page(page);
 
 			/*
 			 * Page was truncated, or invalidated by the
@@ -597,7 +587,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	struct page *pages[PIPE_DEF_BUFFERS];
 	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
-	pgoff_t index;
 	ssize_t res;
 	size_t this_len;
 	int error;
@@ -621,7 +610,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		goto shrink_ret;
 	}
 
-	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
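
The removed branch implemented a trylock-or-EAGAIN policy so nonblocking splice callers never slept on a page lock. The generic shape of that pattern, sketched with pthreads (illustrative; acquire() is an invented name, not splice code):

#include <errno.h>
#include <pthread.h>

/* Nonblocking callers get -EAGAIN instead of sleeping on the lock,
 * mirroring the trylock_page()/lock_page() split that was removed. */
static int acquire(pthread_mutex_t *m, int nonblock)
{
	if (nonblock)
		return pthread_mutex_trylock(m) ? -EAGAIN : 0;
	return pthread_mutex_lock(m);
}
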
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index cc6ce8a84c2..e5f63da64d0 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,13 +5,13 @@ config SQUASHFS
 	help
 	  Saying Y here includes support for SquashFS 4.0 (a Compressed
 	  Read-Only File System).  Squashfs is a highly compressed read-only
-	  filesystem for Linux.  It uses zlib compression to compress both
+	  filesystem for Linux.  It uses zlib/lzo compression to compress both
 	  files, inodes and directories.  Inodes in the system are very small
 	  and all blocks are packed to minimise data overhead. Block sizes
 	  greater than 4K are supported up to a maximum of 1 Mbytes (default
 	  block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
 	  (larger than 4GB), full uid/gid information, hard links and
 	  timestamps.
 
 	  Squashfs is intended for general read-only filesystem use, for
 	  archival use (i.e. in cases where a .tar.gz file may be used), and in
@@ -26,7 +26,7 @@ config SQUASHFS
 
 	  If unsure, say N.
 
-config SQUASHFS_XATTRS
+config SQUASHFS_XATTR
 	bool "Squashfs XATTR support"
 	depends on SQUASHFS
 	default n
@@ -37,9 +37,24 @@ config SQUASHFS_XATTRS
 
 	  If unsure, say N.
 
-config SQUASHFS_EMBEDDED
+config SQUASHFS_LZO
+	bool "Include support for LZO compressed file systems"
+	depends on SQUASHFS
+	default n
+	select LZO_DECOMPRESS
+	help
+	  Saying Y here includes support for reading Squashfs file systems
+	  compressed with LZO compression. LZO compression is mainly
+	  aimed at embedded systems with slower CPUs where the overheads
+	  of zlib are too high.
 
-	bool "Additional option for memory-constrained systems"
+	  LZO is not the standard compression used in Squashfs and so most
+	  file systems will be readable without selecting this option.
+
+	  If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+	bool "Additional option for memory-constrained systems"
 	depends on SQUASHFS
 	default n
 	help
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 2cee3e9fa45..7672bac8d32 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,5 +5,5 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
-squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
-
+squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
+squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 157478da6ac..24af9ce9722 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -40,9 +40,11 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
 	NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
 };
 
+#ifndef CONFIG_SQUASHFS_LZO
 static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
 	NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
 };
+#endif
 
 static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
 	NULL, NULL, NULL, 0, "unknown", 0
@@ -51,7 +53,11 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
 static const struct squashfs_decompressor *decompressor[] = {
 	&squashfs_zlib_comp_ops,
 	&squashfs_lzma_unsupported_comp_ops,
+#ifdef CONFIG_SQUASHFS_LZO
+	&squashfs_lzo_comp_ops,
+#else
 	&squashfs_lzo_unsupported_comp_ops,
+#endif
 	&squashfs_unknown_comp_ops
 };
 
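
The decompressor[] table above is scanned by id when a superblock is read. A simplified standalone version of that lookup (types and values are illustrative; the real ids live in squashfs_fs.h):

#include <stdio.h>

struct decompressor {
	int id;
	const char *name;
	int supported;
};

static const struct decompressor table[] = {
	{ 1, "zlib",    1 },
	{ 2, "lzma",    0 },
	{ 3, "lzo",     1 },	/* only if CONFIG_SQUASHFS_LZO=y */
	{ 0, "unknown", 0 },	/* sentinel: matches anything else */
};

/* Walk until the id matches or the sentinel is reached. */
static const struct decompressor *lookup(int id)
{
	const struct decompressor *d = table;

	while (d->id != id && d->id != 0)
		d++;
	return d;
}

int main(void)
{
	printf("id 3 -> %s (supported=%d)\n",
	       lookup(3)->name, lookup(3)->supported);
	return 0;
}
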
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 12b933ac658..0dc340aa2be 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -230,5 +230,6 @@ failed_read:
 
 const struct file_operations squashfs_dir_ops = {
 	.read = generic_read_dir,
-	.readdir = squashfs_readdir
+	.readdir = squashfs_readdir,
+	.llseek = default_llseek,
 };
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
new file mode 100644
index 00000000000..5d87789bf1c
--- /dev/null
+++ b/fs/squashfs/lzo_wrapper.c
@@ -0,0 +1,136 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010 LG Electronics
5 * Chan Jeong <chan.jeong@lge.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * lzo_wrapper.c
22 */
23
24#include <linux/mutex.h>
25#include <linux/buffer_head.h>
26#include <linux/slab.h>
27#include <linux/vmalloc.h>
28#include <linux/lzo.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36struct squashfs_lzo {
37 void *input;
38 void *output;
39};
40
41static void *lzo_init(struct squashfs_sb_info *msblk)
42{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
44
45 struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL)
47 goto failed;
48 stream->input = vmalloc(block_size);
49 if (stream->input == NULL)
50 goto failed;
51 stream->output = vmalloc(block_size);
52 if (stream->output == NULL)
53 goto failed2;
54
55 return stream;
56
57failed2:
58 vfree(stream->input);
59failed:
60 ERROR("Failed to allocate lzo workspace\n");
61 kfree(stream);
62 return NULL;
63}
64
65
66static void lzo_free(void *strm)
67{
68 struct squashfs_lzo *stream = strm;
69
70 if (stream) {
71 vfree(stream->input);
72 vfree(stream->output);
73 }
74 kfree(stream);
75}
76
77
78static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
79 struct buffer_head **bh, int b, int offset, int length, int srclength,
80 int pages)
81{
82 struct squashfs_lzo *stream = msblk->stream;
83 void *buff = stream->input;
84 int avail, i, bytes = length, res;
85 size_t out_len = srclength;
86
87 mutex_lock(&msblk->read_data_mutex);
88
89 for (i = 0; i < b; i++) {
90 wait_on_buffer(bh[i]);
91 if (!buffer_uptodate(bh[i]))
92 goto block_release;
93
94 avail = min(bytes, msblk->devblksize - offset);
95 memcpy(buff, bh[i]->b_data + offset, avail);
96 buff += avail;
97 bytes -= avail;
98 offset = 0;
99 put_bh(bh[i]);
100 }
101
102 res = lzo1x_decompress_safe(stream->input, (size_t)length,
103 stream->output, &out_len);
104 if (res != LZO_E_OK)
105 goto failed;
106
107 res = bytes = (int)out_len;
108 for (i = 0, buff = stream->output; bytes && i < pages; i++) {
109 avail = min_t(int, bytes, PAGE_CACHE_SIZE);
110 memcpy(buffer[i], buff, avail);
111 buff += avail;
112 bytes -= avail;
113 }
114
115 mutex_unlock(&msblk->read_data_mutex);
116 return res;
117
118block_release:
119 for (; i < b; i++)
120 put_bh(bh[i]);
121
122failed:
123 mutex_unlock(&msblk->read_data_mutex);
124
125 ERROR("lzo decompression failed, data probably corrupt\n");
126 return -EIO;
127}
128
129const struct squashfs_decompressor squashfs_lzo_comp_ops = {
130 .init = lzo_init,
131 .free = lzo_free,
132 .decompress = lzo_uncompress,
133 .id = LZO_COMPRESSION,
134 .name = "lzo",
135 .supported = 1
136};
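
For reference, the userspace liblzo2 API mirrors the call made in lzo_uncompress() above: lzo1x_decompress_safe() takes the compressed block plus an in/out destination length. A round-trip sketch, assuming liblzo2 is installed (compile with -llzo2; not kernel code):

#include <stdio.h>
#include <string.h>
#include <lzo/lzo1x.h>

int main(void)
{
	unsigned char in[4096], comp[4096 + 4096 / 16 + 64 + 3], out[4096];
	unsigned char wrk[LZO1X_1_MEM_COMPRESS];
	lzo_uint comp_len = sizeof(comp), out_len = sizeof(out);

	if (lzo_init() != LZO_E_OK)
		return 1;
	memset(in, 'x', sizeof(in));
	lzo1x_1_compress(in, sizeof(in), comp, &comp_len, wrk);
	/* same call shape as in lzo_uncompress(): in/out length */
	if (lzo1x_decompress_safe(comp, comp_len, out, &out_len, NULL)
	    != LZO_E_OK || out_len != sizeof(in))
		return 1;
	printf("round trip ok: %lu -> %lu -> %lu bytes\n",
	       (unsigned long)sizeof(in), (unsigned long)comp_len,
	       (unsigned long)out_len);
	return 0;
}
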
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 733a17c4294..5d45569d5f7 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -104,3 +104,6 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
 
 /* zlib_wrapper.c */
 extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
+
+/* lzo_wrapper.c */
+extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 8eabb808b78..c5137fc9ab1 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -274,7 +274,7 @@ struct squashfs_base_inode {
274 __le16 uid; 274 __le16 uid;
275 __le16 guid; 275 __le16 guid;
276 __le32 mtime; 276 __le32 mtime;
277 __le32 inode_number; 277 __le32 inode_number;
278}; 278};
279 279
280struct squashfs_ipc_inode { 280struct squashfs_ipc_inode {
@@ -283,7 +283,7 @@ struct squashfs_ipc_inode {
283 __le16 uid; 283 __le16 uid;
284 __le16 guid; 284 __le16 guid;
285 __le32 mtime; 285 __le32 mtime;
286 __le32 inode_number; 286 __le32 inode_number;
287 __le32 nlink; 287 __le32 nlink;
288}; 288};
289 289
@@ -293,7 +293,7 @@ struct squashfs_lipc_inode {
293 __le16 uid; 293 __le16 uid;
294 __le16 guid; 294 __le16 guid;
295 __le32 mtime; 295 __le32 mtime;
296 __le32 inode_number; 296 __le32 inode_number;
297 __le32 nlink; 297 __le32 nlink;
298 __le32 xattr; 298 __le32 xattr;
299}; 299};
@@ -304,7 +304,7 @@ struct squashfs_dev_inode {
304 __le16 uid; 304 __le16 uid;
305 __le16 guid; 305 __le16 guid;
306 __le32 mtime; 306 __le32 mtime;
307 __le32 inode_number; 307 __le32 inode_number;
308 __le32 nlink; 308 __le32 nlink;
309 __le32 rdev; 309 __le32 rdev;
310}; 310};
@@ -315,7 +315,7 @@ struct squashfs_ldev_inode {
315 __le16 uid; 315 __le16 uid;
316 __le16 guid; 316 __le16 guid;
317 __le32 mtime; 317 __le32 mtime;
318 __le32 inode_number; 318 __le32 inode_number;
319 __le32 nlink; 319 __le32 nlink;
320 __le32 rdev; 320 __le32 rdev;
321 __le32 xattr; 321 __le32 xattr;
@@ -327,7 +327,7 @@ struct squashfs_symlink_inode {
327 __le16 uid; 327 __le16 uid;
328 __le16 guid; 328 __le16 guid;
329 __le32 mtime; 329 __le32 mtime;
330 __le32 inode_number; 330 __le32 inode_number;
331 __le32 nlink; 331 __le32 nlink;
332 __le32 symlink_size; 332 __le32 symlink_size;
333 char symlink[0]; 333 char symlink[0];
@@ -339,7 +339,7 @@ struct squashfs_reg_inode {
339 __le16 uid; 339 __le16 uid;
340 __le16 guid; 340 __le16 guid;
341 __le32 mtime; 341 __le32 mtime;
342 __le32 inode_number; 342 __le32 inode_number;
343 __le32 start_block; 343 __le32 start_block;
344 __le32 fragment; 344 __le32 fragment;
345 __le32 offset; 345 __le32 offset;
@@ -353,7 +353,7 @@ struct squashfs_lreg_inode {
353 __le16 uid; 353 __le16 uid;
354 __le16 guid; 354 __le16 guid;
355 __le32 mtime; 355 __le32 mtime;
356 __le32 inode_number; 356 __le32 inode_number;
357 __le64 start_block; 357 __le64 start_block;
358 __le64 file_size; 358 __le64 file_size;
359 __le64 sparse; 359 __le64 sparse;
@@ -370,7 +370,7 @@ struct squashfs_dir_inode {
370 __le16 uid; 370 __le16 uid;
371 __le16 guid; 371 __le16 guid;
372 __le32 mtime; 372 __le32 mtime;
373 __le32 inode_number; 373 __le32 inode_number;
374 __le32 start_block; 374 __le32 start_block;
375 __le32 nlink; 375 __le32 nlink;
376 __le16 file_size; 376 __le16 file_size;
@@ -384,7 +384,7 @@ struct squashfs_ldir_inode {
384 __le16 uid; 384 __le16 uid;
385 __le16 guid; 385 __le16 guid;
386 __le32 mtime; 386 __le32 mtime;
387 __le32 inode_number; 387 __le32 inode_number;
388 __le32 nlink; 388 __le32 nlink;
389 __le32 file_size; 389 __le32 file_size;
390 __le32 start_block; 390 __le32 start_block;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 88b4f860665..24de30ba34c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,7 +30,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
@@ -354,8 +353,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 
 static void squashfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
 	if (sb->s_fs_info) {
 		struct squashfs_sb_info *sbi = sb->s_fs_info;
 		squashfs_cache_delete(sbi->block_cache);
@@ -370,17 +367,13 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
-
-	unlock_kernel();
 }
 
 
-static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
-			const char *dev_name, void *data,
-			struct vfsmount *mnt)
+static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags,
+				const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
-				mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
 }
 
 
@@ -456,7 +449,7 @@ static void squashfs_destroy_inode(struct inode *inode)
 static struct file_system_type squashfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "squashfs",
-	.get_sb = squashfs_get_sb,
+	.mount = squashfs_mount,
 	.kill_sb = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV
 };
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index c7655e8b31c..3876c36699a 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -18,7 +18,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  *
- * xattr_id.c
+ * xattr.c
  */
 
 #include <linux/init.h>
@@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index,
 				strncmp(target, name, name_size) == 0) {
 			/* found xattr */
 			if (type & SQUASHFS_XATTR_VALUE_OOL) {
-				__le64 xattr;
+				__le64 xattr_val;
+				u64 xattr;
 				/* val is a reference to the real location */
 				err = squashfs_read_metadata(sb, &val, &start,
 					&offset, sizeof(val));
 				if (err < 0)
 					goto failed;
-				err = squashfs_read_metadata(sb, &xattr, &start,
-					&offset, sizeof(xattr));
+				err = squashfs_read_metadata(sb, &xattr_val,
+					&start, &offset, sizeof(xattr_val));
 				if (err < 0)
 					goto failed;
-				xattr = le64_to_cpu(xattr);
+				xattr = le64_to_cpu(xattr_val);
 				start = SQUASHFS_XATTR_BLK(xattr) +
 					msblk->xattr_table;
 				offset = SQUASHFS_XATTR_OFFSET(xattr);
@@ -295,7 +296,7 @@ static const struct xattr_handler squashfs_xattr_security_handler = {
 	.get = squashfs_security_get
 };
 
-static inline const struct xattr_handler *squashfs_xattr_handler(int type)
+static const struct xattr_handler *squashfs_xattr_handler(int type)
 {
 	if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
 		/* ignore unrecognised type */
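
The out-of-line case above treats the stored value as a 64-bit reference: the low 16 bits are an offset into a metadata block and the upper bits select the block (cf. SQUASHFS_XATTR_BLK/SQUASHFS_XATTR_OFFSET). A toy encoder/decoder under that assumption (make_ref() is an invented name):

#include <stdint.h>
#include <stdio.h>

/* Pack a block number and a 16-bit offset into one reference. */
static uint64_t make_ref(uint64_t block, uint16_t offset)
{
	return (block << 16) | offset;
}

int main(void)
{
	uint64_t ref = make_ref(0x1234, 0x00ab);

	printf("block=%llx offset=%x\n",
	       (unsigned long long)(ref >> 16),
	       (unsigned int)(ref & 0xffff));
	return 0;
}
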
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 9da071ae181..b634efce4bd 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -21,11 +21,11 @@
  * xattr.h
  */
 
-#ifdef CONFIG_SQUASHFS_XATTRS
+#ifdef CONFIG_SQUASHFS_XATTR
 extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
 		u64 *, int *);
 extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
-		int *, unsigned long long *);
+		unsigned int *, unsigned long long *);
 #else
 static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 		u64 start, u64 *xattr_table_start, int *xattr_ids)
@@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 }
 
 static inline int squashfs_xattr_lookup(struct super_block *sb,
-		unsigned int index, int *count, int *size,
+		unsigned int index, int *count, unsigned int *size,
 		unsigned long long *xattr)
 {
 	return 0;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index cfb41106098..d33be5dd6c3 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -34,6 +34,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 
 /*
  * Map xattr id using the xattr id look up table
diff --git a/fs/stat.c b/fs/stat.c
index c4ecd52c573..12e90e21390 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -68,7 +68,8 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
 }
 EXPORT_SYMBOL(vfs_fstat);
 
-int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
+int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
+		int flag)
 {
 	struct path path;
 	int error = -EINVAL;
@@ -91,13 +92,13 @@ out:
 }
 EXPORT_SYMBOL(vfs_fstatat);
 
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_stat(const char __user *name, struct kstat *stat)
 {
 	return vfs_fstatat(AT_FDCWD, name, stat, 0);
 }
 EXPORT_SYMBOL(vfs_stat);
 
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_lstat(const char __user *name, struct kstat *stat)
 {
 	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
 }
@@ -147,7 +148,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(stat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -159,7 +161,8 @@ SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 	return cp_old_stat(&stat, statbuf);
 }
 
-SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(lstat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -234,7 +237,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -244,7 +248,8 @@ SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 	return cp_new_stat(&stat, statbuf);
 }
 
-SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -257,7 +262,7 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
 }
 
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
-SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
 		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
@@ -355,7 +360,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(stat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -366,7 +372,8 @@ SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
 	return error;
 }
 
-SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(lstat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat(filename, &stat);
@@ -388,7 +395,7 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 	return error;
 }
 
-SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
 		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
diff --git a/fs/statfs.c b/fs/statfs.c
index 4ef021f3b61..30ea8c8a996 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -2,38 +2,83 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/uaccess.h>
 
-int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+static int flags_by_mnt(int mnt_flags)
 {
-	int retval = -ENODEV;
+	int flags = 0;
 
-	if (dentry) {
-		retval = -ENOSYS;
-		if (dentry->d_sb->s_op->statfs) {
-			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(dentry);
-			if (retval)
-				return retval;
-			retval = dentry->d_sb->s_op->statfs(dentry, buf);
-			if (retval == 0 && buf->f_frsize == 0)
-				buf->f_frsize = buf->f_bsize;
-		}
-	}
+	if (mnt_flags & MNT_READONLY)
+		flags |= ST_RDONLY;
+	if (mnt_flags & MNT_NOSUID)
+		flags |= ST_NOSUID;
+	if (mnt_flags & MNT_NODEV)
+		flags |= ST_NODEV;
+	if (mnt_flags & MNT_NOEXEC)
+		flags |= ST_NOEXEC;
+	if (mnt_flags & MNT_NOATIME)
+		flags |= ST_NOATIME;
+	if (mnt_flags & MNT_NODIRATIME)
+		flags |= ST_NODIRATIME;
+	if (mnt_flags & MNT_RELATIME)
+		flags |= ST_RELATIME;
+	return flags;
+}
+
+static int flags_by_sb(int s_flags)
+{
+	int flags = 0;
+	if (s_flags & MS_SYNCHRONOUS)
+		flags |= ST_SYNCHRONOUS;
+	if (s_flags & MS_MANDLOCK)
+		flags |= ST_MANDLOCK;
+	return flags;
+}
+
+static int calculate_f_flags(struct vfsmount *mnt)
+{
+	return ST_VALID | flags_by_mnt(mnt->mnt_flags) |
+		flags_by_sb(mnt->mnt_sb->s_flags);
+}
+
+int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+{
+	int retval;
+
+	if (!dentry->d_sb->s_op->statfs)
+		return -ENOSYS;
+
+	memset(buf, 0, sizeof(*buf));
+	retval = security_sb_statfs(dentry);
+	if (retval)
+		return retval;
+	retval = dentry->d_sb->s_op->statfs(dentry, buf);
+	if (retval == 0 && buf->f_frsize == 0)
+		buf->f_frsize = buf->f_bsize;
 	return retval;
 }
 
+int vfs_statfs(struct path *path, struct kstatfs *buf)
+{
+	int error;
+
+	error = statfs_by_dentry(path->dentry, buf);
+	if (!error)
+		buf->f_flags = calculate_f_flags(path->mnt);
+	return error;
+}
 EXPORT_SYMBOL(vfs_statfs);
 
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+static int do_statfs_native(struct path *path, struct statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(dentry, &st);
+	retval = vfs_statfs(path, &st);
 	if (retval)
 		return retval;
 
@@ -67,17 +112,18 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
 		buf->f_fsid = st.f_fsid;
 		buf->f_namelen = st.f_namelen;
 		buf->f_frsize = st.f_frsize;
+		buf->f_flags = st.f_flags;
 		memset(buf->f_spare, 0, sizeof(buf->f_spare));
 	}
 	return 0;
 }
 
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
+static int do_statfs64(struct path *path, struct statfs64 *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(dentry, &st);
+	retval = vfs_statfs(path, &st);
 	if (retval)
 		return retval;
 
@@ -94,6 +140,7 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 		buf->f_fsid = st.f_fsid;
 		buf->f_namelen = st.f_namelen;
 		buf->f_frsize = st.f_frsize;
+		buf->f_flags = st.f_flags;
 		memset(buf->f_spare, 0, sizeof(buf->f_spare));
 	}
 	return 0;
@@ -107,7 +154,7 @@ SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct statfs tmp;
-		error = vfs_statfs_native(path.dentry, &tmp);
+		error = do_statfs_native(&path, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_put(&path);
@@ -125,7 +172,7 @@ SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct statfs64 tmp;
-		error = vfs_statfs64(path.dentry, &tmp);
+		error = do_statfs64(&path, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_put(&path);
@@ -143,7 +190,7 @@ SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_native(file->f_path.dentry, &tmp);
+	error = do_statfs_native(&file->f_path, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -164,7 +211,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs64(file->f_path.dentry, &tmp);
+	error = do_statfs64(&file->f_path, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -183,7 +230,7 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 	if (!s)
 		return -EINVAL;
 
-	err = vfs_statfs(s->s_root, &sbuf);
+	err = statfs_by_dentry(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		return err;
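
The f_flags plumbing added above is what userspace ultimately observes via statvfs(3). A small demo of reading those bits with the POSIX API (not kernel code):

#include <stdio.h>
#include <sys/statvfs.h>

int main(int argc, char **argv)
{
	struct statvfs st;

	if (statvfs(argc > 1 ? argv[1] : "/", &st) != 0) {
		perror("statvfs");
		return 1;
	}
	/* ST_RDONLY / ST_NOSUID are the userspace faces of the
	 * mount-flag mapping done by flags_by_mnt() above. */
	printf("read-only: %s, nosuid: %s\n",
	       (st.f_flag & ST_RDONLY) ? "yes" : "no",
	       (st.f_flag & ST_NOSUID) ? "yes" : "no");
	return 0;
}
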
diff --git a/fs/super.c b/fs/super.c
index 938119ab8dc..ca696155cd9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
 			s = NULL;
 			goto out;
 		}
+#ifdef CONFIG_SMP
+		s->s_files = alloc_percpu(struct list_head);
+		if (!s->s_files) {
+			security_sb_free(s);
+			kfree(s);
+			s = NULL;
+			goto out;
+		} else {
+			int i;
+
+			for_each_possible_cpu(i)
+				INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+		}
+#else
 		INIT_LIST_HEAD(&s->s_files);
+#endif
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -108,6 +123,9 @@ out:
  */
 static inline void destroy_super(struct super_block *s)
 {
+#ifdef CONFIG_SMP
+	free_percpu(s->s_files);
+#endif
 	security_sb_free(s);
 	kfree(s->s_subtype);
 	kfree(s->s_options);
@@ -255,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb)
 		get_fs_excl();
 		sb->s_flags &= ~MS_ACTIVE;
 
-		/* bad name - it should be evict_inodes() */
-		invalidate_inodes(sb);
+		fsnotify_unmount_inodes(&sb->s_inodes);
+
+		evict_inodes(sb);
 
 		if (sop->put_super)
 			sop->put_super(sb);
 
-		/* Forget any remaining inodes */
-		if (invalidate_inodes(sb)) {
+		if (!list_empty(&sb->s_inodes)) {
 			printk("VFS: Busy inodes after unmount of %s. "
 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
 			   sb->s_id);
@@ -305,8 +323,13 @@ retry:
 			if (s) {
 				up_write(&s->s_umount);
 				destroy_super(s);
+				s = NULL;
 			}
 			down_write(&old->s_umount);
+			if (unlikely(!(old->s_flags & MS_BORN))) {
+				deactivate_locked_super(old);
+				goto retry;
+			}
 			return old;
 		}
 	}
@@ -358,10 +381,10 @@ EXPORT_SYMBOL(drop_super);
  */
 void sync_supers(void)
 {
-	struct super_block *sb, *n;
+	struct super_block *sb, *p = NULL;
 
 	spin_lock(&sb_lock);
-	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (list_empty(&sb->s_instances))
 			continue;
 		if (sb->s_op->write_super && sb->s_dirt) {
@@ -374,11 +397,13 @@ void sync_supers(void)
 			up_read(&sb->s_umount);
 
 			spin_lock(&sb_lock);
-			/* lock was dropped, must reset next */
-			list_safe_reset_next(sb, n, s_list);
-			__put_super(sb);
+			if (p)
+				__put_super(p);
+			p = sb;
 		}
 	}
+	if (p)
+		__put_super(p);
 	spin_unlock(&sb_lock);
 }
 
@@ -392,10 +417,10 @@ void sync_supers(void)
  */
 void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 {
-	struct super_block *sb, *n;
+	struct super_block *sb, *p = NULL;
 
 	spin_lock(&sb_lock);
-	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (list_empty(&sb->s_instances))
 			continue;
 		sb->s_count++;
@@ -407,10 +432,12 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 		up_read(&sb->s_umount);
 
 		spin_lock(&sb_lock);
-		/* lock was dropped, must reset next */
-		list_safe_reset_next(sb, n, s_list);
-		__put_super(sb);
+		if (p)
+			__put_super(p);
+		p = sb;
 	}
+	if (p)
+		__put_super(p);
 	spin_unlock(&sb_lock);
 }
 
@@ -572,10 +599,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 
 static void do_emergency_remount(struct work_struct *work)
 {
-	struct super_block *sb, *n;
+	struct super_block *sb, *p = NULL;
 
 	spin_lock(&sb_lock);
-	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (list_empty(&sb->s_instances))
 			continue;
 		sb->s_count++;
@@ -589,10 +616,12 @@ static void do_emergency_remount(struct work_struct *work)
 		}
 		up_write(&sb->s_umount);
 		spin_lock(&sb_lock);
-		/* lock was dropped, must reset next */
-		list_safe_reset_next(sb, n, s_list);
-		__put_super(sb);
+		if (p)
+			__put_super(p);
+		p = sb;
 	}
+	if (p)
+		__put_super(p);
 	spin_unlock(&sb_lock);
 	kfree(work);
 	printk("Emergency Remount complete\n");
@@ -686,15 +715,14 @@ static int ns_set_super(struct super_block *sb, void *data)
 	return set_anon_super(sb, NULL);
 }
 
-int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
+	void *data, int (*fill_super)(struct super_block *, void *, int))
 {
 	struct super_block *sb;
 
 	sb = sget(fs_type, ns_test_super, ns_set_super, data);
 	if (IS_ERR(sb))
-		return PTR_ERR(sb);
+		return ERR_CAST(sb);
 
 	if (!sb->s_root) {
 		int err;
@@ -702,17 +730,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
 		err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 		if (err) {
 			deactivate_locked_super(sb);
-			return err;
+			return ERR_PTR(err);
 		}
 
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 }
 
-EXPORT_SYMBOL(get_sb_ns);
+EXPORT_SYMBOL(mount_ns);
 
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
@@ -733,10 +760,9 @@ static int test_bdev_super(struct super_block *s, void *data)
 	return (void *)s->s_bdev == data;
 }
 
-int get_sb_bdev(struct file_system_type *fs_type,
+struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	struct block_device *bdev;
 	struct super_block *s;
@@ -748,7 +774,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 
 	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+		return ERR_CAST(bdev);
 
 	/*
 	 * once the super is inserted into the list by sget, s_umount
@@ -773,7 +799,16 @@ int get_sb_bdev(struct file_system_type *fs_type,
 			goto error_bdev;
 		}
 
+		/*
+		 * s_umount nests inside bd_mutex during
+		 * __invalidate_device().  close_bdev_exclusive()
+		 * acquires bd_mutex and can't be called under
+		 * s_umount.  Drop s_umount temporarily.  This is safe
+		 * as we're holding an active reference.
+		 */
+		up_write(&s->s_umount);
 		close_bdev_exclusive(bdev, mode);
+		down_write(&s->s_umount);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -791,15 +826,30 @@ int get_sb_bdev(struct file_system_type *fs_type,
 		bdev->bd_super = s;
 	}
 
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
 	close_bdev_exclusive(bdev, mode);
 error:
-	return error;
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL(mount_bdev);
+
+int get_sb_bdev(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct dentry *root;
+
+	root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	return 0;
 }
 
 EXPORT_SYMBOL(get_sb_bdev);
@@ -818,29 +868,42 @@ void kill_block_super(struct super_block *sb)
 EXPORT_SYMBOL(kill_block_super);
 #endif
 
-int get_sb_nodev(struct file_system_type *fs_type,
+struct dentry *mount_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	int error;
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 
 	s->s_flags = flags;
 
 	error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 	if (error) {
 		deactivate_locked_super(s);
-		return error;
+		return ERR_PTR(error);
 	}
 	s->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 }
+EXPORT_SYMBOL(mount_nodev);
+
+int get_sb_nodev(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct dentry *root;
 
+	root = mount_nodev(fs_type, flags, data, fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	return 0;
+}
 EXPORT_SYMBOL(get_sb_nodev);
 
 static int compare_single(struct super_block *s, void *p)
@@ -848,29 +911,42 @@ static int compare_single(struct super_block *s, void *p)
 	return 1;
 }
 
-int get_sb_single(struct file_system_type *fs_type,
+struct dentry *mount_single(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	struct super_block *s;
 	int error;
 
 	s = sget(fs_type, compare_single, set_anon_super, NULL);
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 	if (!s->s_root) {
 		s->s_flags = flags;
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			deactivate_locked_super(s);
-			return error;
+			return ERR_PTR(error);
 		}
 		s->s_flags |= MS_ACTIVE;
 	} else {
 		do_remount_sb(s, flags, data, 0);
 	}
-	simple_set_mnt(mnt, s);
+	return dget(s->s_root);
+}
+EXPORT_SYMBOL(mount_single);
+
+int get_sb_single(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct dentry *root;
+	root = mount_single(fs_type, flags, data, fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
 	return 0;
 }
 
@@ -880,6 +956,7 @@ struct vfsmount *
 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
 	struct vfsmount *mnt;
+	struct dentry *root;
 	char *secdata = NULL;
 	int error;
 
@@ -904,11 +981,22 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 		goto out_free_secdata;
 	}
 
-	error = type->get_sb(type, flags, name, data, mnt);
-	if (error < 0)
-		goto out_free_secdata;
+	if (type->mount) {
+		root = type->mount(type, flags, name, data);
+		if (IS_ERR(root)) {
+			error = PTR_ERR(root);
+			goto out_free_secdata;
+		}
+		mnt->mnt_root = root;
+		mnt->mnt_sb = root->d_sb;
+	} else {
+		error = type->get_sb(type, flags, name, data, mnt);
+		if (error < 0)
+			goto out_free_secdata;
+	}
 	BUG_ON(!mnt->mnt_sb);
 	WARN_ON(!mnt->mnt_sb->s_bdi);
+	mnt->mnt_sb->s_flags |= MS_BORN;
 
 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
 	if (error)
diff --git a/fs/sync.c b/fs/sync.c
index 15aa6f03b2d..ba76b9623e7 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -128,31 +128,6 @@ void emergency_sync(void)
128 } 128 }
129} 129}
130 130
131/*
132 * Generic function to fsync a file.
133 */
134int file_fsync(struct file *filp, int datasync)
135{
136 struct inode *inode = filp->f_mapping->host;
137 struct super_block * sb;
138 int ret, err;
139
140 /* sync the inode to buffers */
141 ret = write_inode_now(inode, 0);
142
143 /* sync the superblock to buffers */
144 sb = inode->i_sb;
145 if (sb->s_dirt && sb->s_op->write_super)
146 sb->s_op->write_super(sb);
147
148 /* .. finally sync the buffers to disk */
149 err = sync_blockdev(sb->s_bdev);
150 if (!ret)
151 ret = err;
152 return ret;
153}
154EXPORT_SYMBOL(file_fsync);
155
156/** 131/**
157 * vfs_fsync_range - helper to sync a range of data & metadata to disk 132 * vfs_fsync_range - helper to sync a range of data & metadata to disk
158 * @file: file to sync 133 * @file: file to sync
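With file_fsync() gone from fs/sync.c, any filesystem that still relied on it has to provide its own ->fsync. The removed helper did three things, so an equivalent open-coded replacement is straightforward; this sketch simply restates the deleted body under the ->fsync(file, datasync) signature used throughout this diff:

    static int example_fsync(struct file *filp, int datasync)
    {
            struct inode *inode = filp->f_mapping->host;
            struct super_block *sb = inode->i_sb;
            int ret, err;

            /* sync the inode to buffers */
            ret = write_inode_now(inode, 0);

            /* sync the superblock to buffers */
            if (sb->s_dirt && sb->s_op->write_super)
                    sb->s_op->write_super(sb);

            /* .. finally sync the buffers to disk */
            err = sync_blockdev(sb->s_bdev);
            if (!ret)
                    ret = err;
            return ret;
    }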
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 4e321f7353f..a4759833d62 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -179,30 +179,14 @@ static void bin_vma_open(struct vm_area_struct *vma)
179 struct bin_buffer *bb = file->private_data; 179 struct bin_buffer *bb = file->private_data;
180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
181 181
182 if (!bb->vm_ops || !bb->vm_ops->open) 182 if (!bb->vm_ops)
183 return;
184
185 if (!sysfs_get_active(attr_sd))
186 return;
187
188 bb->vm_ops->open(vma);
189
190 sysfs_put_active(attr_sd);
191}
192
193static void bin_vma_close(struct vm_area_struct *vma)
194{
195 struct file *file = vma->vm_file;
196 struct bin_buffer *bb = file->private_data;
197 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
198
199 if (!bb->vm_ops || !bb->vm_ops->close)
200 return; 183 return;
201 184
202 if (!sysfs_get_active(attr_sd)) 185 if (!sysfs_get_active(attr_sd))
203 return; 186 return;
204 187
205 bb->vm_ops->close(vma); 188 if (bb->vm_ops->open)
189 bb->vm_ops->open(vma);
206 190
207 sysfs_put_active(attr_sd); 191 sysfs_put_active(attr_sd);
208} 192}
@@ -214,13 +198,15 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
214 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 198 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
215 int ret; 199 int ret;
216 200
217 if (!bb->vm_ops || !bb->vm_ops->fault) 201 if (!bb->vm_ops)
218 return VM_FAULT_SIGBUS; 202 return VM_FAULT_SIGBUS;
219 203
220 if (!sysfs_get_active(attr_sd)) 204 if (!sysfs_get_active(attr_sd))
221 return VM_FAULT_SIGBUS; 205 return VM_FAULT_SIGBUS;
222 206
223 ret = bb->vm_ops->fault(vma, vmf); 207 ret = VM_FAULT_SIGBUS;
208 if (bb->vm_ops->fault)
209 ret = bb->vm_ops->fault(vma, vmf);
224 210
225 sysfs_put_active(attr_sd); 211 sysfs_put_active(attr_sd);
226 return ret; 212 return ret;
@@ -236,13 +222,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
236 if (!bb->vm_ops) 222 if (!bb->vm_ops)
237 return VM_FAULT_SIGBUS; 223 return VM_FAULT_SIGBUS;
238 224
239 if (!bb->vm_ops->page_mkwrite)
240 return 0;
241
242 if (!sysfs_get_active(attr_sd)) 225 if (!sysfs_get_active(attr_sd))
243 return VM_FAULT_SIGBUS; 226 return VM_FAULT_SIGBUS;
244 227
245 ret = bb->vm_ops->page_mkwrite(vma, vmf); 228 ret = 0;
229 if (bb->vm_ops->page_mkwrite)
230 ret = bb->vm_ops->page_mkwrite(vma, vmf);
246 231
247 sysfs_put_active(attr_sd); 232 sysfs_put_active(attr_sd);
248 return ret; 233 return ret;
@@ -256,13 +241,15 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
256 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 241 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
257 int ret; 242 int ret;
258 243
259 if (!bb->vm_ops || !bb->vm_ops->access) 244 if (!bb->vm_ops)
260 return -EINVAL; 245 return -EINVAL;
261 246
262 if (!sysfs_get_active(attr_sd)) 247 if (!sysfs_get_active(attr_sd))
263 return -EINVAL; 248 return -EINVAL;
264 249
265 ret = bb->vm_ops->access(vma, addr, buf, len, write); 250 ret = -EINVAL;
251 if (bb->vm_ops->access)
252 ret = bb->vm_ops->access(vma, addr, buf, len, write);
266 253
267 sysfs_put_active(attr_sd); 254 sysfs_put_active(attr_sd);
268 return ret; 255 return ret;
@@ -276,13 +263,15 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
276 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 263 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
277 int ret; 264 int ret;
278 265
279 if (!bb->vm_ops || !bb->vm_ops->set_policy) 266 if (!bb->vm_ops)
280 return 0; 267 return 0;
281 268
282 if (!sysfs_get_active(attr_sd)) 269 if (!sysfs_get_active(attr_sd))
283 return -EINVAL; 270 return -EINVAL;
284 271
285 ret = bb->vm_ops->set_policy(vma, new); 272 ret = 0;
273 if (bb->vm_ops->set_policy)
274 ret = bb->vm_ops->set_policy(vma, new);
286 275
287 sysfs_put_active(attr_sd); 276 sysfs_put_active(attr_sd);
288 return ret; 277 return ret;
@@ -296,13 +285,15 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
296 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 285 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
297 struct mempolicy *pol; 286 struct mempolicy *pol;
298 287
299 if (!bb->vm_ops || !bb->vm_ops->get_policy) 288 if (!bb->vm_ops)
300 return vma->vm_policy; 289 return vma->vm_policy;
301 290
302 if (!sysfs_get_active(attr_sd)) 291 if (!sysfs_get_active(attr_sd))
303 return vma->vm_policy; 292 return vma->vm_policy;
304 293
305 pol = bb->vm_ops->get_policy(vma, addr); 294 pol = vma->vm_policy;
295 if (bb->vm_ops->get_policy)
296 pol = bb->vm_ops->get_policy(vma, addr);
306 297
307 sysfs_put_active(attr_sd); 298 sysfs_put_active(attr_sd);
308 return pol; 299 return pol;
@@ -316,13 +307,15 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
316 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 307 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
317 int ret; 308 int ret;
318 309
319 if (!bb->vm_ops || !bb->vm_ops->migrate) 310 if (!bb->vm_ops)
320 return 0; 311 return 0;
321 312
322 if (!sysfs_get_active(attr_sd)) 313 if (!sysfs_get_active(attr_sd))
323 return 0; 314 return 0;
324 315
325 ret = bb->vm_ops->migrate(vma, from, to, flags); 316 ret = 0;
317 if (bb->vm_ops->migrate)
318 ret = bb->vm_ops->migrate(vma, from, to, flags);
326 319
327 sysfs_put_active(attr_sd); 320 sysfs_put_active(attr_sd);
328 return ret; 321 return ret;
@@ -331,7 +324,6 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
331 324
332static const struct vm_operations_struct bin_vm_ops = { 325static const struct vm_operations_struct bin_vm_ops = {
333 .open = bin_vma_open, 326 .open = bin_vma_open,
334 .close = bin_vma_close,
335 .fault = bin_fault, 327 .fault = bin_fault,
336 .page_mkwrite = bin_page_mkwrite, 328 .page_mkwrite = bin_page_mkwrite,
337 .access = bin_access, 329 .access = bin_access,
@@ -377,6 +369,14 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
377 if (bb->mmapped && bb->vm_ops != vma->vm_ops) 369 if (bb->mmapped && bb->vm_ops != vma->vm_ops)
378 goto out_put; 370 goto out_put;
379 371
372 /*
 373 * It is not possible to successfully wrap close,
 374 * so error out if someone is trying to use close.
375 */
376 rc = -EINVAL;
377 if (vma->vm_ops && vma->vm_ops->close)
378 goto out_put;
379
380 rc = 0; 380 rc = 0;
381 bb->mmapped = 1; 381 bb->mmapped = 1;
382 bb->vm_ops = vma->vm_ops; 382 bb->vm_ops = vma->vm_ops;
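All of the bin.c wrappers above now follow one shape: return early when nothing was wrapped, pin the sysfs node, then dispatch only if the wrapped method exists, otherwise fall back to that operation's default result. A generic sketch of the pattern ("example" is a hypothetical vm_operations_struct method standing in for fault, page_mkwrite, access and friends; the per-op defaults differ, as the hunks show):

    static int bin_example(struct vm_area_struct *vma, int default_ret)
    {
            struct file *file = vma->vm_file;
            struct bin_buffer *bb = file->private_data;
            struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
            int ret;

            if (!bb->vm_ops)
                    return default_ret;

            if (!sysfs_get_active(attr_sd))
                    return default_ret;

            ret = default_ret;
            if (bb->vm_ops->example)          /* hypothetical method */
                    ret = bb->vm_ops->example(vma);

            sysfs_put_active(attr_sd);
            return ret;
    }

close is the one method this scheme cannot cover — as the new comment in mmap() puts it, it is not possible to successfully wrap close — hence the -EINVAL check added there.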
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1beaa739d0a..da3fefe91a8 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
340 char *p; 340 char *p;
341 341
342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); 342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
343 if (p) 343 if (!IS_ERR(p))
344 memmove(last_sysfs_file, p, strlen(p) + 1); 344 memmove(last_sysfs_file, p, strlen(p) + 1);
345 345
346 /* need attr_sd for attr and ops, its parent for kobj */ 346 /* need attr_sd for attr and ops, its parent for kobj */
@@ -593,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
593 * @mode: file permissions. 593 * @mode: file permissions.
594 * 594 *
595 */ 595 */
596int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) 596int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
597 mode_t mode)
597{ 598{
598 struct sysfs_dirent *sd; 599 struct sysfs_dirent *sd;
599 struct iattr newattrs; 600 struct iattr newattrs;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 23c1e598792..442f34ff1af 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -148,6 +148,65 @@ void sysfs_remove_group(struct kobject * kobj,
148 sysfs_put(sd); 148 sysfs_put(sd);
149} 149}
150 150
151/**
152 * sysfs_merge_group - merge files into a pre-existing attribute group.
153 * @kobj: The kobject containing the group.
154 * @grp: The files to create and the attribute group they belong to.
155 *
156 * This function returns an error if the group doesn't exist or any of the
157 * files already exist in that group, in which case none of the new files
158 * are created.
159 */
160int sysfs_merge_group(struct kobject *kobj,
161 const struct attribute_group *grp)
162{
163 struct sysfs_dirent *dir_sd;
164 int error = 0;
165 struct attribute *const *attr;
166 int i;
167
168 if (grp)
169 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
170 else
171 dir_sd = sysfs_get(kobj->sd);
172 if (!dir_sd)
173 return -ENOENT;
174
175 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
176 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
177 if (error) {
178 while (--i >= 0)
179 sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
180 }
181 sysfs_put(dir_sd);
182
183 return error;
184}
185EXPORT_SYMBOL_GPL(sysfs_merge_group);
186
187/**
188 * sysfs_unmerge_group - remove files from a pre-existing attribute group.
189 * @kobj: The kobject containing the group.
190 * @grp: The files to remove and the attribute group they belong to.
191 */
192void sysfs_unmerge_group(struct kobject *kobj,
193 const struct attribute_group *grp)
194{
195 struct sysfs_dirent *dir_sd;
196 struct attribute *const *attr;
197
198 if (grp)
199 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
200 else
201 dir_sd = sysfs_get(kobj->sd);
202 if (dir_sd) {
203 for (attr = grp->attrs; *attr; ++attr)
204 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
205 sysfs_put(dir_sd);
206 }
207}
208EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
209
151 210
152EXPORT_SYMBOL_GPL(sysfs_create_group); 211EXPORT_SYMBOL_GPL(sysfs_create_group);
153EXPORT_SYMBOL_GPL(sysfs_update_group); 212EXPORT_SYMBOL_GPL(sysfs_update_group);
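The new helpers let a driver add files into a group directory that some other layer created. A hedged usage sketch; every identifier here (example_show, example_attr, the "power" group name) is hypothetical, and the one hard requirement per the code above is that .name matches a group that already exists on the kobject:

    static struct kobj_attribute example_attr =
            __ATTR(example, 0444, example_show, NULL);

    static struct attribute *example_extra_attrs[] = {
            &example_attr.attr,
            NULL,
    };

    static const struct attribute_group example_extra_group = {
            .name  = "power",       /* pre-existing group directory */
            .attrs = example_extra_attrs,
    };

    /* creation: fails with -ENOENT if the group is missing */
    error = sysfs_merge_group(kobj, &example_extra_group);

    /* teardown */
    sysfs_unmerge_group(kobj, &example_extra_group);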
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0835a3b70e0..cffb1fd8ba3 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -122,7 +122,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
122 goto out; 122 goto out;
123 123
124 /* this ignores size changes */ 124 /* this ignores size changes */
125 generic_setattr(inode, iattr); 125 setattr_copy(inode, iattr);
126 126
127out: 127out:
128 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
@@ -312,15 +312,15 @@ struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
312 * The sysfs_dirent serves as both an inode and a directory entry for sysfs. 312 * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
313 * To prevent the sysfs inode numbers from being freed prematurely we take a 313 * To prevent the sysfs inode numbers from being freed prematurely we take a
314 * reference to sysfs_dirent from the sysfs inode. A 314 * reference to sysfs_dirent from the sysfs inode. A
315 * super_operations.delete_inode() implementation is needed to drop that 315 * super_operations.evict_inode() implementation is needed to drop that
316 * reference upon inode destruction. 316 * reference upon inode destruction.
317 */ 317 */
318void sysfs_delete_inode(struct inode *inode) 318void sysfs_evict_inode(struct inode *inode)
319{ 319{
320 struct sysfs_dirent *sd = inode->i_private; 320 struct sysfs_dirent *sd = inode->i_private;
321 321
322 truncate_inode_pages(&inode->i_data, 0); 322 truncate_inode_pages(&inode->i_data, 0);
323 clear_inode(inode); 323 end_writeback(inode);
324 sysfs_put(sd); 324 sysfs_put(sd);
325} 325}
326 326
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 281c0c9bc39..266895783b4 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,13 +23,13 @@
23#include "sysfs.h" 23#include "sysfs.h"
24 24
25 25
26static struct vfsmount *sysfs_mount; 26static struct vfsmount *sysfs_mnt;
27struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
28 28
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
30 .statfs = simple_statfs, 30 .statfs = simple_statfs,
31 .drop_inode = generic_delete_inode, 31 .drop_inode = generic_delete_inode,
32 .delete_inode = sysfs_delete_inode, 32 .evict_inode = sysfs_evict_inode,
33}; 33};
34 34
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
@@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data)
95 return error; 95 return error;
96} 96}
97 97
98static int sysfs_get_sb(struct file_system_type *fs_type, 98static struct dentry *sysfs_mount(struct file_system_type *fs_type,
99 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data)
100{ 100{
101 struct sysfs_super_info *info; 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type; 102 enum kobj_ns_type type;
103 struct super_block *sb; 103 struct super_block *sb;
104 int error; 104 int error;
105 105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL); 106 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info) 107 if (!info)
109 goto out; 108 return ERR_PTR(-ENOMEM);
110 109
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) 110 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type); 111 info->ns[type] = kobj_ns_current(type);
@@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); 113 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info) 114 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info); 115 kfree(info);
117 if (IS_ERR(sb)) { 116 if (IS_ERR(sb))
118 error = PTR_ERR(sb); 117 return ERR_CAST(sb);
119 goto out;
120 }
121 if (!sb->s_root) { 118 if (!sb->s_root) {
122 sb->s_flags = flags; 119 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); 120 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) { 121 if (error) {
125 deactivate_locked_super(sb); 122 deactivate_locked_super(sb);
126 goto out; 123 return ERR_PTR(error);
127 } 124 }
128 sb->s_flags |= MS_ACTIVE; 125 sb->s_flags |= MS_ACTIVE;
129 } 126 }
130 127
131 simple_set_mnt(mnt, sb); 128 return dget(sb->s_root);
132 error = 0;
133out:
134 return error;
135} 129}
136 130
137static void sysfs_kill_sb(struct super_block *sb) 131static void sysfs_kill_sb(struct super_block *sb)
@@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb)
147 141
148static struct file_system_type sysfs_fs_type = { 142static struct file_system_type sysfs_fs_type = {
149 .name = "sysfs", 143 .name = "sysfs",
150 .get_sb = sysfs_get_sb, 144 .mount = sysfs_mount,
151 .kill_sb = sysfs_kill_sb, 145 .kill_sb = sysfs_kill_sb,
152}; 146};
153 147
@@ -189,11 +183,11 @@ int __init sysfs_init(void)
189 183
190 err = register_filesystem(&sysfs_fs_type); 184 err = register_filesystem(&sysfs_fs_type);
191 if (!err) { 185 if (!err) {
192 sysfs_mount = kern_mount(&sysfs_fs_type); 186 sysfs_mnt = kern_mount(&sysfs_fs_type);
193 if (IS_ERR(sysfs_mount)) { 187 if (IS_ERR(sysfs_mnt)) {
194 printk(KERN_ERR "sysfs: could not mount!\n"); 188 printk(KERN_ERR "sysfs: could not mount!\n");
195 err = PTR_ERR(sysfs_mount); 189 err = PTR_ERR(sysfs_mnt);
196 sysfs_mount = NULL; 190 sysfs_mnt = NULL;
197 unregister_filesystem(&sysfs_fs_type); 191 unregister_filesystem(&sysfs_fs_type);
198 goto out_err; 192 goto out_err;
199 } 193 }
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 6a13105b559..d9be60a2e95 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -198,7 +198,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
198 * inode.c 198 * inode.c
199 */ 199 */
200struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); 200struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
201void sysfs_delete_inode(struct inode *inode); 201void sysfs_evict_inode(struct inode *inode);
202int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 202int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
203int sysfs_permission(struct inode *inode, int mask); 203int sysfs_permission(struct inode *inode, int mask);
204int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 204int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 79941e4964a..a77c4215762 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -218,8 +218,7 @@ got_it:
218 pos = page_offset(page) + 218 pos = page_offset(page) +
219 (char*)de - (char*)page_address(page); 219 (char*)de - (char*)page_address(page);
220 lock_page(page); 220 lock_page(page);
221 err = __sysv_write_begin(NULL, page->mapping, pos, SYSV_DIRSIZE, 221 err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
222 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
223 if (err) 222 if (err)
224 goto out_unlock; 223 goto out_unlock;
225 memcpy (de->name, name, namelen); 224 memcpy (de->name, name, namelen);
@@ -239,15 +238,13 @@ out_unlock:
239 238
240int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) 239int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
241{ 240{
242 struct address_space *mapping = page->mapping; 241 struct inode *inode = page->mapping->host;
243 struct inode *inode = (struct inode*)mapping->host;
244 char *kaddr = (char*)page_address(page); 242 char *kaddr = (char*)page_address(page);
245 loff_t pos = page_offset(page) + (char *)de - kaddr; 243 loff_t pos = page_offset(page) + (char *)de - kaddr;
246 int err; 244 int err;
247 245
248 lock_page(page); 246 lock_page(page);
249 err = __sysv_write_begin(NULL, mapping, pos, SYSV_DIRSIZE, 247 err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
250 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
251 BUG_ON(err); 248 BUG_ON(err);
252 de->inode = 0; 249 de->inode = 0;
253 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); 250 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
@@ -259,16 +256,14 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
259 256
260int sysv_make_empty(struct inode *inode, struct inode *dir) 257int sysv_make_empty(struct inode *inode, struct inode *dir)
261{ 258{
262 struct address_space *mapping = inode->i_mapping; 259 struct page *page = grab_cache_page(inode->i_mapping, 0);
263 struct page *page = grab_cache_page(mapping, 0);
264 struct sysv_dir_entry * de; 260 struct sysv_dir_entry * de;
265 char *base; 261 char *base;
266 int err; 262 int err;
267 263
268 if (!page) 264 if (!page)
269 return -ENOMEM; 265 return -ENOMEM;
270 err = __sysv_write_begin(NULL, mapping, 0, 2 * SYSV_DIRSIZE, 266 err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE);
271 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
272 if (err) { 267 if (err) {
273 unlock_page(page); 268 unlock_page(page);
274 goto fail; 269 goto fail;
@@ -341,15 +336,13 @@ not_empty:
341void sysv_set_link(struct sysv_dir_entry *de, struct page *page, 336void sysv_set_link(struct sysv_dir_entry *de, struct page *page,
342 struct inode *inode) 337 struct inode *inode)
343{ 338{
344 struct address_space *mapping = page->mapping; 339 struct inode *dir = page->mapping->host;
345 struct inode *dir = mapping->host;
346 loff_t pos = page_offset(page) + 340 loff_t pos = page_offset(page) +
347 (char *)de-(char*)page_address(page); 341 (char *)de-(char*)page_address(page);
348 int err; 342 int err;
349 343
350 lock_page(page); 344 lock_page(page);
351 err = __sysv_write_begin(NULL, mapping, pos, SYSV_DIRSIZE, 345 err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
352 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
353 BUG_ON(err); 346 BUG_ON(err);
354 de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); 347 de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
355 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); 348 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 750cc22349b..0a65939508e 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -30,7 +30,29 @@ const struct file_operations sysv_file_operations = {
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
33static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
34{
35 struct inode *inode = dentry->d_inode;
36 int error;
37
38 error = inode_change_ok(inode, attr);
39 if (error)
40 return error;
41
42 if ((attr->ia_valid & ATTR_SIZE) &&
43 attr->ia_size != i_size_read(inode)) {
44 error = vmtruncate(inode, attr->ia_size);
45 if (error)
46 return error;
47 }
48
49 setattr_copy(inode, attr);
50 mark_inode_dirty(inode);
51 return 0;
52}
53
33const struct inode_operations sysv_file_inode_operations = { 54const struct inode_operations sysv_file_inode_operations = {
34 .truncate = sysv_truncate, 55 .truncate = sysv_truncate,
56 .setattr = sysv_setattr,
35 .getattr = sysv_getattr, 57 .getattr = sysv_getattr,
36}; 58};
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index fcc498ec9b3..0c96c98bd1d 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -113,7 +113,6 @@ void sysv_free_inode(struct inode * inode)
113 return; 113 return;
114 } 114 }
115 raw_inode = sysv_raw_inode(sb, ino, &bh); 115 raw_inode = sysv_raw_inode(sb, ino, &bh);
116 clear_inode(inode);
117 if (!raw_inode) { 116 if (!raw_inode) {
118 printk("sysv_free_inode: unable to read inode block on device " 117 printk("sysv_free_inode: unable to read inode block on device "
119 "%s\n", inode->i_sb->s_id); 118 "%s\n", inode->i_sb->s_id);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index d4a5380b566..de44d067b9e 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -71,8 +71,8 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
71 lock_super(sb); 71 lock_super(sb);
72 if (sbi->s_forced_ro) 72 if (sbi->s_forced_ro)
73 *flags |= MS_RDONLY; 73 *flags |= MS_RDONLY;
74 if (!(*flags & MS_RDONLY)) 74 if (*flags & MS_RDONLY)
75 sb->s_dirt = 1; 75 sysv_write_super(sb);
76 unlock_super(sb); 76 unlock_super(sb);
77 return 0; 77 return 0;
78} 78}
@@ -308,12 +308,17 @@ int sysv_sync_inode(struct inode *inode)
308 return __sysv_write_inode(inode, 1); 308 return __sysv_write_inode(inode, 1);
309} 309}
310 310
311static void sysv_delete_inode(struct inode *inode) 311static void sysv_evict_inode(struct inode *inode)
312{ 312{
313 truncate_inode_pages(&inode->i_data, 0); 313 truncate_inode_pages(&inode->i_data, 0);
314 inode->i_size = 0; 314 if (!inode->i_nlink) {
315 sysv_truncate(inode); 315 inode->i_size = 0;
316 sysv_free_inode(inode); 316 sysv_truncate(inode);
317 }
318 invalidate_inode_buffers(inode);
319 end_writeback(inode);
320 if (!inode->i_nlink)
321 sysv_free_inode(inode);
317} 322}
318 323
319static struct kmem_cache *sysv_inode_cachep; 324static struct kmem_cache *sysv_inode_cachep;
@@ -344,7 +349,7 @@ const struct super_operations sysv_sops = {
344 .alloc_inode = sysv_alloc_inode, 349 .alloc_inode = sysv_alloc_inode,
345 .destroy_inode = sysv_destroy_inode, 350 .destroy_inode = sysv_destroy_inode,
346 .write_inode = sysv_write_inode, 351 .write_inode = sysv_write_inode,
347 .delete_inode = sysv_delete_inode, 352 .evict_inode = sysv_evict_inode,
348 .put_super = sysv_put_super, 353 .put_super = sysv_put_super,
349 .write_super = sysv_write_super, 354 .write_super = sysv_write_super,
350 .sync_fs = sysv_sync_fs, 355 .sync_fs = sysv_sync_fs,
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index f042eec464c..9ca66276315 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -459,20 +459,25 @@ static int sysv_readpage(struct file *file, struct page *page)
459 return block_read_full_page(page,get_block); 459 return block_read_full_page(page,get_block);
460} 460}
461 461
462int __sysv_write_begin(struct file *file, struct address_space *mapping, 462int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
463 loff_t pos, unsigned len, unsigned flags,
464 struct page **pagep, void **fsdata)
465{ 463{
466 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 464 return __block_write_begin(page, pos, len, get_block);
467 get_block);
468} 465}
469 466
470static int sysv_write_begin(struct file *file, struct address_space *mapping, 467static int sysv_write_begin(struct file *file, struct address_space *mapping,
471 loff_t pos, unsigned len, unsigned flags, 468 loff_t pos, unsigned len, unsigned flags,
472 struct page **pagep, void **fsdata) 469 struct page **pagep, void **fsdata)
473{ 470{
474 *pagep = NULL; 471 int ret;
475 return __sysv_write_begin(file, mapping, pos, len, flags, pagep, fsdata); 472
473 ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
474 if (unlikely(ret)) {
475 loff_t isize = mapping->host->i_size;
476 if (pos + len > isize)
477 vmtruncate(mapping->host, isize);
478 }
479
480 return ret;
476} 481}
477 482
478static sector_t sysv_bmap(struct address_space *mapping, sector_t block) 483static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
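The sysv_write_begin() change above reflects the new block_write_begin() calling convention: the helper no longer unwinds blocks instantiated past i_size when it fails, so the caller must do it. The pattern is generic for blockdev-backed filesystems; sketch with a hypothetical example_get_block:

    static int example_write_begin(struct file *file,
                    struct address_space *mapping, loff_t pos, unsigned len,
                    unsigned flags, struct page **pagep, void **fsdata)
    {
            int ret;

            ret = block_write_begin(mapping, pos, len, flags, pagep,
                                    example_get_block);
            if (unlikely(ret)) {
                    /* trim blocks allocated beyond the old EOF */
                    loff_t isize = mapping->host->i_size;
                    if (pos + len > isize)
                            vmtruncate(mapping->host, isize);
            }
            return ret;
    }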
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8..11e7f7d11cd 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
126 126
127 inode->i_ctime = CURRENT_TIME_SEC; 127 inode->i_ctime = CURRENT_TIME_SEC;
128 inode_inc_link_count(inode); 128 inode_inc_link_count(inode);
129 atomic_inc(&inode->i_count); 129 ihold(inode);
130 130
131 return add_nondir(dentry, inode); 131 return add_nondir(dentry, inode);
132} 132}
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 5a903da5455..3d9c62be0c1 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -347,7 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
347 sb->s_flags |= MS_RDONLY; 347 sb->s_flags |= MS_RDONLY;
348 if (sbi->s_truncate) 348 if (sbi->s_truncate)
349 sb->s_root->d_op = &sysv_dentry_operations; 349 sb->s_root->d_op = &sysv_dentry_operations;
350 sb->s_dirt = 1;
351 return 1; 350 return 1;
352} 351}
353 352
@@ -435,12 +434,46 @@ Ebadsize:
435 goto failed; 434 goto failed;
436} 435}
437 436
438static int v7_fill_super(struct super_block *sb, void *data, int silent) 437static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
439{ 438{
440 struct sysv_sb_info *sbi;
441 struct buffer_head *bh, *bh2 = NULL;
442 struct v7_super_block *v7sb; 439 struct v7_super_block *v7sb;
443 struct sysv_inode *v7i; 440 struct sysv_inode *v7i;
441 struct buffer_head *bh2;
442 struct sysv_sb_info *sbi;
443
444 sbi = sb->s_fs_info;
445
446 /* plausibility check on superblock */
447 v7sb = (struct v7_super_block *) bh->b_data;
448 if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
449 fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
450 fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
451 return 0;
452
453 /* plausibility check on root inode: it is a directory,
454 with a nonzero size that is a multiple of 16 */
455 bh2 = sb_bread(sb, 2);
456 if (bh2 == NULL)
457 return 0;
458
459 v7i = (struct sysv_inode *)(bh2->b_data + 64);
460 if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
461 (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
462 (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
463 (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
464 sizeof(struct sysv_dir_entry))) {
465 brelse(bh2);
466 return 0;
467 }
468
469 brelse(bh2);
470 return 1;
471}
472
473static int v7_fill_super(struct super_block *sb, void *data, int silent)
474{
475 struct sysv_sb_info *sbi;
476 struct buffer_head *bh;
444 477
445 if (440 != sizeof (struct v7_super_block)) 478 if (440 != sizeof (struct v7_super_block))
446 panic("V7 FS: bad super-block size"); 479 panic("V7 FS: bad super-block size");
@@ -454,7 +487,6 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
454 sbi->s_sb = sb; 487 sbi->s_sb = sb;
455 sbi->s_block_base = 0; 488 sbi->s_block_base = 0;
456 sbi->s_type = FSTYPE_V7; 489 sbi->s_type = FSTYPE_V7;
457 sbi->s_bytesex = BYTESEX_PDP;
458 sb->s_fs_info = sbi; 490 sb->s_fs_info = sbi;
459 491
460 sb_set_blocksize(sb, 512); 492 sb_set_blocksize(sb, 512);
@@ -466,32 +498,27 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
466 goto failed; 498 goto failed;
467 } 499 }
468 500
469 /* plausibility check on superblock */ 501 /* Try PDP-11 UNIX */
470 v7sb = (struct v7_super_block *) bh->b_data; 502 sbi->s_bytesex = BYTESEX_PDP;
471 if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || 503 if (v7_sanity_check(sb, bh))
472 fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || 504 goto detected;
473 fs32_to_cpu(sbi, v7sb->s_time) == 0)
474 goto failed;
475 505
476 /* plausibility check on root inode: it is a directory, 506 /* Try PC/IX, v7/x86 */
477 with a nonzero size that is a multiple of 16 */ 507 sbi->s_bytesex = BYTESEX_LE;
478 if ((bh2 = sb_bread(sb, 2)) == NULL) 508 if (v7_sanity_check(sb, bh))
479 goto failed; 509 goto detected;
480 v7i = (struct sysv_inode *)(bh2->b_data + 64);
481 if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
482 (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
483 (fs32_to_cpu(sbi, v7i->i_size) & 017) != 0)
484 goto failed;
485 brelse(bh2);
486 bh2 = NULL;
487 510
511 goto failed;
512
513detected:
488 sbi->s_bh1 = bh; 514 sbi->s_bh1 = bh;
489 sbi->s_bh2 = bh; 515 sbi->s_bh2 = bh;
490 if (complete_read_super(sb, silent, 1)) 516 if (complete_read_super(sb, silent, 1))
491 return 0; 517 return 0;
492 518
493failed: 519failed:
494 brelse(bh2); 520 printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
521 sb->s_id);
495 brelse(bh); 522 brelse(bh);
496 kfree(sbi); 523 kfree(sbi);
497 return -EINVAL; 524 return -EINVAL;
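Factoring the plausibility checks into v7_sanity_check() is what makes the byte-sex probing above work: the fs16/fs32 helpers dispatch on sbi->s_bytesex at call time, so the same superblock buffer can be re-validated under each candidate byte order. An equivalent table-driven sketch of the goto-based detection code:

    static const int v7_bytesex_candidates[] = { BYTESEX_PDP, BYTESEX_LE };

    static int v7_detect_bytesex(struct super_block *sb,
                                 struct buffer_head *bh)
    {
            struct sysv_sb_info *sbi = sb->s_fs_info;
            int i;

            for (i = 0; i < ARRAY_SIZE(v7_bytesex_candidates); i++) {
                    sbi->s_bytesex = v7_bytesex_candidates[i];
                    if (v7_sanity_check(sb, bh))
                            return 1;   /* s_bytesex is now set correctly */
            }
            return 0;
    }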
@@ -499,23 +526,22 @@ failed:
499 526
500/* Every kernel module contains stuff like this. */ 527/* Every kernel module contains stuff like this. */
501 528
502static int sysv_get_sb(struct file_system_type *fs_type, 529static struct dentry *sysv_mount(struct file_system_type *fs_type,
503 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 530 int flags, const char *dev_name, void *data)
504{ 531{
505 return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super, 532 return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
506 mnt);
507} 533}
508 534
509static int v7_get_sb(struct file_system_type *fs_type, 535static struct dentry *v7_mount(struct file_system_type *fs_type,
510 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 536 int flags, const char *dev_name, void *data)
511{ 537{
512 return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt); 538 return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
513} 539}
514 540
515static struct file_system_type sysv_fs_type = { 541static struct file_system_type sysv_fs_type = {
516 .owner = THIS_MODULE, 542 .owner = THIS_MODULE,
517 .name = "sysv", 543 .name = "sysv",
518 .get_sb = sysv_get_sb, 544 .mount = sysv_mount,
519 .kill_sb = kill_block_super, 545 .kill_sb = kill_block_super,
520 .fs_flags = FS_REQUIRES_DEV, 546 .fs_flags = FS_REQUIRES_DEV,
521}; 547};
@@ -523,7 +549,7 @@ static struct file_system_type sysv_fs_type = {
523static struct file_system_type v7_fs_type = { 549static struct file_system_type v7_fs_type = {
524 .owner = THIS_MODULE, 550 .owner = THIS_MODULE,
525 .name = "v7", 551 .name = "v7",
526 .get_sb = v7_get_sb, 552 .mount = v7_mount,
527 .kill_sb = kill_block_super, 553 .kill_sb = kill_block_super,
528 .fs_flags = FS_REQUIRES_DEV, 554 .fs_flags = FS_REQUIRES_DEV,
529}; 555};
@@ -560,4 +586,5 @@ static void __exit exit_sysv_fs(void)
560 586
561module_init(init_sysv_fs) 587module_init(init_sysv_fs)
562module_exit(exit_sysv_fs) 588module_exit(exit_sysv_fs)
589MODULE_ALIAS("v7");
563MODULE_LICENSE("GPL"); 590MODULE_LICENSE("GPL");
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 94cb9b4d76c..bb55cdb394b 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -136,9 +136,7 @@ extern unsigned long sysv_count_free_blocks(struct super_block *);
136 136
137/* itree.c */ 137/* itree.c */
138extern void sysv_truncate(struct inode *); 138extern void sysv_truncate(struct inode *);
139extern int __sysv_write_begin(struct file *file, struct address_space *mapping, 139extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len);
140 loff_t pos, unsigned len, unsigned flags,
141 struct page **pagep, void **fsdata);
142 140
143/* inode.c */ 141/* inode.c */
144extern struct inode *sysv_iget(struct super_block *, unsigned int); 142extern struct inode *sysv_iget(struct super_block *, unsigned int);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b86ab8eff79..8c4fc1425b3 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -144,6 +144,7 @@ static const struct file_operations timerfd_fops = {
144 .release = timerfd_release, 144 .release = timerfd_release,
145 .poll = timerfd_poll, 145 .poll = timerfd_poll,
146 .read = timerfd_read, 146 .read = timerfd_read,
147 .llseek = noop_llseek,
147}; 148};
148 149
149static struct file *timerfd_fget(int fd) 150static struct file *timerfd_fget(int fd)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 37fa7ed062d..02429d81ca3 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -63,7 +63,9 @@ static int do_commit(struct ubifs_info *c)
63 struct ubifs_lp_stats lst; 63 struct ubifs_lp_stats lst;
64 64
65 dbg_cmt("start"); 65 dbg_cmt("start");
66 if (c->ro_media) { 66 ubifs_assert(!c->ro_media && !c->ro_mount);
67
68 if (c->ro_error) {
67 err = -EROFS; 69 err = -EROFS;
68 goto out_up; 70 goto out_up;
69 } 71 }
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68baa782..0bee4dbffc3 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2239,6 +2239,162 @@ out_free:
2239 return err; 2239 return err;
2240} 2240}
2241 2241
2242/**
2243 * dbg_check_data_nodes_order - check that list of data nodes is sorted.
2244 * @c: UBIFS file-system description object
2245 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2246 *
2247 * This function returns zero if the list of data nodes is sorted correctly,
2248 * and %-EINVAL if not.
2249 */
2250int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
2251{
2252 struct list_head *cur;
2253 struct ubifs_scan_node *sa, *sb;
2254
2255 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2256 return 0;
2257
2258 for (cur = head->next; cur->next != head; cur = cur->next) {
2259 ino_t inuma, inumb;
2260 uint32_t blka, blkb;
2261
2262 cond_resched();
2263 sa = container_of(cur, struct ubifs_scan_node, list);
2264 sb = container_of(cur->next, struct ubifs_scan_node, list);
2265
2266 if (sa->type != UBIFS_DATA_NODE) {
2267 ubifs_err("bad node type %d", sa->type);
2268 dbg_dump_node(c, sa->node);
2269 return -EINVAL;
2270 }
2271 if (sb->type != UBIFS_DATA_NODE) {
2272 ubifs_err("bad node type %d", sb->type);
2273 dbg_dump_node(c, sb->node);
2274 return -EINVAL;
2275 }
2276
2277 inuma = key_inum(c, &sa->key);
2278 inumb = key_inum(c, &sb->key);
2279
2280 if (inuma < inumb)
2281 continue;
2282 if (inuma > inumb) {
2283 ubifs_err("larger inum %lu goes before inum %lu",
2284 (unsigned long)inuma, (unsigned long)inumb);
2285 goto error_dump;
2286 }
2287
2288 blka = key_block(c, &sa->key);
2289 blkb = key_block(c, &sb->key);
2290
2291 if (blka > blkb) {
2292 ubifs_err("larger block %u goes before %u", blka, blkb);
2293 goto error_dump;
2294 }
2295 if (blka == blkb) {
2296 ubifs_err("two data nodes for the same block");
2297 goto error_dump;
2298 }
2299 }
2300
2301 return 0;
2302
2303error_dump:
2304 dbg_dump_node(c, sa->node);
2305 dbg_dump_node(c, sb->node);
2306 return -EINVAL;
2307}
2308
2309/**
 2310 * dbg_check_nondata_nodes_order - check that list of non-data nodes is sorted.
2311 * @c: UBIFS file-system description object
2312 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2313 *
2314 * This function returns zero if the list of non-data nodes is sorted correctly,
2315 * and %-EINVAL if not.
2316 */
2317int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
2318{
2319 struct list_head *cur;
2320 struct ubifs_scan_node *sa, *sb;
2321
2322 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2323 return 0;
2324
2325 for (cur = head->next; cur->next != head; cur = cur->next) {
2326 ino_t inuma, inumb;
2327 uint32_t hasha, hashb;
2328
2329 cond_resched();
2330 sa = container_of(cur, struct ubifs_scan_node, list);
2331 sb = container_of(cur->next, struct ubifs_scan_node, list);
2332
2333 if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
2334 sa->type != UBIFS_XENT_NODE) {
2335 ubifs_err("bad node type %d", sa->type);
2336 dbg_dump_node(c, sa->node);
2337 return -EINVAL;
2338 }
 2339 if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE &&
 2340 sb->type != UBIFS_XENT_NODE) {
2341 ubifs_err("bad node type %d", sb->type);
2342 dbg_dump_node(c, sb->node);
2343 return -EINVAL;
2344 }
2345
2346 if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2347 ubifs_err("non-inode node goes before inode node");
2348 goto error_dump;
2349 }
2350
2351 if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
2352 continue;
2353
2354 if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2355 /* Inode nodes are sorted in descending size order */
2356 if (sa->len < sb->len) {
2357 ubifs_err("smaller inode node goes first");
2358 goto error_dump;
2359 }
2360 continue;
2361 }
2362
2363 /*
2364 * This is either a dentry or xentry, which should be sorted in
2365 * ascending (parent ino, hash) order.
2366 */
2367 inuma = key_inum(c, &sa->key);
2368 inumb = key_inum(c, &sb->key);
2369
2370 if (inuma < inumb)
2371 continue;
2372 if (inuma > inumb) {
2373 ubifs_err("larger inum %lu goes before inum %lu",
2374 (unsigned long)inuma, (unsigned long)inumb);
2375 goto error_dump;
2376 }
2377
2378 hasha = key_block(c, &sa->key);
2379 hashb = key_block(c, &sb->key);
2380
2381 if (hasha > hashb) {
2382 ubifs_err("larger hash %u goes before %u", hasha, hashb);
2383 goto error_dump;
2384 }
2385 }
2386
2387 return 0;
2388
2389error_dump:
2390 ubifs_msg("dumping first node");
2391 dbg_dump_node(c, sa->node);
2392 ubifs_msg("dumping second node");
2393 dbg_dump_node(c, sb->node);
 2394 return -EINVAL;
2396}
2397
2242static int invocation_cnt; 2398static int invocation_cnt;
2243 2399
2244int dbg_force_in_the_gaps(void) 2400int dbg_force_in_the_gaps(void)
@@ -2625,6 +2781,7 @@ static const struct file_operations dfs_fops = {
2625 .open = open_debugfs_file, 2781 .open = open_debugfs_file,
2626 .write = write_debugfs_file, 2782 .write = write_debugfs_file,
2627 .owner = THIS_MODULE, 2783 .owner = THIS_MODULE,
2784 .llseek = default_llseek,
2628}; 2785};
2629 2786
2630/** 2787/**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 29d960101ea..69ebe472915 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -324,6 +324,8 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
324 int row, int col); 324 int row, int col);
325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, 325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
326 loff_t size); 326 loff_t size);
327int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
328int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
327 329
328/* Force the use of in-the-gaps method for testing */ 330/* Force the use of in-the-gaps method for testing */
329 331
@@ -465,6 +467,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
465#define dbg_check_lprops(c) 0 467#define dbg_check_lprops(c) 0
466#define dbg_check_lpt_nodes(c, cnode, row, col) 0 468#define dbg_check_lpt_nodes(c, cnode, row, col) 0
467#define dbg_check_inode_size(c, inode, size) 0 469#define dbg_check_inode_size(c, inode, size) 0
470#define dbg_check_data_nodes_order(c, head) 0
471#define dbg_check_nondata_nodes_order(c, head) 0
468#define dbg_force_in_the_gaps_enabled 0 472#define dbg_force_in_the_gaps_enabled 0
469#define dbg_force_in_the_gaps() 0 473#define dbg_force_in_the_gaps() 0
470#define dbg_failure_mode 0 474#define dbg_failure_mode 0
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce7221..14f64b689d7 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
550 550
551 lock_2_inodes(dir, inode); 551 lock_2_inodes(dir, inode);
552 inc_nlink(inode); 552 inc_nlink(inode);
553 atomic_inc(&inode->i_count); 553 ihold(inode);
554 inode->i_ctime = ubifs_current_time(inode); 554 inode->i_ctime = ubifs_current_time(inode);
555 dir->i_size += sz_change; 555 dir->i_size += sz_change;
556 dir_ui->ui_size = dir->i_size; 556 dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 12f445cee9f..d77db7e3648 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -433,8 +433,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
433 struct page *page; 433 struct page *page;
434 434
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
436 ubifs_assert(!c->ro_media && !c->ro_mount);
436 437
437 if (unlikely(c->ro_media)) 438 if (unlikely(c->ro_error))
438 return -EROFS; 439 return -EROFS;
439 440
440 /* Try out the fast-path part first */ 441 /* Try out the fast-path part first */
@@ -967,14 +968,15 @@ static int do_writepage(struct page *page, int len)
967 * the page locked, and it locks @ui_mutex. However, write-back does take inode 968 * the page locked, and it locks @ui_mutex. However, write-back does take inode
968 * @i_mutex, which means other VFS operations may be run on this inode at the 969 * @i_mutex, which means other VFS operations may be run on this inode at the
969 * same time. And the problematic one is truncation to smaller size, from where 970 * same time. And the problematic one is truncation to smaller size, from where
970 * we have to call 'simple_setsize()', which first changes @inode->i_size, then 971 * we have to call 'truncate_setsize()', which first changes @inode->i_size, then
971 * drops the truncated pages. And while dropping the pages, it takes the page 972 * drops the truncated pages. And while dropping the pages, it takes the page
972 * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with 973 * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with
973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 974 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
974 * means that @inode->i_size is changed while @ui_mutex is unlocked. 975 * means that @inode->i_size is changed while @ui_mutex is unlocked.
975 * 976 *
976 * XXX: with the new truncate the above is not true anymore, the simple_setsize 977 * XXX(truncate): with the new truncate sequence this is not true anymore,
 977 * calls can be replaced with the individual components. 978 * and the calls to truncate_setsize can be moved around freely. They should
979 * be moved to the very end of the truncate sequence.
978 * 980 *
979 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond 981 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
 980 * inode size. How do we do this if @inode->i_size may become smaller while we 982 * inode size. How do we do this if @inode->i_size may become smaller while we
@@ -1128,9 +1130,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
1128 budgeted = 0; 1130 budgeted = 0;
1129 } 1131 }
1130 1132
1131 err = simple_setsize(inode, new_size); 1133 truncate_setsize(inode, new_size);
1132 if (err)
1133 goto out_budg;
1134 1134
1135 if (offset) { 1135 if (offset) {
1136 pgoff_t index = new_size >> PAGE_CACHE_SHIFT; 1136 pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
@@ -1217,16 +1217,14 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1217 1217
1218 if (attr->ia_valid & ATTR_SIZE) { 1218 if (attr->ia_valid & ATTR_SIZE) {
1219 dbg_gen("size %lld -> %lld", inode->i_size, new_size); 1219 dbg_gen("size %lld -> %lld", inode->i_size, new_size);
1220 err = simple_setsize(inode, new_size); 1220 truncate_setsize(inode, new_size);
1221 if (err)
1222 goto out;
1223 } 1221 }
1224 1222
1225 mutex_lock(&ui->ui_mutex); 1223 mutex_lock(&ui->ui_mutex);
1226 if (attr->ia_valid & ATTR_SIZE) { 1224 if (attr->ia_valid & ATTR_SIZE) {
1227 /* Truncation changes inode [mc]time */ 1225 /* Truncation changes inode [mc]time */
1228 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1226 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1229 /* 'simple_setsize()' changed @i_size, update @ui_size */ 1227 /* 'truncate_setsize()' changed @i_size, update @ui_size */
1230 ui->ui_size = inode->i_size; 1228 ui->ui_size = inode->i_size;
1231 } 1229 }
1232 1230
@@ -1248,10 +1246,6 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1248 if (IS_SYNC(inode)) 1246 if (IS_SYNC(inode))
1249 err = inode->i_sb->s_op->write_inode(inode, NULL); 1247 err = inode->i_sb->s_op->write_inode(inode, NULL);
1250 return err; 1248 return err;
1251
1252out:
1253 ubifs_release_budget(c, &req);
1254 return err;
1255} 1249}
1256 1250
1257int ubifs_setattr(struct dentry *dentry, struct iattr *attr) 1251int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -1446,9 +1440,9 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vm
1446 1440
1447 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, 1441 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
1448 i_size_read(inode)); 1442 i_size_read(inode));
1449 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1443 ubifs_assert(!c->ro_media && !c->ro_mount);
1450 1444
1451 if (unlikely(c->ro_media)) 1445 if (unlikely(c->ro_error))
1452 return VM_FAULT_SIGBUS; /* -EROFS */ 1446 return VM_FAULT_SIGBUS; /* -EROFS */
1453 1447
1454 /* 1448 /*
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 918d1582ca0..151f1088282 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -125,10 +125,16 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
125 struct ubifs_scan_node *sa, *sb; 125 struct ubifs_scan_node *sa, *sb;
126 126
127 cond_resched(); 127 cond_resched();
128 if (a == b)
129 return 0;
130
128 sa = list_entry(a, struct ubifs_scan_node, list); 131 sa = list_entry(a, struct ubifs_scan_node, list);
129 sb = list_entry(b, struct ubifs_scan_node, list); 132 sb = list_entry(b, struct ubifs_scan_node, list);
133
130 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); 134 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
131 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); 135 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
136 ubifs_assert(sa->type == UBIFS_DATA_NODE);
137 ubifs_assert(sb->type == UBIFS_DATA_NODE);
132 138
133 inuma = key_inum(c, &sa->key); 139 inuma = key_inum(c, &sa->key);
134 inumb = key_inum(c, &sb->key); 140 inumb = key_inum(c, &sb->key);
@@ -157,28 +163,40 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
157 */ 163 */
158int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 164int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
159{ 165{
160 int typea, typeb;
161 ino_t inuma, inumb; 166 ino_t inuma, inumb;
162 struct ubifs_info *c = priv; 167 struct ubifs_info *c = priv;
163 struct ubifs_scan_node *sa, *sb; 168 struct ubifs_scan_node *sa, *sb;
164 169
165 cond_resched(); 170 cond_resched();
171 if (a == b)
172 return 0;
173
166 sa = list_entry(a, struct ubifs_scan_node, list); 174 sa = list_entry(a, struct ubifs_scan_node, list);
167 sb = list_entry(b, struct ubifs_scan_node, list); 175 sb = list_entry(b, struct ubifs_scan_node, list);
168 typea = key_type(c, &sa->key); 176
169 typeb = key_type(c, &sb->key); 177 ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY &&
170 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); 178 key_type(c, &sb->key) != UBIFS_DATA_KEY);
179 ubifs_assert(sa->type != UBIFS_DATA_NODE &&
180 sb->type != UBIFS_DATA_NODE);
171 181
172 /* Inodes go before directory entries */ 182 /* Inodes go before directory entries */
173 if (typea == UBIFS_INO_KEY) { 183 if (sa->type == UBIFS_INO_NODE) {
174 if (typeb == UBIFS_INO_KEY) 184 if (sb->type == UBIFS_INO_NODE)
175 return sb->len - sa->len; 185 return sb->len - sa->len;
176 return -1; 186 return -1;
177 } 187 }
178 if (typeb == UBIFS_INO_KEY) 188 if (sb->type == UBIFS_INO_NODE)
179 return 1; 189 return 1;
180 190
181 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); 191 ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY ||
192 key_type(c, &sa->key) == UBIFS_XENT_KEY);
193 ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY ||
194 key_type(c, &sb->key) == UBIFS_XENT_KEY);
195 ubifs_assert(sa->type == UBIFS_DENT_NODE ||
196 sa->type == UBIFS_XENT_NODE);
197 ubifs_assert(sb->type == UBIFS_DENT_NODE ||
198 sb->type == UBIFS_XENT_NODE);
199
182 inuma = key_inum(c, &sa->key); 200 inuma = key_inum(c, &sa->key);
183 inumb = key_inum(c, &sb->key); 201 inumb = key_inum(c, &sb->key);
184 202
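The new "if (a == b) return 0;" guards in both comparators are not paranoia: lib/list_sort.c deliberately invokes the comparator with both arguments pointing at the same element during long merges, precisely so that cmp() can call cond_resched(). Comparators therefore must tolerate a == b. Minimal sketch of the resulting shape (compare_keys is a hypothetical stand-in for the inum/block ordering above):

    static int example_nodes_cmp(void *priv, struct list_head *a,
                                 struct list_head *b)
    {
            struct ubifs_info *c = priv;
            struct ubifs_scan_node *sa, *sb;

            cond_resched();
            if (a == b)
                    return 0;   /* list_sort may pass the same node twice */

            sa = list_entry(a, struct ubifs_scan_node, list);
            sb = list_entry(b, struct ubifs_scan_node, list);
            return compare_keys(c, &sa->key, &sb->key);     /* hypothetical */
    }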
@@ -224,17 +242,33 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
224static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, 242static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
225 struct list_head *nondata, int *min) 243 struct list_head *nondata, int *min)
226{ 244{
245 int err;
227 struct ubifs_scan_node *snod, *tmp; 246 struct ubifs_scan_node *snod, *tmp;
228 247
229 *min = INT_MAX; 248 *min = INT_MAX;
230 249
231 /* Separate data nodes and non-data nodes */ 250 /* Separate data nodes and non-data nodes */
232 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 251 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
233 int err; 252 ubifs_assert(snod->type == UBIFS_INO_NODE ||
253 snod->type == UBIFS_DATA_NODE ||
254 snod->type == UBIFS_DENT_NODE ||
255 snod->type == UBIFS_XENT_NODE ||
256 snod->type == UBIFS_TRUN_NODE);
257
258 if (snod->type != UBIFS_INO_NODE &&
259 snod->type != UBIFS_DATA_NODE &&
260 snod->type != UBIFS_DENT_NODE &&
261 snod->type != UBIFS_XENT_NODE) {
262 /* Probably truncation node, zap it */
263 list_del(&snod->list);
264 kfree(snod);
265 continue;
266 }
234 267
235 ubifs_assert(snod->type != UBIFS_IDX_NODE); 268 ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY ||
236 ubifs_assert(snod->type != UBIFS_REF_NODE); 269 key_type(c, &snod->key) == UBIFS_INO_KEY ||
237 ubifs_assert(snod->type != UBIFS_CS_NODE); 270 key_type(c, &snod->key) == UBIFS_DENT_KEY ||
271 key_type(c, &snod->key) == UBIFS_XENT_KEY);
238 272
239 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 273 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
240 snod->offs, 0); 274 snod->offs, 0);
@@ -258,6 +292,13 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
258 /* Sort data and non-data nodes */ 292 /* Sort data and non-data nodes */
259 list_sort(c, &sleb->nodes, &data_nodes_cmp); 293 list_sort(c, &sleb->nodes, &data_nodes_cmp);
260 list_sort(c, nondata, &nondata_nodes_cmp); 294 list_sort(c, nondata, &nondata_nodes_cmp);
295
296 err = dbg_check_data_nodes_order(c, &sleb->nodes);
297 if (err)
298 return err;
299 err = dbg_check_nondata_nodes_order(c, nondata);
300 if (err)
301 return err;
261 return 0; 302 return 0;
262} 303}
263 304
@@ -575,13 +616,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
575 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 616 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
576 617
577 ubifs_assert_cmt_locked(c); 618 ubifs_assert_cmt_locked(c);
619 ubifs_assert(!c->ro_media && !c->ro_mount);
578 620
579 if (ubifs_gc_should_commit(c)) 621 if (ubifs_gc_should_commit(c))
580 return -EAGAIN; 622 return -EAGAIN;
581 623
582 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 624 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
583 625
584 if (c->ro_media) { 626 if (c->ro_error) {
585 ret = -EROFS; 627 ret = -EROFS;
586 goto out_unlock; 628 goto out_unlock;
587 } 629 }
@@ -677,14 +719,12 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
677 719
678 ret = ubifs_garbage_collect_leb(c, &lp); 720 ret = ubifs_garbage_collect_leb(c, &lp);
679 if (ret < 0) { 721 if (ret < 0) {
680 if (ret == -EAGAIN || ret == -ENOSPC) { 722 if (ret == -EAGAIN) {
681 /* 723 /*
 682 * These codes are not errors, so we have to 724 * This is not an error, so we have to return the
683 * return the LEB to lprops. But if the 725 * LEB to lprops. But if 'ubifs_return_leb()'
684 * 'ubifs_return_leb()' function fails, its 726 * fails, its failure code is propagated to the
685 * failure code is propagated to the caller 727 * caller instead of the original '-EAGAIN'.
686 * instead of the original '-EAGAIN' or
687 * '-ENOSPC'.
688 */ 728 */
689 err = ubifs_return_leb(c, lp.lnum); 729 err = ubifs_return_leb(c, lp.lnum);
690 if (err) 730 if (err)
@@ -774,8 +814,8 @@ out_unlock:
774out: 814out:
775 ubifs_assert(ret < 0); 815 ubifs_assert(ret < 0);
776 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); 816 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
777 ubifs_ro_mode(c, ret);
778 ubifs_wbuf_sync_nolock(wbuf); 817 ubifs_wbuf_sync_nolock(wbuf);
818 ubifs_ro_mode(c, ret);
779 mutex_unlock(&wbuf->io_mutex); 819 mutex_unlock(&wbuf->io_mutex);
780 ubifs_return_leb(c, lp.lnum); 820 ubifs_return_leb(c, lp.lnum);
781 return ret; 821 return ret;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index bcf5a16f30b..d82173182ee 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -61,8 +61,8 @@
61 */ 61 */
62void ubifs_ro_mode(struct ubifs_info *c, int err) 62void ubifs_ro_mode(struct ubifs_info *c, int err)
63{ 63{
64 if (!c->ro_media) { 64 if (!c->ro_error) {
65 c->ro_media = 1; 65 c->ro_error = 1;
66 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY; 67 c->vfs_sb->s_flags |= MS_RDONLY;
68 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
@@ -356,11 +356,11 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
356 356
357 dbg_io("LEB %d:%d, %d bytes, jhead %s", 357 dbg_io("LEB %d:%d, %d bytes, jhead %s",
358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); 358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
359 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
360 ubifs_assert(!(wbuf->avail & 7)); 359 ubifs_assert(!(wbuf->avail & 7));
361 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 360 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
361 ubifs_assert(!c->ro_media && !c->ro_mount);
362 362
363 if (c->ro_media) 363 if (c->ro_error)
364 return -EROFS; 364 return -EROFS;
365 365
366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); 366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
@@ -440,11 +440,12 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
440{ 440{
441 int err, i; 441 int err, i;
442 442
443 ubifs_assert(!c->ro_media && !c->ro_mount);
443 if (!c->need_wbuf_sync) 444 if (!c->need_wbuf_sync)
444 return 0; 445 return 0;
445 c->need_wbuf_sync = 0; 446 c->need_wbuf_sync = 0;
446 447
447 if (c->ro_media) { 448 if (c->ro_error) {
448 err = -EROFS; 449 err = -EROFS;
449 goto out_timers; 450 goto out_timers;
450 } 451 }
@@ -519,6 +520,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
519 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 520 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
520 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); 521 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
521 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 522 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
523 ubifs_assert(!c->ro_media && !c->ro_mount);
522 524
523 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 525 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
524 err = -ENOSPC; 526 err = -ENOSPC;
@@ -527,7 +529,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
527 529
528 cancel_wbuf_timer_nolock(wbuf); 530 cancel_wbuf_timer_nolock(wbuf);
529 531
530 if (c->ro_media) 532 if (c->ro_error)
531 return -EROFS; 533 return -EROFS;
532 534
533 if (aligned_len <= wbuf->avail) { 535 if (aligned_len <= wbuf->avail) {
@@ -663,8 +665,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
663 buf_len); 665 buf_len);
664 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 666 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
665 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); 667 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
668 ubifs_assert(!c->ro_media && !c->ro_mount);
666 669
667 if (c->ro_media) 670 if (c->ro_error)
668 return -EROFS; 671 return -EROFS;
669 672
670 ubifs_prepare_node(c, buf, len, 1); 673 ubifs_prepare_node(c, buf, len, 1);
@@ -815,7 +818,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
815 return 0; 818 return 0;
816 819
817out: 820out:
818 ubifs_err("bad node at LEB %d:%d", lnum, offs); 821 ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
822 ubi_is_mapped(c->ubi, lnum));
819 dbg_dump_node(c, buf); 823 dbg_dump_node(c, buf);
820 dbg_dump_stack(); 824 dbg_dump_stack();
821 return -EINVAL; 825 return -EINVAL;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d321baeca68..914f1bd89e5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -122,11 +122,12 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len)
122 * better to try to allocate space at the ends of eraseblocks. This is 122 * better to try to allocate space at the ends of eraseblocks. This is
123 * what the squeeze parameter does. 123 * what the squeeze parameter does.
124 */ 124 */
125 ubifs_assert(!c->ro_media && !c->ro_mount);
125 squeeze = (jhead == BASEHD); 126 squeeze = (jhead == BASEHD);
126again: 127again:
127 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 128 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
128 129
129 if (c->ro_media) { 130 if (c->ro_error) {
130 err = -EROFS; 131 err = -EROFS;
131 goto out_unlock; 132 goto out_unlock;
132 } 133 }
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 0f530c684f0..92a8491a8f8 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -306,6 +306,20 @@ static inline void trun_key_init(const struct ubifs_info *c,
306} 306}
307 307
308/** 308/**
309 * invalid_key_init - initialize invalid node key.
310 * @c: UBIFS file-system description object
311 * @key: key to initialize
312 *
313 * This is a helper function which marks a @key object as invalid.
314 */
315static inline void invalid_key_init(const struct ubifs_info *c,
316 union ubifs_key *key)
317{
318 key->u32[0] = 0xDEADBEAF;
319 key->u32[1] = UBIFS_INVALID_KEY;
320}
321
322/**
309 * key_type - get key type. 323 * key_type - get key type.
310 * @c: UBIFS file-system description object 324 * @c: UBIFS file-system description object
311 * @key: key to get type of 325 * @key: key to get type of
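invalid_key_init() poisons a key so that later lookups can assert it was never used; the scan.c hunk further down relies on it for node types that carry no key. A small user-space sketch of the idea (the two-word layout and the constants are simplified assumptions, not the on-flash key format):

#include <assert.h>
#include <stdint.h>

#define KEY_TYPES_CNT 4          /* assumed: valid key types are 0..3 */
#define INVALID_KEY   KEY_TYPES_CNT

union key { uint32_t u32[2]; };

static void invalid_key_init(union key *key)
{
	key->u32[0] = 0xDEADBEAF;   /* recognizable poison pattern */
	key->u32[1] = INVALID_KEY;  /* type outside the valid range */
}

static int key_type(const union key *key)
{
	return (int)key->u32[1];
}

int main(void)
{
	union key k;

	invalid_key_init(&k);
	/* A lookup path can now catch accidental use of an unkeyed node,
	 * mirroring the ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY)
	 * checks this series adds to the TNC lookup functions. */
	assert(key_type(&k) >= KEY_TYPES_CNT);
	return 0;
}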
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c345e125f42..4d0cb124146 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -159,7 +159,7 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
159 jhead = &c->jheads[bud->jhead]; 159 jhead = &c->jheads[bud->jhead];
160 list_add_tail(&bud->list, &jhead->buds_list); 160 list_add_tail(&bud->list, &jhead->buds_list);
161 } else 161 } else
162 ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY)); 162 ubifs_assert(c->replaying && c->ro_mount);
163 163
164 /* 164 /*
165 * Note, although this is a new bud, we anyway account this space now, 165 * Note, although this is a new bud, we anyway account this space now,
@@ -223,8 +223,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
223 } 223 }
224 224
225 mutex_lock(&c->log_mutex); 225 mutex_lock(&c->log_mutex);
226 226 ubifs_assert(!c->ro_media && !c->ro_mount);
227 if (c->ro_media) { 227 if (c->ro_error) {
228 err = -EROFS; 228 err = -EROFS;
229 goto out_unlock; 229 goto out_unlock;
230 } 230 }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ad7f67b827e..72775d35b99 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1363,6 +1363,7 @@ static int read_lsave(struct ubifs_info *c)
1363 goto out; 1363 goto out;
1364 for (i = 0; i < c->lsave_cnt; i++) { 1364 for (i = 0; i < c->lsave_cnt; i++) {
1365 int lnum = c->lsave[i]; 1365 int lnum = c->lsave[i];
1366 struct ubifs_lprops *lprops;
1366 1367
1367 /* 1368 /*
1368 * Due to automatic resizing, the values in the lsave table 1369 * Due to automatic resizing, the values in the lsave table
@@ -1370,7 +1371,11 @@ static int read_lsave(struct ubifs_info *c)
1370 */ 1371 */
1371 if (lnum >= c->leb_cnt) 1372 if (lnum >= c->leb_cnt)
1372 continue; 1373 continue;
1373 ubifs_lpt_lookup(c, lnum); 1374 lprops = ubifs_lpt_lookup(c, lnum);
1375 if (IS_ERR(lprops)) {
1376 err = PTR_ERR(lprops);
1377 goto out;
1378 }
1374 } 1379 }
1375out: 1380out:
1376 vfree(buf); 1381 vfree(buf);
@@ -1457,13 +1462,13 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
1457 shft -= UBIFS_LPT_FANOUT_SHIFT; 1462 shft -= UBIFS_LPT_FANOUT_SHIFT;
1458 nnode = ubifs_get_nnode(c, nnode, iip); 1463 nnode = ubifs_get_nnode(c, nnode, iip);
1459 if (IS_ERR(nnode)) 1464 if (IS_ERR(nnode))
1460 return ERR_PTR(PTR_ERR(nnode)); 1465 return ERR_CAST(nnode);
1461 } 1466 }
1462 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); 1467 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1463 shft -= UBIFS_LPT_FANOUT_SHIFT; 1468 shft -= UBIFS_LPT_FANOUT_SHIFT;
1464 pnode = ubifs_get_pnode(c, nnode, iip); 1469 pnode = ubifs_get_pnode(c, nnode, iip);
1465 if (IS_ERR(pnode)) 1470 if (IS_ERR(pnode))
1466 return ERR_PTR(PTR_ERR(pnode)); 1471 return ERR_CAST(pnode);
1467 iip = (i & (UBIFS_LPT_FANOUT - 1)); 1472 iip = (i & (UBIFS_LPT_FANOUT - 1));
1468 dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, 1473 dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
1469 pnode->lprops[iip].free, pnode->lprops[iip].dirty, 1474 pnode->lprops[iip].free, pnode->lprops[iip].dirty,
@@ -1586,7 +1591,7 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
1586 nnode = c->nroot; 1591 nnode = c->nroot;
1587 nnode = dirty_cow_nnode(c, nnode); 1592 nnode = dirty_cow_nnode(c, nnode);
1588 if (IS_ERR(nnode)) 1593 if (IS_ERR(nnode))
1589 return ERR_PTR(PTR_ERR(nnode)); 1594 return ERR_CAST(nnode);
1590 i = lnum - c->main_first; 1595 i = lnum - c->main_first;
1591 shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; 1596 shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
1592 for (h = 1; h < c->lpt_hght; h++) { 1597 for (h = 1; h < c->lpt_hght; h++) {
@@ -1594,19 +1599,19 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
1594 shft -= UBIFS_LPT_FANOUT_SHIFT; 1599 shft -= UBIFS_LPT_FANOUT_SHIFT;
1595 nnode = ubifs_get_nnode(c, nnode, iip); 1600 nnode = ubifs_get_nnode(c, nnode, iip);
1596 if (IS_ERR(nnode)) 1601 if (IS_ERR(nnode))
1597 return ERR_PTR(PTR_ERR(nnode)); 1602 return ERR_CAST(nnode);
1598 nnode = dirty_cow_nnode(c, nnode); 1603 nnode = dirty_cow_nnode(c, nnode);
1599 if (IS_ERR(nnode)) 1604 if (IS_ERR(nnode))
1600 return ERR_PTR(PTR_ERR(nnode)); 1605 return ERR_CAST(nnode);
1601 } 1606 }
1602 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); 1607 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1603 shft -= UBIFS_LPT_FANOUT_SHIFT; 1608 shft -= UBIFS_LPT_FANOUT_SHIFT;
1604 pnode = ubifs_get_pnode(c, nnode, iip); 1609 pnode = ubifs_get_pnode(c, nnode, iip);
1605 if (IS_ERR(pnode)) 1610 if (IS_ERR(pnode))
1606 return ERR_PTR(PTR_ERR(pnode)); 1611 return ERR_CAST(pnode);
1607 pnode = dirty_cow_pnode(c, pnode); 1612 pnode = dirty_cow_pnode(c, pnode);
1608 if (IS_ERR(pnode)) 1613 if (IS_ERR(pnode))
1609 return ERR_PTR(PTR_ERR(pnode)); 1614 return ERR_CAST(pnode);
1610 iip = (i & (UBIFS_LPT_FANOUT - 1)); 1615 iip = (i & (UBIFS_LPT_FANOUT - 1));
1611 dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, 1616 dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
1612 pnode->lprops[iip].free, pnode->lprops[iip].dirty, 1617 pnode->lprops[iip].free, pnode->lprops[iip].dirty,
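The repeated ERR_PTR(PTR_ERR(x)) to ERR_CAST(x) conversions in lpt.c are a pure idiom cleanup: both forms forward an already-encoded error pointer from one pointer type to another, ERR_CAST just does it without a round-trip through long. A stand-alone sketch of the pattern (the helpers are simplified re-implementations of the <linux/err.h> ones, for illustration only):

#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
/* ERR_CAST: forward an error pointer unchanged, only re-typed. */
static void *ERR_CAST(const void *ptr) { return (void *)ptr; }

struct nnode { int level; };
struct pnode { int leaf; };

static struct nnode good_nnode = { 1 };
static struct pnode good_pnode = { 0 };

static struct nnode *get_nnode(int fail)
{
	return fail ? (struct nnode *)ERR_PTR(-12 /* -ENOMEM */) : &good_nnode;
}

static struct pnode *lookup_pnode(int fail)
{
	struct nnode *n = get_nnode(fail);

	if (IS_ERR(n))
		return ERR_CAST(n); /* was: ERR_PTR(PTR_ERR(n)) */
	return &good_pnode;
}

int main(void)
{
	struct pnode *p = lookup_pnode(1);

	if (IS_ERR(p))
		printf("lookup failed: %ld\n", PTR_ERR(p));
	return 0;
}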
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 13cb7a4237b..5c90dec5db0 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -646,7 +646,7 @@ static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
646 shft -= UBIFS_LPT_FANOUT_SHIFT; 646 shft -= UBIFS_LPT_FANOUT_SHIFT;
647 nnode = ubifs_get_nnode(c, nnode, iip); 647 nnode = ubifs_get_nnode(c, nnode, iip);
648 if (IS_ERR(nnode)) 648 if (IS_ERR(nnode))
649 return ERR_PTR(PTR_ERR(nnode)); 649 return ERR_CAST(nnode);
650 } 650 }
651 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); 651 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
652 return ubifs_get_pnode(c, nnode, iip); 652 return ubifs_get_pnode(c, nnode, iip);
@@ -705,6 +705,9 @@ static int make_tree_dirty(struct ubifs_info *c)
705 struct ubifs_pnode *pnode; 705 struct ubifs_pnode *pnode;
706 706
707 pnode = pnode_lookup(c, 0); 707 pnode = pnode_lookup(c, 0);
708 if (IS_ERR(pnode))
709 return PTR_ERR(pnode);
710
708 while (pnode) { 711 while (pnode) {
709 do_make_pnode_dirty(c, pnode); 712 do_make_pnode_dirty(c, pnode);
710 pnode = next_pnode_to_dirty(c, pnode); 713 pnode = next_pnode_to_dirty(c, pnode);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 28beaeedadc..21f47afdacf 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -361,7 +361,8 @@ int ubifs_write_master(struct ubifs_info *c)
361{ 361{
362 int err, lnum, offs, len; 362 int err, lnum, offs, len;
363 363
364 if (c->ro_media) 364 ubifs_assert(!c->ro_media && !c->ro_mount);
365 if (c->ro_error)
365 return -EROFS; 366 return -EROFS;
366 367
367 lnum = UBIFS_MST_LNUM; 368 lnum = UBIFS_MST_LNUM;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4fa81d867e4..c3de04dc952 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -132,7 +132,8 @@ static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
132{ 132{
133 int err; 133 int err;
134 134
135 if (c->ro_media) 135 ubifs_assert(!c->ro_media && !c->ro_mount);
136 if (c->ro_error)
136 return -EROFS; 137 return -EROFS;
137 err = ubi_leb_unmap(c->ubi, lnum); 138 err = ubi_leb_unmap(c->ubi, lnum);
138 if (err) { 139 if (err) {
@@ -159,7 +160,8 @@ static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
159{ 160{
160 int err; 161 int err;
161 162
162 if (c->ro_media) 163 ubifs_assert(!c->ro_media && !c->ro_mount);
164 if (c->ro_error)
163 return -EROFS; 165 return -EROFS;
164 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); 166 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
165 if (err) { 167 if (err) {
@@ -186,7 +188,8 @@ static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
186{ 188{
187 int err; 189 int err;
188 190
189 if (c->ro_media) 191 ubifs_assert(!c->ro_media && !c->ro_mount);
192 if (c->ro_error)
190 return -EROFS; 193 return -EROFS;
191 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); 194 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
192 if (err) { 195 if (err) {
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 109c6ea03bb..77e9b874b6c 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -24,7 +24,7 @@
24 * This file implements functions needed to recover from unclean un-mounts. 24 * This file implements functions needed to recover from unclean un-mounts.
25 * When UBIFS is mounted, it checks a flag on the master node to determine if 25 * When UBIFS is mounted, it checks a flag on the master node to determine if
26 * an un-mount was completed successfully. If not, the process of mounting 26 * an un-mount was completed successfully. If not, the process of mounting
27 * incorparates additional checking and fixing of on-flash data structures. 27 * incorporates additional checking and fixing of on-flash data structures.
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that 28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted 29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
30 * read-only, and the flash is not modified in that case. 30 * read-only, and the flash is not modified in that case.
@@ -292,7 +292,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
292 292
293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); 293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
294 294
295 if ((c->vfs_sb->s_flags & MS_RDONLY)) { 295 if (c->ro_mount) {
296 /* Read-only mode. Keep a copy for switching to rw mode */ 296 /* Read-only mode. Keep a copy for switching to rw mode */
297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); 297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
298 if (!c->rcvrd_mst_node) { 298 if (!c->rcvrd_mst_node) {
@@ -469,7 +469,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
469 endpt = snod->offs + snod->len; 469 endpt = snod->offs + snod->len;
470 } 470 }
471 471
472 if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) { 472 if (c->ro_mount && !c->remounting_rw) {
473 /* Add to recovery list */ 473 /* Add to recovery list */
474 struct ubifs_unclean_leb *ucleb; 474 struct ubifs_unclean_leb *ucleb;
475 475
@@ -772,7 +772,8 @@ out_free:
772 * @sbuf: LEB-sized buffer to use 772 * @sbuf: LEB-sized buffer to use
773 * 773 *
774 * This function does a scan of a LEB, but caters for errors that might have 774 * This function does a scan of a LEB, but caters for errors that might have
775 * been caused by the unclean unmount from which we are attempting to recover. 775 * been caused by unclean reboots from which we are attempting to recover
776 * (assume that only the last log LEB can be corrupted by an unclean reboot).
776 * 777 *
777 * This function returns %0 on success and a negative error code on failure. 778 * This function returns %0 on success and a negative error code on failure.
778 */ 779 */
@@ -883,7 +884,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
883{ 884{
884 int err; 885 int err;
885 886
886 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); 887 ubifs_assert(!c->ro_mount || c->remounting_rw);
887 888
888 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); 889 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
889 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); 890 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
@@ -1063,8 +1064,21 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1063 } 1064 }
1064 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); 1065 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
1065 if (err) { 1066 if (err) {
1066 if (err == -ENOSPC) 1067 /*
1067 dbg_err("could not find a dirty LEB"); 1068 * There are no dirty or empty LEBs, subject to there being
1069 * enough for the index. Try to use
1070 * 'ubifs_find_free_leb_for_idx()', which will return any empty
1071 * LEBs (ignoring index requirements). If the index then
1072 * doesn't have enough LEBs the recovery commit will fail -
1073 * which is the same result anyway, i.e. recovery fails. So
1074 * there is no problem ignoring index requirements and just
1075 * grabbing a free LEB since we have already established there
1076 * is not a dirty LEB we could have used instead.
1077 */
1078 if (err == -ENOSPC) {
1079 dbg_rcvry("could not find a dirty LEB");
1080 goto find_free;
1081 }
1068 return err; 1082 return err;
1069 } 1083 }
1070 ubifs_assert(!(lp.flags & LPROPS_INDEX)); 1084 ubifs_assert(!(lp.flags & LPROPS_INDEX));
@@ -1139,8 +1153,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1139find_free: 1153find_free:
1140 /* 1154 /*
1141 * There is no GC head LEB or the free space in the GC head LEB is too 1155 * There is no GC head LEB or the free space in the GC head LEB is too
1142 * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so 1156 * small, or there are no dirty LEBs. Allocate gc_lnum by calling
1143 * GC is not run. 1157 * 'ubifs_find_free_leb_for_idx()' so GC is not run.
1144 */ 1158 */
1145 lnum = ubifs_find_free_leb_for_idx(c); 1159 lnum = ubifs_find_free_leb_for_idx(c);
1146 if (lnum < 0) { 1160 if (lnum < 0) {
@@ -1448,7 +1462,7 @@ int ubifs_recover_size(struct ubifs_info *c)
1448 } 1462 }
1449 } 1463 }
1450 if (e->exists && e->i_size < e->d_size) { 1464 if (e->exists && e->i_size < e->d_size) {
1451 if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) { 1465 if (!e->inode && c->ro_mount) {
1452 /* Fix the inode size and pin it in memory */ 1466 /* Fix the inode size and pin it in memory */
1453 struct inode *inode; 1467 struct inode *inode;
1454 1468
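The reworked error handling in ubifs_rcvry_gc_commit() turns -ENOSPC from ubifs_find_dirty_leb() into a jump to the find_free path instead of a hard failure, as the new comment explains. A compact sketch of that control flow (function names and return values are illustrative stand-ins, not the kernel helpers):

#include <stdio.h>

#define ENOSPC 28

static int find_dirty_leb(void)        { return -ENOSPC; } /* none left */
static int find_free_leb_for_idx(void) { return 17; }      /* an empty LEB */

static int rcvry_gc_commit(void)
{
	int err = find_dirty_leb();

	if (err) {
		if (err == -ENOSPC) {
			printf("no dirty LEB, grabbing a free one instead\n");
			goto find_free;
		}
		return err;            /* a real error, propagate it */
	}
	/* ... garbage-collect the dirty LEB into the GC head ... */
	return 0;

find_free:
	return find_free_leb_for_idx() < 0 ? -ENOSPC : 0;
}

int main(void)
{
	return rcvry_gc_commit() ? 1 : 0;
}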
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5c2d6d759a3..eed0fcff8d7 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -627,8 +627,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
627 ubifs_assert(sleb->endpt - offs >= used); 627 ubifs_assert(sleb->endpt - offs >= used);
628 ubifs_assert(sleb->endpt % c->min_io_size == 0); 628 ubifs_assert(sleb->endpt % c->min_io_size == 0);
629 629
630 if (sleb->endpt + c->min_io_size <= c->leb_size && 630 if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount)
631 !(c->vfs_sb->s_flags & MS_RDONLY))
632 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, 631 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
633 sleb->endpt, UBI_SHORTTERM); 632 sleb->endpt, UBI_SHORTTERM);
634 633
@@ -840,6 +839,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
840 if (IS_ERR(sleb)) { 839 if (IS_ERR(sleb)) {
841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) 840 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 return PTR_ERR(sleb); 841 return PTR_ERR(sleb);
842 /*
843 * Note, the below function will recover this log LEB only if
844 * it is the last, because unclean reboots can possibly corrupt
845 * only the tail of the log.
846 */
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 847 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
844 if (IS_ERR(sleb)) 848 if (IS_ERR(sleb))
845 return PTR_ERR(sleb); 849 return PTR_ERR(sleb);
@@ -851,7 +855,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
851 } 855 }
852 856
853 node = sleb->buf; 857 node = sleb->buf;
854
855 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); 858 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
856 if (c->cs_sqnum == 0) { 859 if (c->cs_sqnum == 0) {
857 /* 860 /*
@@ -898,7 +901,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
898 } 901 }
899 902
900 list_for_each_entry(snod, &sleb->nodes, list) { 903 list_for_each_entry(snod, &sleb->nodes, list) {
901
902 cond_resched(); 904 cond_resched();
903 905
904 if (snod->sqnum >= SQNUM_WATERMARK) { 906 if (snod->sqnum >= SQNUM_WATERMARK) {
@@ -1011,7 +1013,6 @@ out:
1011int ubifs_replay_journal(struct ubifs_info *c) 1013int ubifs_replay_journal(struct ubifs_info *c)
1012{ 1014{
1013 int err, i, lnum, offs, free; 1015 int err, i, lnum, offs, free;
1014 void *sbuf = NULL;
1015 1016
1016 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); 1017 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
1017 1018
@@ -1026,14 +1027,8 @@ int ubifs_replay_journal(struct ubifs_info *c)
1026 return -EINVAL; 1027 return -EINVAL;
1027 } 1028 }
1028 1029
1029 sbuf = vmalloc(c->leb_size);
1030 if (!sbuf)
1031 return -ENOMEM;
1032
1033 dbg_mnt("start replaying the journal"); 1030 dbg_mnt("start replaying the journal");
1034
1035 c->replaying = 1; 1031 c->replaying = 1;
1036
1037 lnum = c->ltail_lnum = c->lhead_lnum; 1032 lnum = c->ltail_lnum = c->lhead_lnum;
1038 offs = c->lhead_offs; 1033 offs = c->lhead_offs;
1039 1034
@@ -1046,7 +1041,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
1046 lnum = UBIFS_LOG_LNUM; 1041 lnum = UBIFS_LOG_LNUM;
1047 offs = 0; 1042 offs = 0;
1048 } 1043 }
1049 err = replay_log_leb(c, lnum, offs, sbuf); 1044 err = replay_log_leb(c, lnum, offs, c->sbuf);
1050 if (err == 1) 1045 if (err == 1)
1051 /* We hit the end of the log */ 1046 /* We hit the end of the log */
1052 break; 1047 break;
@@ -1079,7 +1074,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
1079out: 1074out:
1080 destroy_replay_tree(c); 1075 destroy_replay_tree(c);
1081 destroy_bud_list(c); 1076 destroy_bud_list(c);
1082 vfree(sbuf);
1083 c->replaying = 0; 1077 c->replaying = 0;
1084 return err; 1078 return err;
1085} 1079}
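The replay change above drops the per-call vmalloc()/vfree() of a LEB-sized buffer and reuses the long-lived c->sbuf that mount already sets up. A sketch of the resulting allocation pattern (the names and the 128 KiB size are assumptions for illustration):

#include <stdlib.h>

struct ctx { void *sbuf; size_t leb_size; };

/* After the change: replay scans each log LEB into the long-lived
 * c->sbuf rather than allocating a fresh LEB-sized buffer. */
static int replay_journal(struct ctx *c)
{
	/* ... read and replay log LEBs via c->sbuf ... */
	(void)c;
	return 0;
}

int main(void)
{
	struct ctx c = { .sbuf = NULL, .leb_size = 128 * 1024 };

	c.sbuf = malloc(c.leb_size);  /* one allocation, done at mount time */
	if (!c.sbuf)
		return 1;
	replay_journal(&c);           /* no per-replay vmalloc()/vfree() */
	free(c.sbuf);
	return 0;
}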
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 96cb62c8a9d..bf31b4729e5 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -542,11 +542,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
542 * due to the unavailability of time-travelling equipment. 542 * due to the unavailability of time-travelling equipment.
543 */ 543 */
544 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 544 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
545 struct super_block *sb = c->vfs_sb; 545 ubifs_assert(!c->ro_media || c->ro_mount);
546 int mounting_ro = sb->s_flags & MS_RDONLY; 546 if (!c->ro_mount ||
547
548 ubifs_assert(!c->ro_media || mounting_ro);
549 if (!mounting_ro ||
550 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { 547 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
551 ubifs_err("on-flash format version is w%d/r%d, but " 548 ubifs_err("on-flash format version is w%d/r%d, but "
552 "software only supports up to version " 549 "software only supports up to version "
@@ -624,7 +621,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
624 c->old_leb_cnt = c->leb_cnt; 621 c->old_leb_cnt = c->leb_cnt;
625 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { 622 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
626 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); 623 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
627 if (c->vfs_sb->s_flags & MS_RDONLY) 624 if (c->ro_mount)
628 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", 625 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
629 c->old_leb_cnt, c->leb_cnt); 626 c->old_leb_cnt, c->leb_cnt);
630 else { 627 else {
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 96c52538419..3e1ee57dbea 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -197,7 +197,7 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
197 struct ubifs_ino_node *ino = buf; 197 struct ubifs_ino_node *ino = buf;
198 struct ubifs_scan_node *snod; 198 struct ubifs_scan_node *snod;
199 199
200 snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); 200 snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
201 if (!snod) 201 if (!snod)
202 return -ENOMEM; 202 return -ENOMEM;
203 203
@@ -212,13 +212,15 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
212 case UBIFS_DENT_NODE: 212 case UBIFS_DENT_NODE:
213 case UBIFS_XENT_NODE: 213 case UBIFS_XENT_NODE:
214 case UBIFS_DATA_NODE: 214 case UBIFS_DATA_NODE:
215 case UBIFS_TRUN_NODE:
216 /* 215 /*
217 * The key is in the same place in all keyed 216 * The key is in the same place in all keyed
218 * nodes. 217 * nodes.
219 */ 218 */
220 key_read(c, &ino->key, &snod->key); 219 key_read(c, &ino->key, &snod->key);
221 break; 220 break;
221 default:
222 invalid_key_init(c, &snod->key);
223 break;
222 } 224 }
223 list_add_tail(&snod->list, &sleb->nodes); 225 list_add_tail(&snod->list, &sleb->nodes);
224 sleb->nodes_cnt += 1; 226 sleb->nodes_cnt += 1;
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 0b201114a5a..46961c00323 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -250,7 +250,7 @@ static int kick_a_thread(void)
250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); 250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
251 251
252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || 252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
253 c->ro_media) { 253 c->ro_mount || c->ro_error) {
254 mutex_unlock(&c->umount_mutex); 254 mutex_unlock(&c->umount_mutex);
255 continue; 255 continue;
256 } 256 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 4d2f2157dd3..91fac54c70e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -327,7 +327,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
327 return err; 327 return err;
328} 328}
329 329
330static void ubifs_delete_inode(struct inode *inode) 330static void ubifs_evict_inode(struct inode *inode)
331{ 331{
332 int err; 332 int err;
333 struct ubifs_info *c = inode->i_sb->s_fs_info; 333 struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -343,9 +343,12 @@ static void ubifs_delete_inode(struct inode *inode)
343 343
344 dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); 344 dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
345 ubifs_assert(!atomic_read(&inode->i_count)); 345 ubifs_assert(!atomic_read(&inode->i_count));
346 ubifs_assert(inode->i_nlink == 0);
347 346
348 truncate_inode_pages(&inode->i_data, 0); 347 truncate_inode_pages(&inode->i_data, 0);
348
349 if (inode->i_nlink)
350 goto done;
351
349 if (is_bad_inode(inode)) 352 if (is_bad_inode(inode))
350 goto out; 353 goto out;
351 354
@@ -367,7 +370,8 @@ out:
367 c->nospace = c->nospace_rp = 0; 370 c->nospace = c->nospace_rp = 0;
368 smp_wmb(); 371 smp_wmb();
369 } 372 }
370 clear_inode(inode); 373done:
374 end_writeback(inode);
371} 375}
372 376
373static void ubifs_dirty_inode(struct inode *inode) 377static void ubifs_dirty_inode(struct inode *inode)
@@ -1133,11 +1137,11 @@ static int check_free_space(struct ubifs_info *c)
1133 */ 1137 */
1134static int mount_ubifs(struct ubifs_info *c) 1138static int mount_ubifs(struct ubifs_info *c)
1135{ 1139{
1136 struct super_block *sb = c->vfs_sb; 1140 int err;
1137 int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
1138 long long x; 1141 long long x;
1139 size_t sz; 1142 size_t sz;
1140 1143
1144 c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
1141 err = init_constants_early(c); 1145 err = init_constants_early(c);
1142 if (err) 1146 if (err)
1143 return err; 1147 return err;
@@ -1150,7 +1154,7 @@ static int mount_ubifs(struct ubifs_info *c)
1150 if (err) 1154 if (err)
1151 goto out_free; 1155 goto out_free;
1152 1156
1153 if (c->empty && (mounted_read_only || c->ro_media)) { 1157 if (c->empty && (c->ro_mount || c->ro_media)) {
1154 /* 1158 /*
1155 * This UBI volume is empty, and read-only, or the file system 1159 * This UBI volume is empty, and read-only, or the file system
1156 * is mounted read-only - we cannot format it. 1160 * is mounted read-only - we cannot format it.
@@ -1161,7 +1165,7 @@ static int mount_ubifs(struct ubifs_info *c)
1161 goto out_free; 1165 goto out_free;
1162 } 1166 }
1163 1167
1164 if (c->ro_media && !mounted_read_only) { 1168 if (c->ro_media && !c->ro_mount) {
1165 ubifs_err("cannot mount read-write - read-only media"); 1169 ubifs_err("cannot mount read-write - read-only media");
1166 err = -EROFS; 1170 err = -EROFS;
1167 goto out_free; 1171 goto out_free;
@@ -1181,7 +1185,7 @@ static int mount_ubifs(struct ubifs_info *c)
1181 if (!c->sbuf) 1185 if (!c->sbuf)
1182 goto out_free; 1186 goto out_free;
1183 1187
1184 if (!mounted_read_only) { 1188 if (!c->ro_mount) {
1185 c->ileb_buf = vmalloc(c->leb_size); 1189 c->ileb_buf = vmalloc(c->leb_size);
1186 if (!c->ileb_buf) 1190 if (!c->ileb_buf)
1187 goto out_free; 1191 goto out_free;
@@ -1224,7 +1228,7 @@ static int mount_ubifs(struct ubifs_info *c)
1224 } 1228 }
1225 1229
1226 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); 1230 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
1227 if (!mounted_read_only) { 1231 if (!c->ro_mount) {
1228 err = alloc_wbufs(c); 1232 err = alloc_wbufs(c);
1229 if (err) 1233 if (err)
1230 goto out_cbuf; 1234 goto out_cbuf;
@@ -1250,12 +1254,12 @@ static int mount_ubifs(struct ubifs_info *c)
1250 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1254 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1251 ubifs_msg("recovery needed"); 1255 ubifs_msg("recovery needed");
1252 c->need_recovery = 1; 1256 c->need_recovery = 1;
1253 if (!mounted_read_only) { 1257 if (!c->ro_mount) {
1254 err = ubifs_recover_inl_heads(c, c->sbuf); 1258 err = ubifs_recover_inl_heads(c, c->sbuf);
1255 if (err) 1259 if (err)
1256 goto out_master; 1260 goto out_master;
1257 } 1261 }
1258 } else if (!mounted_read_only) { 1262 } else if (!c->ro_mount) {
1259 /* 1263 /*
1260 * Set the "dirty" flag so that if we reboot uncleanly we 1264 * Set the "dirty" flag so that if we reboot uncleanly we
1261 * will notice this immediately on the next mount. 1265 * will notice this immediately on the next mount.
@@ -1266,7 +1270,7 @@ static int mount_ubifs(struct ubifs_info *c)
1266 goto out_master; 1270 goto out_master;
1267 } 1271 }
1268 1272
1269 err = ubifs_lpt_init(c, 1, !mounted_read_only); 1273 err = ubifs_lpt_init(c, 1, !c->ro_mount);
1270 if (err) 1274 if (err)
1271 goto out_lpt; 1275 goto out_lpt;
1272 1276
@@ -1281,11 +1285,11 @@ static int mount_ubifs(struct ubifs_info *c)
1281 /* Calculate 'min_idx_lebs' after journal replay */ 1285 /* Calculate 'min_idx_lebs' after journal replay */
1282 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 1286 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1283 1287
1284 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); 1288 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
1285 if (err) 1289 if (err)
1286 goto out_orphans; 1290 goto out_orphans;
1287 1291
1288 if (!mounted_read_only) { 1292 if (!c->ro_mount) {
1289 int lnum; 1293 int lnum;
1290 1294
1291 err = check_free_space(c); 1295 err = check_free_space(c);
@@ -1307,6 +1311,8 @@ static int mount_ubifs(struct ubifs_info *c)
1307 if (err) 1311 if (err)
1308 goto out_orphans; 1312 goto out_orphans;
1309 err = ubifs_rcvry_gc_commit(c); 1313 err = ubifs_rcvry_gc_commit(c);
1314 if (err)
1315 goto out_orphans;
1310 } else { 1316 } else {
1311 err = take_gc_lnum(c); 1317 err = take_gc_lnum(c);
1312 if (err) 1318 if (err)
@@ -1318,7 +1324,7 @@ static int mount_ubifs(struct ubifs_info *c)
1318 */ 1324 */
1319 err = ubifs_leb_unmap(c, c->gc_lnum); 1325 err = ubifs_leb_unmap(c, c->gc_lnum);
1320 if (err) 1326 if (err)
1321 return err; 1327 goto out_orphans;
1322 } 1328 }
1323 1329
1324 err = dbg_check_lprops(c); 1330 err = dbg_check_lprops(c);
@@ -1345,7 +1351,7 @@ static int mount_ubifs(struct ubifs_info *c)
1345 spin_unlock(&ubifs_infos_lock); 1351 spin_unlock(&ubifs_infos_lock);
1346 1352
1347 if (c->need_recovery) { 1353 if (c->need_recovery) {
1348 if (mounted_read_only) 1354 if (c->ro_mount)
1349 ubifs_msg("recovery deferred"); 1355 ubifs_msg("recovery deferred");
1350 else { 1356 else {
1351 c->need_recovery = 0; 1357 c->need_recovery = 0;
@@ -1372,7 +1378,7 @@ static int mount_ubifs(struct ubifs_info *c)
1372 1378
1373 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1379 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1374 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1380 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
1375 if (mounted_read_only) 1381 if (c->ro_mount)
1376 ubifs_msg("mounted read-only"); 1382 ubifs_msg("mounted read-only");
1377 x = (long long)c->main_lebs * c->leb_size; 1383 x = (long long)c->main_lebs * c->leb_size;
1378 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " 1384 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d "
@@ -1634,7 +1640,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1634 } 1640 }
1635 1641
1636 dbg_gen("re-mounted read-write"); 1642 dbg_gen("re-mounted read-write");
1637 c->vfs_sb->s_flags &= ~MS_RDONLY; 1643 c->ro_mount = 0;
1638 c->remounting_rw = 0; 1644 c->remounting_rw = 0;
1639 c->always_chk_crc = 0; 1645 c->always_chk_crc = 0;
1640 err = dbg_check_space_info(c); 1646 err = dbg_check_space_info(c);
@@ -1670,7 +1676,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1670 int i, err; 1676 int i, err;
1671 1677
1672 ubifs_assert(!c->need_recovery); 1678 ubifs_assert(!c->need_recovery);
1673 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); 1679 ubifs_assert(!c->ro_mount);
1674 1680
1675 mutex_lock(&c->umount_mutex); 1681 mutex_lock(&c->umount_mutex);
1676 if (c->bgt) { 1682 if (c->bgt) {
@@ -1680,10 +1686,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1680 1686
1681 dbg_save_space_info(c); 1687 dbg_save_space_info(c);
1682 1688
1683 for (i = 0; i < c->jhead_cnt; i++) { 1689 for (i = 0; i < c->jhead_cnt; i++)
1684 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1690 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1685 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1686 }
1687 1691
1688 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1692 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1689 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1693 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
@@ -1698,6 +1702,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1698 vfree(c->ileb_buf); 1702 vfree(c->ileb_buf);
1699 c->ileb_buf = NULL; 1703 c->ileb_buf = NULL;
1700 ubifs_lpt_free(c, 1); 1704 ubifs_lpt_free(c, 1);
1705 c->ro_mount = 1;
1701 err = dbg_check_space_info(c); 1706 err = dbg_check_space_info(c);
1702 if (err) 1707 if (err)
1703 ubifs_ro_mode(c, err); 1708 ubifs_ro_mode(c, err);
@@ -1729,7 +1734,7 @@ static void ubifs_put_super(struct super_block *sb)
1729 * the mutex is locked. 1734 * the mutex is locked.
1730 */ 1735 */
1731 mutex_lock(&c->umount_mutex); 1736 mutex_lock(&c->umount_mutex);
1732 if (!(c->vfs_sb->s_flags & MS_RDONLY)) { 1737 if (!c->ro_mount) {
1733 /* 1738 /*
1734 * First of all kill the background thread to make sure it does 1739 * First of all kill the background thread to make sure it does
1735 * not interfere with un-mounting and freeing resources. 1740 * not interfere with un-mounting and freeing resources.
@@ -1739,23 +1744,22 @@ static void ubifs_put_super(struct super_block *sb)
1739 c->bgt = NULL; 1744 c->bgt = NULL;
1740 } 1745 }
1741 1746
1742 /* Synchronize write-buffers */
1743 if (c->jheads)
1744 for (i = 0; i < c->jhead_cnt; i++)
1745 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1746
1747 /* 1747 /*
1748 * On fatal errors c->ro_media is set to 1, in which case we do 1748 * On fatal errors c->ro_error is set to 1, in which case we do
1749 * not write the master node. 1749 * not write the master node.
1750 */ 1750 */
1751 if (!c->ro_media) { 1751 if (!c->ro_error) {
1752 int err;
1753
1754 /* Synchronize write-buffers */
1755 for (i = 0; i < c->jhead_cnt; i++)
1756 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1757
1752 /* 1758 /*
1753 * We are being cleanly unmounted which means the 1759 * We are being cleanly unmounted which means the
1754 * orphans were killed - indicate this in the master 1760 * orphans were killed - indicate this in the master
1755 * node. Also save the reserved GC LEB number. 1761 * node. Also save the reserved GC LEB number.
1756 */ 1762 */
1757 int err;
1758
1759 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1763 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1760 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1764 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1761 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); 1765 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
@@ -1768,6 +1772,10 @@ static void ubifs_put_super(struct super_block *sb)
1768 */ 1772 */
1769 ubifs_err("failed to write master node, " 1773 ubifs_err("failed to write master node, "
1770 "error %d", err); 1774 "error %d", err);
1775 } else {
1776 for (i = 0; i < c->jhead_cnt; i++)
1777 /* Make sure write-buffer timers are canceled */
1778 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1771 } 1779 }
1772 } 1780 }
1773 1781
@@ -1791,17 +1799,21 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1791 return err; 1799 return err;
1792 } 1800 }
1793 1801
1794 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1802 if (c->ro_mount && !(*flags & MS_RDONLY)) {
1803 if (c->ro_error) {
1804 ubifs_msg("cannot re-mount R/W due to prior errors");
1805 return -EROFS;
1806 }
1795 if (c->ro_media) { 1807 if (c->ro_media) {
1796 ubifs_msg("cannot re-mount due to prior errors"); 1808 ubifs_msg("cannot re-mount R/W - UBI volume is R/O");
1797 return -EROFS; 1809 return -EROFS;
1798 } 1810 }
1799 err = ubifs_remount_rw(c); 1811 err = ubifs_remount_rw(c);
1800 if (err) 1812 if (err)
1801 return err; 1813 return err;
1802 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1814 } else if (!c->ro_mount && (*flags & MS_RDONLY)) {
1803 if (c->ro_media) { 1815 if (c->ro_error) {
1804 ubifs_msg("cannot re-mount due to prior errors"); 1816 ubifs_msg("cannot re-mount R/O due to prior errors");
1805 return -EROFS; 1817 return -EROFS;
1806 } 1818 }
1807 ubifs_remount_ro(c); 1819 ubifs_remount_ro(c);
@@ -1824,7 +1836,7 @@ const struct super_operations ubifs_super_operations = {
1824 .destroy_inode = ubifs_destroy_inode, 1836 .destroy_inode = ubifs_destroy_inode,
1825 .put_super = ubifs_put_super, 1837 .put_super = ubifs_put_super,
1826 .write_inode = ubifs_write_inode, 1838 .write_inode = ubifs_write_inode,
1827 .delete_inode = ubifs_delete_inode, 1839 .evict_inode = ubifs_evict_inode,
1828 .statfs = ubifs_statfs, 1840 .statfs = ubifs_statfs,
1829 .dirty_inode = ubifs_dirty_inode, 1841 .dirty_inode = ubifs_dirty_inode,
1830 .remount_fs = ubifs_remount_fs, 1842 .remount_fs = ubifs_remount_fs,
@@ -2026,8 +2038,8 @@ static int sb_test(struct super_block *sb, void *data)
2026 return c->vi.cdev == *dev; 2038 return c->vi.cdev == *dev;
2027} 2039}
2028 2040
2029static int ubifs_get_sb(struct file_system_type *fs_type, int flags, 2041static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
2030 const char *name, void *data, struct vfsmount *mnt) 2042 const char *name, void *data)
2031{ 2043{
2032 struct ubi_volume_desc *ubi; 2044 struct ubi_volume_desc *ubi;
2033 struct ubi_volume_info vi; 2045 struct ubi_volume_info vi;
@@ -2043,9 +2055,9 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2043 */ 2055 */
2044 ubi = open_ubi(name, UBI_READONLY); 2056 ubi = open_ubi(name, UBI_READONLY);
2045 if (IS_ERR(ubi)) { 2057 if (IS_ERR(ubi)) {
2046 ubifs_err("cannot open \"%s\", error %d", 2058 dbg_err("cannot open \"%s\", error %d",
2047 name, (int)PTR_ERR(ubi)); 2059 name, (int)PTR_ERR(ubi));
2048 return PTR_ERR(ubi); 2060 return ERR_CAST(ubi);
2049 } 2061 }
2050 ubi_get_volume_info(ubi, &vi); 2062 ubi_get_volume_info(ubi, &vi);
2051 2063
@@ -2058,9 +2070,11 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2058 } 2070 }
2059 2071
2060 if (sb->s_root) { 2072 if (sb->s_root) {
2073 struct ubifs_info *c1 = sb->s_fs_info;
2074
2061 /* A new mount point for already mounted UBIFS */ 2075 /* A new mount point for already mounted UBIFS */
2062 dbg_gen("this ubi volume is already mounted"); 2076 dbg_gen("this ubi volume is already mounted");
2063 if ((flags ^ sb->s_flags) & MS_RDONLY) { 2077 if (!!(flags & MS_RDONLY) != c1->ro_mount) {
2064 err = -EBUSY; 2078 err = -EBUSY;
2065 goto out_deact; 2079 goto out_deact;
2066 } 2080 }
@@ -2081,20 +2095,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2081 /* 'fill_super()' opens ubi again so we must close it here */ 2095 /* 'fill_super()' opens ubi again so we must close it here */
2082 ubi_close_volume(ubi); 2096 ubi_close_volume(ubi);
2083 2097
2084 simple_set_mnt(mnt, sb); 2098 return dget(sb->s_root);
2085 return 0;
2086 2099
2087out_deact: 2100out_deact:
2088 deactivate_locked_super(sb); 2101 deactivate_locked_super(sb);
2089out_close: 2102out_close:
2090 ubi_close_volume(ubi); 2103 ubi_close_volume(ubi);
2091 return err; 2104 return ERR_PTR(err);
2092} 2105}
2093 2106
2094static struct file_system_type ubifs_fs_type = { 2107static struct file_system_type ubifs_fs_type = {
2095 .name = "ubifs", 2108 .name = "ubifs",
2096 .owner = THIS_MODULE, 2109 .owner = THIS_MODULE,
2097 .get_sb = ubifs_get_sb, 2110 .mount = ubifs_mount,
2098 .kill_sb = kill_anon_super, 2111 .kill_sb = kill_anon_super,
2099}; 2112};
2100 2113
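The delete_inode to evict_inode conversion in super.c follows the VFS change where a single callback handles both final deletion (i_nlink == 0) and plain cache eviction, and calls end_writeback() itself instead of relying on clear_inode(). A skeletal user-space sketch of the shape UBIFS adopts above (all types and helpers here are stand-ins, not the VFS API):

#include <stdio.h>

struct inode { unsigned int i_nlink; };

static void truncate_inode_pages(struct inode *inode) { (void)inode; }
static void end_writeback(struct inode *inode)        { (void)inode; }
static void delete_on_flash(struct inode *inode)
{
	(void)inode;
	printf("removing inode data from the medium\n");
}

static void evict_inode(struct inode *inode)
{
	truncate_inode_pages(inode);

	if (inode->i_nlink)
		goto done;           /* plain eviction: nothing to delete */

	delete_on_flash(inode);      /* the old delete_inode() work */
done:
	end_writeback(inode);        /* replaces the old clear_inode() */
}

int main(void)
{
	struct inode cached   = { .i_nlink = 1 };
	struct inode unlinked = { .i_nlink = 0 };

	evict_inode(&cached);    /* dropped from cache, data kept */
	evict_inode(&unlinked);  /* last link gone, data deleted */
	return 0;
}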
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2194915220e..ad9cf013362 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1177,6 +1177,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1177 unsigned long time = get_seconds(); 1177 unsigned long time = get_seconds();
1178 1178
1179 dbg_tnc("search key %s", DBGKEY(key)); 1179 dbg_tnc("search key %s", DBGKEY(key));
1180 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
1180 1181
1181 znode = c->zroot.znode; 1182 znode = c->zroot.znode;
1182 if (unlikely(!znode)) { 1183 if (unlikely(!znode)) {
@@ -2966,7 +2967,7 @@ static struct ubifs_znode *right_znode(struct ubifs_info *c,
2966 * 2967 *
2967 * This function searches an indexing node by its first key @key and its 2968 * This function searches an indexing node by its first key @key and its
2968 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing 2969 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
2969 * nodes it traverses to TNC. This function is called fro indexing nodes which 2970 * nodes it traverses to TNC. This function is called for indexing nodes which
2970 * were found on the media by scanning, for example when garbage-collecting or 2971 * were found on the media by scanning, for example when garbage-collecting or
2971 * when doing in-the-gaps commit. This means that the indexing node which is 2972 * when doing in-the-gaps commit. This means that the indexing node which is
2972 * looked for does not have to have exactly the same leftmost key @key, because 2973 * looked for does not have to have exactly the same leftmost key @key, because
@@ -2988,6 +2989,8 @@ static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
2988 struct ubifs_znode *znode, *zn; 2989 struct ubifs_znode *znode, *zn;
2989 int n, nn; 2990 int n, nn;
2990 2991
2992 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
2993
2991 /* 2994 /*
2992 * The arguments have probably been read off flash, so don't assume 2995 * The arguments have probably been read off flash, so don't assume
2993 * they are valid. 2996 * they are valid.
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 04310878f44..381d6b207a5 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -119,8 +119,12 @@
119 * in TNC. However, when replaying, it is handy to introduce fake "truncation" 119 * in TNC. However, when replaying, it is handy to introduce fake "truncation"
120 * keys for truncation nodes because the code becomes simpler. So we define 120 * keys for truncation nodes because the code becomes simpler. So we define
121 * %UBIFS_TRUN_KEY type. 121 * %UBIFS_TRUN_KEY type.
122 *
123 * But otherwise, outside the journal replay scope, the truncation keys are
124 * invalid.
122 */ 125 */
123#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT 126#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
127#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT
124 128
125/* 129/*
126 * How much a directory entry/extended attribute entry adds to the parent/host 130 * How much a directory entry/extended attribute entry adds to the parent/host
@@ -379,7 +383,7 @@ struct ubifs_gced_idx_leb {
379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 383 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 384 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
381 * make sure @inode->i_size is always changed under @ui_mutex, because it 385 * make sure @inode->i_size is always changed under @ui_mutex, because it
382 * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock 386 * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock
383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 387 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 388 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
385 * could consider to rework locking and base it on "shadow" fields. 389 * could consider to rework locking and base it on "shadow" fields.
@@ -1028,6 +1032,8 @@ struct ubifs_debug_info;
1028 * @max_leb_cnt: maximum count of logical eraseblocks 1032 * @max_leb_cnt: maximum count of logical eraseblocks
1029 * @old_leb_cnt: count of logical eraseblocks before re-size 1033 * @old_leb_cnt: count of logical eraseblocks before re-size
1030 * @ro_media: the underlying UBI volume is read-only 1034 * @ro_media: the underlying UBI volume is read-only
1035 * @ro_mount: the file-system was mounted as read-only
1036 * @ro_error: UBIFS switched to R/O mode because an error happened
1031 * 1037 *
1032 * @dirty_pg_cnt: number of dirty pages (not used) 1038 * @dirty_pg_cnt: number of dirty pages (not used)
1033 * @dirty_zn_cnt: number of dirty znodes 1039 * @dirty_zn_cnt: number of dirty znodes
@@ -1168,11 +1174,14 @@ struct ubifs_debug_info;
1168 * @replay_sqnum: sequence number of node currently being replayed 1174 * @replay_sqnum: sequence number of node currently being replayed
1169 * @need_recovery: file-system needs recovery 1175 * @need_recovery: file-system needs recovery
1170 * @replaying: set to %1 during journal replay 1176 * @replaying: set to %1 during journal replay
1171 * @unclean_leb_list: LEBs to recover when mounting ro to rw 1177 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
1172 * @rcvrd_mst_node: recovered master node to write when mounting ro to rw 1178 * mode
1179 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
1180 * FS to R/W mode
1173 * @size_tree: inode size information for recovery 1181 * @size_tree: inode size information for recovery
1174 * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) 1182 * @remounting_rw: set while re-mounting from R/O mode to R/W mode
1175 * @always_chk_crc: always check CRCs (while mounting and remounting rw) 1183 * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
1184 * mode)
1176 * @mount_opts: UBIFS-specific mount options 1185 * @mount_opts: UBIFS-specific mount options
1177 * 1186 *
1178 * @dbg: debugging-related information 1187 * @dbg: debugging-related information
@@ -1268,7 +1277,9 @@ struct ubifs_info {
1268 int leb_cnt; 1277 int leb_cnt;
1269 int max_leb_cnt; 1278 int max_leb_cnt;
1270 int old_leb_cnt; 1279 int old_leb_cnt;
1271 int ro_media; 1280 unsigned int ro_media:1;
1281 unsigned int ro_mount:1;
1282 unsigned int ro_error:1;
1272 1283
1273 atomic_long_t dirty_pg_cnt; 1284 atomic_long_t dirty_pg_cnt;
1274 atomic_long_t dirty_zn_cnt; 1285 atomic_long_t dirty_zn_cnt;
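The ubifs_info change shrinks ro_media from a full int to a 1-bit field and adds ro_mount and ro_error beside it, so the three read-only conditions live together at no extra cost. A tiny sketch of the layout tradeoff (exact sizes depend on the ABI):

#include <stdio.h>

struct flags_old {
	int ro_media;            /* one condition, a whole int */
};

struct flags_new {
	unsigned int ro_media:1; /* the UBI volume is R/O */
	unsigned int ro_mount:1; /* mounted with MS_RDONLY */
	unsigned int ro_error:1; /* forced R/O after a fatal error */
};

int main(void)
{
	/* Three distinct conditions now occupy the space one used to. */
	printf("old: %zu bytes, new: %zu bytes\n",
	       sizeof(struct flags_old), sizeof(struct flags_new));
	return 0;
}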
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bc..f8def3c8ea4 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,5 +1,6 @@
1config UDF_FS 1config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 depends on BKL # needs serious work to remove
3 select CRC_ITU_T 4 select CRC_ITU_T
4 help 5 help
5 This is the new file system used on some CD-ROMs and DVDs. Say Y if 6 This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 94e06d6bddb..66b9e7e7e4c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -36,7 +36,6 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/aio.h> 38#include <linux/aio.h>
39#include <linux/smp_lock.h>
40 39
41#include "udf_i.h" 40#include "udf_i.h"
42#include "udf_sb.h" 41#include "udf_sb.h"
@@ -228,6 +227,28 @@ const struct file_operations udf_file_operations = {
228 .llseek = generic_file_llseek, 227 .llseek = generic_file_llseek,
229}; 228};
230 229
230static int udf_setattr(struct dentry *dentry, struct iattr *attr)
231{
232 struct inode *inode = dentry->d_inode;
233 int error;
234
235 error = inode_change_ok(inode, attr);
236 if (error)
237 return error;
238
239 if ((attr->ia_valid & ATTR_SIZE) &&
240 attr->ia_size != i_size_read(inode)) {
241 error = vmtruncate(inode, attr->ia_size);
242 if (error)
243 return error;
244 }
245
246 setattr_copy(inode, attr);
247 mark_inode_dirty(inode);
248 return 0;
249}
250
231const struct inode_operations udf_file_inode_operations = { 251const struct inode_operations udf_file_inode_operations = {
252 .setattr = udf_setattr,
232 .truncate = udf_truncate, 253 .truncate = udf_truncate,
233}; 254};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 18cd7111185..75d9304d0dc 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -31,8 +31,6 @@ void udf_free_inode(struct inode *inode)
31 struct super_block *sb = inode->i_sb; 31 struct super_block *sb = inode->i_sb;
32 struct udf_sb_info *sbi = UDF_SB(sb); 32 struct udf_sb_info *sbi = UDF_SB(sb);
33 33
34 clear_inode(inode);
35
36 mutex_lock(&sbi->s_alloc_mutex); 34 mutex_lock(&sbi->s_alloc_mutex);
37 if (sbi->s_lvid_bh) { 35 if (sbi->s_lvid_bh) {
38 struct logicalVolIntegrityDescImpUse *lvidiu = 36 struct logicalVolIntegrityDescImpUse *lvidiu =
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 124852bcf6f..fc48f37aa2d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -68,37 +68,23 @@ static void udf_update_extents(struct inode *,
68static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); 68static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
69 69
70 70
71void udf_delete_inode(struct inode *inode) 71void udf_evict_inode(struct inode *inode)
72{
73 truncate_inode_pages(&inode->i_data, 0);
74
75 if (is_bad_inode(inode))
76 goto no_delete;
77
78 inode->i_size = 0;
79 udf_truncate(inode);
80 lock_kernel();
81
82 udf_update_inode(inode, IS_SYNC(inode));
83 udf_free_inode(inode);
84
85 unlock_kernel();
86 return;
87
88no_delete:
89 clear_inode(inode);
90}
91
92/*
93 * If we are going to release inode from memory, we truncate last inode extent
94 * to proper length. We could use drop_inode() but it's called under inode_lock
95 * and thus we cannot mark inode dirty there. We use clear_inode() but we have
96 * to make sure to write inode as it's not written automatically.
97 */
98void udf_clear_inode(struct inode *inode)
99{ 72{
100 struct udf_inode_info *iinfo = UDF_I(inode); 73 struct udf_inode_info *iinfo = UDF_I(inode);
74 int want_delete = 0;
75
76 truncate_inode_pages(&inode->i_data, 0);
101 77
78 if (!inode->i_nlink && !is_bad_inode(inode)) {
79 want_delete = 1;
80 inode->i_size = 0;
81 udf_truncate(inode);
82 lock_kernel();
83 udf_update_inode(inode, IS_SYNC(inode));
84 unlock_kernel();
85 }
86 invalidate_inode_buffers(inode);
87 end_writeback(inode);
102 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 88 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
103 inode->i_size != iinfo->i_lenExtents) { 89 inode->i_size != iinfo->i_lenExtents) {
104 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " 90 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
@@ -108,9 +94,13 @@ void udf_clear_inode(struct inode *inode)
108 (unsigned long long)inode->i_size, 94 (unsigned long long)inode->i_size,
109 (unsigned long long)iinfo->i_lenExtents); 95 (unsigned long long)iinfo->i_lenExtents);
110 } 96 }
111
112 kfree(iinfo->i_ext.i_data); 97 kfree(iinfo->i_ext.i_data);
113 iinfo->i_ext.i_data = NULL; 98 iinfo->i_ext.i_data = NULL;
99 if (want_delete) {
100 lock_kernel();
101 udf_free_inode(inode);
102 unlock_kernel();
103 }
114} 104}
115 105
116static int udf_writepage(struct page *page, struct writeback_control *wbc) 106static int udf_writepage(struct page *page, struct writeback_control *wbc)
@@ -127,9 +117,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
127 loff_t pos, unsigned len, unsigned flags, 117 loff_t pos, unsigned len, unsigned flags,
128 struct page **pagep, void **fsdata) 118 struct page **pagep, void **fsdata)
129{ 119{
130 *pagep = NULL; 120 int ret;
131 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 121
132 udf_get_block); 122 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
123 if (unlikely(ret)) {
124 loff_t isize = mapping->host->i_size;
125 if (pos + len > isize)
126 vmtruncate(mapping->host, isize);
127 }
128
129 return ret;
133} 130}
134 131
135static sector_t udf_bmap(struct address_space *mapping, sector_t block) 132static sector_t udf_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193..6d8dc02baeb 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1101 inc_nlink(inode); 1101 inc_nlink(inode);
1102 inode->i_ctime = current_fs_time(inode->i_sb); 1102 inode->i_ctime = current_fs_time(inode->i_sb);
1103 mark_inode_dirty(inode); 1103 mark_inode_dirty(inode);
1104 atomic_inc(&inode->i_count); 1104 ihold(inode);
1105 d_instantiate(dentry, inode); 1105 d_instantiate(dentry, inode);
1106 unlock_kernel(); 1106 unlock_kernel();
1107 1107
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 612d1e2e285..4a5c7c61836 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -107,17 +107,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
107} 107}
108 108
109/* UDF filesystem type */ 109/* UDF filesystem type */
110static int udf_get_sb(struct file_system_type *fs_type, 110static struct dentry *udf_mount(struct file_system_type *fs_type,
111 int flags, const char *dev_name, void *data, 111 int flags, const char *dev_name, void *data)
112 struct vfsmount *mnt)
113{ 112{
114 return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt); 113 return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
115} 114}
116 115
117static struct file_system_type udf_fstype = { 116static struct file_system_type udf_fstype = {
118 .owner = THIS_MODULE, 117 .owner = THIS_MODULE,
119 .name = "udf", 118 .name = "udf",
120 .get_sb = udf_get_sb, 119 .mount = udf_mount,
121 .kill_sb = kill_block_super, 120 .kill_sb = kill_block_super,
122 .fs_flags = FS_REQUIRES_DEV, 121 .fs_flags = FS_REQUIRES_DEV,
123}; 122};
@@ -175,8 +174,7 @@ static const struct super_operations udf_sb_ops = {
175 .alloc_inode = udf_alloc_inode, 174 .alloc_inode = udf_alloc_inode,
176 .destroy_inode = udf_destroy_inode, 175 .destroy_inode = udf_destroy_inode,
177 .write_inode = udf_write_inode, 176 .write_inode = udf_write_inode,
178 .delete_inode = udf_delete_inode, 177 .evict_inode = udf_evict_inode,
179 .clear_inode = udf_clear_inode,
180 .put_super = udf_put_super, 178 .put_super = udf_put_super,
181 .sync_fs = udf_sync_fs, 179 .sync_fs = udf_sync_fs,
182 .statfs = udf_statfs, 180 .statfs = udf_statfs,
@@ -1579,9 +1577,7 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1579{ 1577{
1580 struct anchorVolDescPtr *anchor; 1578 struct anchorVolDescPtr *anchor;
1581 long main_s, main_e, reserve_s, reserve_e; 1579 long main_s, main_e, reserve_s, reserve_e;
1582 struct udf_sb_info *sbi;
1583 1580
1584 sbi = UDF_SB(sb);
1585 anchor = (struct anchorVolDescPtr *)bh->b_data; 1581 anchor = (struct anchorVolDescPtr *)bh->b_data;
1586 1582
1587 /* Locate the main sequence */ 1583 /* Locate the main sequence */
@@ -1883,6 +1879,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1883 struct kernel_lb_addr rootdir, fileset; 1879 struct kernel_lb_addr rootdir, fileset;
1884 struct udf_sb_info *sbi; 1880 struct udf_sb_info *sbi;
1885 1881
1882 lock_kernel();
1883
1886 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1884 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1887 uopt.uid = -1; 1885 uopt.uid = -1;
1888 uopt.gid = -1; 1886 uopt.gid = -1;
@@ -1891,8 +1889,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1891 uopt.dmode = UDF_INVALID_MODE; 1889 uopt.dmode = UDF_INVALID_MODE;
1892 1890
1893 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1891 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1894 if (!sbi) 1892 if (!sbi) {
1893 unlock_kernel();
1895 return -ENOMEM; 1894 return -ENOMEM;
1895 }
1896 1896
1897 sb->s_fs_info = sbi; 1897 sb->s_fs_info = sbi;
1898 1898
@@ -2038,6 +2038,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2038 goto error_out; 2038 goto error_out;
2039 } 2039 }
2040 sb->s_maxbytes = MAX_LFS_FILESIZE; 2040 sb->s_maxbytes = MAX_LFS_FILESIZE;
2041 unlock_kernel();
2041 return 0; 2042 return 0;
2042 2043
2043error_out: 2044error_out:
@@ -2058,6 +2059,7 @@ error_out:
2058 kfree(sbi); 2059 kfree(sbi);
2059 sb->s_fs_info = NULL; 2060 sb->s_fs_info = NULL;
2060 2061
2062 unlock_kernel();
2061 return -EINVAL; 2063 return -EINVAL;
2062} 2064}
2063 2065
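The .get_sb to .mount conversion is mechanical: the new hook returns the root dentry rather than filling in a vfsmount, and mount_bdev() replaces get_sb_bdev() with the trailing vfsmount argument dropped. A sketch of the resulting shape for a hypothetical block-device filesystem (fill_super assumed to exist):

    #include <linux/fs.h>
    #include <linux/module.h>

    extern int myfs_fill_super(struct super_block *sb, void *data, int silent);

    static struct dentry *myfs_mount(struct file_system_type *fs_type,
                                     int flags, const char *dev_name, void *data)
    {
            return mount_bdev(fs_type, flags, dev_name, data, myfs_fill_super);
    }

    static struct file_system_type myfs_fs_type = {
            .owner          = THIS_MODULE,
            .name           = "myfs",
            .mount          = myfs_mount,
            .kill_sb        = kill_block_super,
            .fs_flags       = FS_REQUIRES_DEV,
    };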
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 2bac0354891..6995ab1f430 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -139,8 +139,7 @@ extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
139extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 139extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
140extern void udf_truncate(struct inode *); 140extern void udf_truncate(struct inode *);
141extern void udf_read_inode(struct inode *); 141extern void udf_read_inode(struct inode *);
142extern void udf_delete_inode(struct inode *); 142extern void udf_evict_inode(struct inode *);
143extern void udf_clear_inode(struct inode *);
144extern int udf_write_inode(struct inode *, struct writeback_control *wbc); 143extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
145extern long udf_block_map(struct inode *, sector_t); 144extern long udf_block_map(struct inode *, sector_t);
146extern int udf_extend_file(struct inode *, struct extent_position *, 145extern int udf_extend_file(struct inode *, struct extent_position *,
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768..30c8f223253 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
4 help 5 help
5 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD, 6 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
6 OpenBSD and NeXTstep) use a file system called UFS. Some System V 7 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 048484fb10d..46f7a807bbc 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -114,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
114 114
115 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 115 ubh_mark_buffer_dirty (USPI_UBH(uspi));
116 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 116 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
117 if (sb->s_flags & MS_SYNCHRONOUS) { 117 if (sb->s_flags & MS_SYNCHRONOUS)
118 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 118 ubh_sync_block(UCPI_UBH(ucpi));
119 ubh_wait_on_buffer (UCPI_UBH(ucpi));
120 }
121 sb->s_dirt = 1; 119 sb->s_dirt = 1;
122 120
123 unlock_super (sb); 121 unlock_super (sb);
@@ -207,10 +205,8 @@ do_more:
207 205
208 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 206 ubh_mark_buffer_dirty (USPI_UBH(uspi));
209 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 207 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
210 if (sb->s_flags & MS_SYNCHRONOUS) { 208 if (sb->s_flags & MS_SYNCHRONOUS)
211 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 209 ubh_sync_block(UCPI_UBH(ucpi));
212 ubh_wait_on_buffer (UCPI_UBH(ucpi));
213 }
214 210
215 if (overflow) { 211 if (overflow) {
216 fragment += count; 212 fragment += count;
@@ -558,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
558 554
559 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 555 ubh_mark_buffer_dirty (USPI_UBH(uspi));
560 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 556 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
561 if (sb->s_flags & MS_SYNCHRONOUS) { 557 if (sb->s_flags & MS_SYNCHRONOUS)
562 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 558 ubh_sync_block(UCPI_UBH(ucpi));
563 ubh_wait_on_buffer (UCPI_UBH(ucpi));
564 }
565 sb->s_dirt = 1; 559 sb->s_dirt = 1;
566 560
567 UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment); 561 UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
@@ -680,10 +674,8 @@ cg_found:
680succed: 674succed:
681 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 675 ubh_mark_buffer_dirty (USPI_UBH(uspi));
682 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 676 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
683 if (sb->s_flags & MS_SYNCHRONOUS) { 677 if (sb->s_flags & MS_SYNCHRONOUS)
684 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 678 ubh_sync_block(UCPI_UBH(ucpi));
685 ubh_wait_on_buffer (UCPI_UBH(ucpi));
686 }
687 sb->s_dirt = 1; 679 sb->s_dirt = 1;
688 680
689 result += cgno * uspi->s_fpg; 681 result += cgno * uspi->s_fpg;
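Each MS_SYNCHRONOUS call site in UFS shrinks the same way: the SWRITE submit followed by an explicit wait collapses into ubh_sync_block(), implemented further down in fs/ufs/util.c with write_dirty_buffer() plus wait_on_buffer(). A self-contained sketch of that submit-then-wait shape over a plain buffer_head array (my_sync_buffers is illustrative, not a kernel API):

    #include <linux/fs.h>
    #include <linux/buffer_head.h>

    /* submit every dirty buffer first, then wait for all of them, so the
     * individual writes stay asynchronous with respect to each other */
    static void my_sync_buffers(struct buffer_head **bhs, unsigned count)
    {
            unsigned i;

            for (i = 0; i < count; i++)
                    write_dirty_buffer(bhs[i], WRITE);      /* no-op if clean */

            for (i = 0; i < count; i++)
                    wait_on_buffer(bhs[i]);
    }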
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index ec784756dc6..dbc90994715 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -95,8 +95,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
95 int err; 95 int err;
96 96
97 lock_page(page); 97 lock_page(page);
98 err = __ufs_write_begin(NULL, page->mapping, pos, len, 98 err = ufs_prepare_chunk(page, pos, len);
99 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
100 BUG_ON(err); 99 BUG_ON(err);
101 100
102 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); 101 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
@@ -381,8 +380,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
381got_it: 380got_it:
382 pos = page_offset(page) + 381 pos = page_offset(page) +
383 (char*)de - (char*)page_address(page); 382 (char*)de - (char*)page_address(page);
384 err = __ufs_write_begin(NULL, page->mapping, pos, rec_len, 383 err = ufs_prepare_chunk(page, pos, rec_len);
385 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
386 if (err) 384 if (err)
387 goto out_unlock; 385 goto out_unlock;
388 if (de->d_ino) { 386 if (de->d_ino) {
@@ -518,7 +516,6 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
518 struct page * page) 516 struct page * page)
519{ 517{
520 struct super_block *sb = inode->i_sb; 518 struct super_block *sb = inode->i_sb;
521 struct address_space *mapping = page->mapping;
522 char *kaddr = page_address(page); 519 char *kaddr = page_address(page);
523 unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); 520 unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
524 unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); 521 unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
@@ -549,8 +546,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
549 546
550 pos = page_offset(page) + from; 547 pos = page_offset(page) + from;
551 lock_page(page); 548 lock_page(page);
552 err = __ufs_write_begin(NULL, mapping, pos, to - from, 549 err = ufs_prepare_chunk(page, pos, to - from);
553 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
554 BUG_ON(err); 550 BUG_ON(err);
555 if (pde) 551 if (pde)
556 pde->d_reclen = cpu_to_fs16(sb, to - from); 552 pde->d_reclen = cpu_to_fs16(sb, to - from);
@@ -577,8 +573,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
577 if (!page) 573 if (!page)
578 return -ENOMEM; 574 return -ENOMEM;
579 575
580 err = __ufs_write_begin(NULL, mapping, 0, chunk_size, 576 err = ufs_prepare_chunk(page, 0, chunk_size);
581 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
582 if (err) { 577 if (err) {
583 unlock_page(page); 578 unlock_page(page);
584 goto fail; 579 goto fail;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 594480e537d..2eabf04af3d 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -94,8 +94,6 @@ void ufs_free_inode (struct inode * inode)
94 94
95 is_directory = S_ISDIR(inode->i_mode); 95 is_directory = S_ISDIR(inode->i_mode);
96 96
97 clear_inode (inode);
98
99 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) 97 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
100 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino); 98 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
101 else { 99 else {
@@ -115,10 +113,8 @@ void ufs_free_inode (struct inode * inode)
115 113
116 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 114 ubh_mark_buffer_dirty (USPI_UBH(uspi));
117 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 115 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
118 if (sb->s_flags & MS_SYNCHRONOUS) { 116 if (sb->s_flags & MS_SYNCHRONOUS)
119 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 117 ubh_sync_block(UCPI_UBH(ucpi));
120 ubh_wait_on_buffer (UCPI_UBH(ucpi));
121 }
122 118
123 sb->s_dirt = 1; 119 sb->s_dirt = 1;
124 unlock_super (sb); 120 unlock_super (sb);
@@ -158,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
158 154
159 fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); 155 fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
160 ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); 156 ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
161 if (sb->s_flags & MS_SYNCHRONOUS) { 157 if (sb->s_flags & MS_SYNCHRONOUS)
162 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 158 ubh_sync_block(UCPI_UBH(ucpi));
163 ubh_wait_on_buffer(UCPI_UBH(ucpi));
164 }
165 159
166 UFSD("EXIT\n"); 160 UFSD("EXIT\n");
167} 161}
@@ -292,10 +286,8 @@ cg_found:
292 } 286 }
293 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 287 ubh_mark_buffer_dirty (USPI_UBH(uspi));
294 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 288 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
295 if (sb->s_flags & MS_SYNCHRONOUS) { 289 if (sb->s_flags & MS_SYNCHRONOUS)
296 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 290 ubh_sync_block(UCPI_UBH(ucpi));
297 ubh_wait_on_buffer (UCPI_UBH(ucpi));
298 }
299 sb->s_dirt = 1; 291 sb->s_dirt = 1;
300 292
301 inode->i_ino = cg * uspi->s_ipg + bit; 293 inode->i_ino = cg * uspi->s_ipg + bit;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 73fe773aa03..2b251f2093a 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -558,20 +558,26 @@ static int ufs_readpage(struct file *file, struct page *page)
558 return block_read_full_page(page,ufs_getfrag_block); 558 return block_read_full_page(page,ufs_getfrag_block);
559} 559}
560 560
561int __ufs_write_begin(struct file *file, struct address_space *mapping, 561int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
562 loff_t pos, unsigned len, unsigned flags,
563 struct page **pagep, void **fsdata)
564{ 562{
565 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 563 return __block_write_begin(page, pos, len, ufs_getfrag_block);
566 ufs_getfrag_block);
567} 564}
568 565
569static int ufs_write_begin(struct file *file, struct address_space *mapping, 566static int ufs_write_begin(struct file *file, struct address_space *mapping,
570 loff_t pos, unsigned len, unsigned flags, 567 loff_t pos, unsigned len, unsigned flags,
571 struct page **pagep, void **fsdata) 568 struct page **pagep, void **fsdata)
572{ 569{
573 *pagep = NULL; 570 int ret;
574 return __ufs_write_begin(file, mapping, pos, len, flags, pagep, fsdata); 571
572 ret = block_write_begin(mapping, pos, len, flags, pagep,
573 ufs_getfrag_block);
574 if (unlikely(ret)) {
575 loff_t isize = mapping->host->i_size;
576 if (pos + len > isize)
577 vmtruncate(mapping->host, isize);
578 }
579
580 return ret;
575} 581}
576 582
577static sector_t ufs_bmap(struct address_space *mapping, sector_t block) 583static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
@@ -905,24 +911,33 @@ int ufs_sync_inode (struct inode *inode)
905 return ufs_update_inode (inode, 1); 911 return ufs_update_inode (inode, 1);
906} 912}
907 913
908void ufs_delete_inode (struct inode * inode) 914void ufs_evict_inode(struct inode * inode)
909{ 915{
910 loff_t old_i_size; 916 int want_delete = 0;
917
918 if (!inode->i_nlink && !is_bad_inode(inode))
919 want_delete = 1;
911 920
912 truncate_inode_pages(&inode->i_data, 0); 921 truncate_inode_pages(&inode->i_data, 0);
913 if (is_bad_inode(inode)) 922 if (want_delete) {
914 goto no_delete; 923 loff_t old_i_size;
915 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 924 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
916 lock_kernel(); 925 lock_kernel();
917 mark_inode_dirty(inode); 926 mark_inode_dirty(inode);
918 ufs_update_inode(inode, IS_SYNC(inode)); 927 ufs_update_inode(inode, IS_SYNC(inode));
919 old_i_size = inode->i_size; 928 old_i_size = inode->i_size;
920 inode->i_size = 0; 929 inode->i_size = 0;
921 if (inode->i_blocks && ufs_truncate(inode, old_i_size)) 930 if (inode->i_blocks && ufs_truncate(inode, old_i_size))
922 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); 931 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
923 ufs_free_inode (inode); 932 unlock_kernel();
924 unlock_kernel(); 933 }
925 return; 934
926no_delete: 935 invalidate_inode_buffers(inode);
927 clear_inode(inode); /* We must guarantee clearing of inode... */ 936 end_writeback(inode);
937
938 if (want_delete) {
939 lock_kernel();
940 ufs_free_inode (inode);
941 unlock_kernel();
942 }
928} 943}
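The UDF and UFS conversions above share one evict_inode skeleton replacing the old delete_inode/clear_inode pair: decide up front whether the inode is being deleted, truncate the page cache, do the filesystem-specific truncation and writeback, end writeback, and only then free the on-disk inode. A condensed sketch of that ordering, with the filesystem-specific steps left as placeholder comments:

    #include <linux/fs.h>
    #include <linux/buffer_head.h>

    static void myfs_evict_inode(struct inode *inode)
    {
            int want_delete = !inode->i_nlink && !is_bad_inode(inode);

            truncate_inode_pages(&inode->i_data, 0);
            if (want_delete) {
                    inode->i_size = 0;
                    /* filesystem-specific: truncate blocks, write the inode */
            }
            invalidate_inode_buffers(inode);
            end_writeback(inode);   /* inode is now clean; only freeing remains */
            if (want_delete) {
                    /* filesystem-specific: release the on-disk inode */
            }
    }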
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb..12f39b9e443 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
180 180
181 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
182 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
183 atomic_inc(&inode->i_count); 183 ihold(inode);
184 184
185 error = ufs_add_nondir(dentry, inode); 185 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 186 unlock_kernel();
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 3ec5a9eb6ef..2c47daed56d 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -696,6 +696,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
696 unsigned maxsymlen; 696 unsigned maxsymlen;
697 int ret = -EINVAL; 697 int ret = -EINVAL;
698 698
699 lock_kernel();
700
699 uspi = NULL; 701 uspi = NULL;
700 ubh = NULL; 702 ubh = NULL;
701 flags = 0; 703 flags = 0;
@@ -1163,6 +1165,7 @@ magic_found:
1163 goto failed; 1165 goto failed;
1164 1166
1165 UFSD("EXIT\n"); 1167 UFSD("EXIT\n");
1168 unlock_kernel();
1166 return 0; 1169 return 0;
1167 1170
1168dalloc_failed: 1171dalloc_failed:
@@ -1174,10 +1177,12 @@ failed:
1174 kfree(sbi); 1177 kfree(sbi);
1175 sb->s_fs_info = NULL; 1178 sb->s_fs_info = NULL;
1176 UFSD("EXIT (FAILED)\n"); 1179 UFSD("EXIT (FAILED)\n");
1180 unlock_kernel();
1177 return ret; 1181 return ret;
1178 1182
1179failed_nomem: 1183failed_nomem:
1180 UFSD("EXIT (NOMEM)\n"); 1184 UFSD("EXIT (NOMEM)\n");
1185 unlock_kernel();
1181 return -ENOMEM; 1186 return -ENOMEM;
1182} 1187}
1183 1188
@@ -1440,7 +1445,7 @@ static const struct super_operations ufs_super_ops = {
1440 .alloc_inode = ufs_alloc_inode, 1445 .alloc_inode = ufs_alloc_inode,
1441 .destroy_inode = ufs_destroy_inode, 1446 .destroy_inode = ufs_destroy_inode,
1442 .write_inode = ufs_write_inode, 1447 .write_inode = ufs_write_inode,
1443 .delete_inode = ufs_delete_inode, 1448 .evict_inode = ufs_evict_inode,
1444 .put_super = ufs_put_super, 1449 .put_super = ufs_put_super,
1445 .write_super = ufs_write_super, 1450 .write_super = ufs_write_super,
1446 .sync_fs = ufs_sync_fs, 1451 .sync_fs = ufs_sync_fs,
@@ -1449,16 +1454,16 @@ static const struct super_operations ufs_super_ops = {
1449 .show_options = ufs_show_options, 1454 .show_options = ufs_show_options,
1450}; 1455};
1451 1456
1452static int ufs_get_sb(struct file_system_type *fs_type, 1457static struct dentry *ufs_mount(struct file_system_type *fs_type,
1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1458 int flags, const char *dev_name, void *data)
1454{ 1459{
1455 return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt); 1460 return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
1456} 1461}
1457 1462
1458static struct file_system_type ufs_fs_type = { 1463static struct file_system_type ufs_fs_type = {
1459 .owner = THIS_MODULE, 1464 .owner = THIS_MODULE,
1460 .name = "ufs", 1465 .name = "ufs",
1461 .get_sb = ufs_get_sb, 1466 .mount = ufs_mount,
1462 .kill_sb = kill_block_super, 1467 .kill_sb = kill_block_super,
1463 .fs_flags = FS_REQUIRES_DEV, 1468 .fs_flags = FS_REQUIRES_DEV,
1464}; 1469};
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 589e01a465b..a58f9155fc9 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -243,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
243 ubh_bforget(ind_ubh); 243 ubh_bforget(ind_ubh);
244 ind_ubh = NULL; 244 ind_ubh = NULL;
245 } 245 }
246 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) { 246 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
247 ubh_ll_rw_block(SWRITE, ind_ubh); 247 ubh_sync_block(ind_ubh);
248 ubh_wait_on_buffer (ind_ubh);
249 }
250 ubh_brelse (ind_ubh); 248 ubh_brelse (ind_ubh);
251 249
252 UFSD("EXIT: ino %lu\n", inode->i_ino); 250 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -307,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
307 ubh_bforget(dind_bh); 305 ubh_bforget(dind_bh);
308 dind_bh = NULL; 306 dind_bh = NULL;
309 } 307 }
310 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) { 308 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
311 ubh_ll_rw_block(SWRITE, dind_bh); 309 ubh_sync_block(dind_bh);
312 ubh_wait_on_buffer (dind_bh);
313 }
314 ubh_brelse (dind_bh); 310 ubh_brelse (dind_bh);
315 311
316 UFSD("EXIT: ino %lu\n", inode->i_ino); 312 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -367,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode)
367 ubh_bforget(tind_bh); 363 ubh_bforget(tind_bh);
368 tind_bh = NULL; 364 tind_bh = NULL;
369 } 365 }
370 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) { 366 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
371 ubh_ll_rw_block(SWRITE, tind_bh); 367 ubh_sync_block(tind_bh);
372 ubh_wait_on_buffer (tind_bh);
373 }
374 ubh_brelse (tind_bh); 368 ubh_brelse (tind_bh);
375 369
376 UFSD("EXIT: ino %lu\n", inode->i_ino); 370 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -500,11 +494,6 @@ out:
500 return err; 494 return err;
501} 495}
502 496
503/*
504 * TODO:
505 * - truncate case should use proper ordering instead of using
506 * simple_setsize
507 */
508int ufs_setattr(struct dentry *dentry, struct iattr *attr) 497int ufs_setattr(struct dentry *dentry, struct iattr *attr)
509{ 498{
510 struct inode *inode = dentry->d_inode; 499 struct inode *inode = dentry->d_inode;
@@ -518,14 +507,17 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { 507 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
519 loff_t old_i_size = inode->i_size; 508 loff_t old_i_size = inode->i_size;
520 509
521 error = simple_setsize(inode, attr->ia_size); 510 /* XXX(truncate): truncate_setsize should be called last */
522 if (error) 511 truncate_setsize(inode, attr->ia_size);
523 return error; 512
524 error = ufs_truncate(inode, old_i_size); 513 error = ufs_truncate(inode, old_i_size);
525 if (error) 514 if (error)
526 return error; 515 return error;
527 } 516 }
528 return inode_setattr(inode, attr); 517
518 setattr_copy(inode, attr);
519 mark_inode_dirty(inode);
520 return 0;
529} 521}
530 522
531const struct inode_operations ufs_file_inode_operations = { 523const struct inode_operations ufs_file_inode_operations = {
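The setattr conversion drops inode_setattr() (and the short-lived simple_setsize()) in favour of explicit steps: truncate_setsize() adjusts i_size and the page cache, then setattr_copy() and mark_inode_dirty() apply the remaining attributes; the XXX comment in the hunk notes that for better crash safety truncate_setsize() should eventually run after the block-freeing work. A sketch of the resulting shape (myfs naming and the block-freeing placeholder are illustrative):

    #include <linux/fs.h>
    #include <linux/mm.h>

    static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
    {
            struct inode *inode = dentry->d_inode;
            int error;

            error = inode_change_ok(inode, attr);
            if (error)
                    return error;

            if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
                    truncate_setsize(inode, attr->ia_size);
                    /* filesystem-specific: free the truncated blocks here */
            }

            setattr_copy(inode, attr);
            mark_inode_dirty(inode);
            return 0;
    }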
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 179ae6b3180..c08782e1b48 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -108,7 +108,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
108extern struct inode *ufs_iget(struct super_block *, unsigned long); 108extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, struct writeback_control *); 109extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 110extern int ufs_sync_inode (struct inode *);
111extern void ufs_delete_inode (struct inode *); 111extern void ufs_evict_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); 112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); 113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
114 114
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 85a7fc9e4a4..d2c36d53fe6 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
113 } 113 }
114} 114}
115 115
116void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh) 116void ubh_sync_block(struct ufs_buffer_head *ubh)
117{ 117{
118 if (!ubh) 118 if (ubh) {
119 return; 119 unsigned i;
120 120
121 ll_rw_block(rw, ubh->count, ubh->bh); 121 for (i = 0; i < ubh->count; i++)
122} 122 write_dirty_buffer(ubh->bh[i], WRITE);
123 123
124void ubh_wait_on_buffer (struct ufs_buffer_head * ubh) 124 for (i = 0; i < ubh->count; i++)
125{ 125 wait_on_buffer(ubh->bh[i]);
126 unsigned i; 126 }
127 if (!ubh)
128 return;
129 for ( i = 0; i < ubh->count; i++ )
130 wait_on_buffer (ubh->bh[i]);
131} 127}
132 128
133void ubh_bforget (struct ufs_buffer_head * ubh) 129void ubh_bforget (struct ufs_buffer_head * ubh)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 23ceed8c8fb..9f8775ce381 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -257,9 +257,7 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value)
257 257
258extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); 258extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *);
259extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); 259extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t);
260extern int __ufs_write_begin(struct file *file, struct address_space *mapping, 260extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len);
261 loff_t pos, unsigned len, unsigned flags,
262 struct page **pagep, void **fsdata);
263 261
264/* 262/*
265 * These functions manipulate ufs buffers 263 * These functions manipulate ufs buffers
@@ -271,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
271extern void ubh_brelse_uspi (struct ufs_sb_private_info *); 269extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
272extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); 270extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
273extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); 271extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
274extern void ubh_ll_rw_block(int, struct ufs_buffer_head *); 272extern void ubh_sync_block(struct ufs_buffer_head *);
275extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
276extern void ubh_bforget (struct ufs_buffer_head *); 273extern void ubh_bforget (struct ufs_buffer_head *);
277extern int ubh_buffer_dirty (struct ufs_buffer_head *); 274extern int ubh_buffer_dirty (struct ufs_buffer_head *);
278#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) 275#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db5d37..179b5869065 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -126,7 +126,8 @@ out:
126 * must be owner or have write permission. 126 * must be owner or have write permission.
127 * Else, update from *times, must be owner or super user. 127 * Else, update from *times, must be owner or super user.
128 */ 128 */
129long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) 129long do_utimes(int dfd, const char __user *filename, struct timespec *times,
130 int flags)
130{ 131{
131 int error = -EINVAL; 132 int error = -EINVAL;
132 133
@@ -170,7 +171,7 @@ out:
170 return error; 171 return error;
171} 172}
172 173
173SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename, 174SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
174 struct timespec __user *, utimes, int, flags) 175 struct timespec __user *, utimes, int, flags)
175{ 176{
176 struct timespec tstimes[2]; 177 struct timespec tstimes[2];
@@ -188,7 +189,7 @@ SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
188 return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); 189 return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
189} 190}
190 191
191SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename, 192SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
192 struct timeval __user *, utimes) 193 struct timeval __user *, utimes)
193{ 194{
194 struct timeval times[2]; 195 struct timeval times[2];
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f0..6100ec0fa1d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
22config XFS_QUOTA 22config XFS_QUOTA
23 bool "XFS Quota support" 23 bool "XFS Quota support"
24 depends on XFS_FS 24 depends on XFS_FS
25 select QUOTACTL
25 help 26 help
26 If you say Y here, you will be able to set limits for disk usage on 27 If you say Y here, you will be able to set limits for disk usage on
27 a per user and/or a per group basis under XFS. XFS considers quota 28 a per user and/or a per group basis under XFS. XFS considers quota
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c8fb13f83b3..0dce969d6ca 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -87,11 +87,9 @@ xfs-y += xfs_alloc.o \
87 xfs_trans_buf.o \ 87 xfs_trans_buf.o \
88 xfs_trans_extfree.o \ 88 xfs_trans_extfree.o \
89 xfs_trans_inode.o \ 89 xfs_trans_inode.o \
90 xfs_trans_item.o \
91 xfs_utils.o \ 90 xfs_utils.o \
92 xfs_vnodeops.o \ 91 xfs_vnodeops.o \
93 xfs_rw.o \ 92 xfs_rw.o
94 xfs_dmops.o
95 93
96xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o 94xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o
97 95
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 9f769b5b38f..b2771862fd3 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -225,7 +225,7 @@ xfs_check_acl(struct inode *inode, int mask)
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 xfs_itrace_entry(ip); 228 trace_xfs_check_acl(ip);
229 229
230 /* 230 /*
231 * If there is no attribute fork no ACL exists on this inode and 231 * If there is no attribute fork no ACL exists on this inode and
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 34640d6dbdc..c9af48fffcd 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -21,19 +21,12 @@
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_ag.h" 23#include "xfs_ag.h"
24#include "xfs_dir2.h"
25#include "xfs_trans.h" 24#include "xfs_trans.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
29#include "xfs_alloc_btree.h"
30#include "xfs_ialloc_btree.h"
31#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 27#include "xfs_dinode.h"
34#include "xfs_inode.h" 28#include "xfs_inode.h"
35#include "xfs_alloc.h" 29#include "xfs_alloc.h"
36#include "xfs_btree.h"
37#include "xfs_error.h" 30#include "xfs_error.h"
38#include "xfs_rw.h" 31#include "xfs_rw.h"
39#include "xfs_iomap.h" 32#include "xfs_iomap.h"
@@ -92,18 +85,15 @@ void
92xfs_count_page_state( 85xfs_count_page_state(
93 struct page *page, 86 struct page *page,
94 int *delalloc, 87 int *delalloc,
95 int *unmapped,
96 int *unwritten) 88 int *unwritten)
97{ 89{
98 struct buffer_head *bh, *head; 90 struct buffer_head *bh, *head;
99 91
100 *delalloc = *unmapped = *unwritten = 0; 92 *delalloc = *unwritten = 0;
101 93
102 bh = head = page_buffers(page); 94 bh = head = page_buffers(page);
103 do { 95 do {
104 if (buffer_uptodate(bh) && !buffer_mapped(bh)) 96 if (buffer_unwritten(bh))
105 (*unmapped) = 1;
106 else if (buffer_unwritten(bh))
107 (*unwritten) = 1; 97 (*unwritten) = 1;
108 else if (buffer_delay(bh)) 98 else if (buffer_delay(bh))
109 (*delalloc) = 1; 99 (*delalloc) = 1;
@@ -212,23 +202,17 @@ xfs_setfilesize(
212} 202}
213 203
214/* 204/*
215 * Schedule IO completion handling on a xfsdatad if this was 205 * Schedule IO completion handling on the final put of an ioend.
216 * the final hold on this ioend. If we are asked to wait,
217 * flush the workqueue.
218 */ 206 */
219STATIC void 207STATIC void
220xfs_finish_ioend( 208xfs_finish_ioend(
221 xfs_ioend_t *ioend, 209 struct xfs_ioend *ioend)
222 int wait)
223{ 210{
224 if (atomic_dec_and_test(&ioend->io_remaining)) { 211 if (atomic_dec_and_test(&ioend->io_remaining)) {
225 struct workqueue_struct *wq; 212 if (ioend->io_type == IO_UNWRITTEN)
226 213 queue_work(xfsconvertd_workqueue, &ioend->io_work);
227 wq = (ioend->io_type == IO_UNWRITTEN) ? 214 else
228 xfsconvertd_workqueue : xfsdatad_workqueue; 215 queue_work(xfsdatad_workqueue, &ioend->io_work);
229 queue_work(wq, &ioend->io_work);
230 if (wait)
231 flush_workqueue(wq);
232 } 216 }
233} 217}
234 218
@@ -272,11 +256,25 @@ xfs_end_io(
272 */ 256 */
273 if (error == EAGAIN) { 257 if (error == EAGAIN) {
274 atomic_inc(&ioend->io_remaining); 258 atomic_inc(&ioend->io_remaining);
275 xfs_finish_ioend(ioend, 0); 259 xfs_finish_ioend(ioend);
276 /* ensure we don't spin on blocked ioends */ 260 /* ensure we don't spin on blocked ioends */
277 delay(1); 261 delay(1);
278 } else 262 } else {
263 if (ioend->io_iocb)
264 aio_complete(ioend->io_iocb, ioend->io_result, 0);
279 xfs_destroy_ioend(ioend); 265 xfs_destroy_ioend(ioend);
266 }
267}
268
269/*
270 * Call IO completion handling in caller context on the final put of an ioend.
271 */
272STATIC void
273xfs_finish_ioend_sync(
274 struct xfs_ioend *ioend)
275{
276 if (atomic_dec_and_test(&ioend->io_remaining))
277 xfs_end_io(&ioend->io_work);
280} 278}
281 279
282/* 280/*
@@ -309,6 +307,8 @@ xfs_alloc_ioend(
309 atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); 307 atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
310 ioend->io_offset = 0; 308 ioend->io_offset = 0;
311 ioend->io_size = 0; 309 ioend->io_size = 0;
310 ioend->io_iocb = NULL;
311 ioend->io_result = 0;
312 312
313 INIT_WORK(&ioend->io_work, xfs_end_io); 313 INIT_WORK(&ioend->io_work, xfs_end_io);
314 return ioend; 314 return ioend;
@@ -358,7 +358,7 @@ xfs_end_bio(
358 bio->bi_end_io = NULL; 358 bio->bi_end_io = NULL;
359 bio_put(bio); 359 bio_put(bio);
360 360
361 xfs_finish_ioend(ioend, 0); 361 xfs_finish_ioend(ioend);
362} 362}
363 363
364STATIC void 364STATIC void
@@ -500,7 +500,7 @@ xfs_submit_ioend(
500 } 500 }
501 if (bio) 501 if (bio)
502 xfs_submit_ioend_bio(wbc, ioend, bio); 502 xfs_submit_ioend_bio(wbc, ioend, bio);
503 xfs_finish_ioend(ioend, 0); 503 xfs_finish_ioend(ioend);
504 } while ((ioend = next) != NULL); 504 } while ((ioend = next) != NULL);
505} 505}
506 506
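xfs_finish_ioend() now only drops a reference and queues the completion work; the old wait flag is gone because synchronous callers use the new xfs_finish_ioend_sync(), which runs the handler directly in the caller's context instead of flushing a workqueue. The refcounted completion pattern reduced to its core (my_* names are illustrative, not the XFS symbols):

    #include <linux/workqueue.h>
    #include <asm/atomic.h>

    struct my_ioend {
            atomic_t                io_remaining;   /* outstanding bios + submitter */
            struct work_struct      io_work;        /* deferred completion */
    };

    extern struct workqueue_struct *my_completion_wq;       /* assumed */
    extern void my_end_io(struct work_struct *work);        /* assumed */

    static void my_finish_ioend(struct my_ioend *ioend)
    {
            /* last reference: defer completion to process context */
            if (atomic_dec_and_test(&ioend->io_remaining))
                    queue_work(my_completion_wq, &ioend->io_work);
    }

    static void my_finish_ioend_sync(struct my_ioend *ioend)
    {
            /* last reference: run the handler in the caller's context */
            if (atomic_dec_and_test(&ioend->io_remaining))
                    my_end_io(&ioend->io_work);
    }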
@@ -614,31 +614,30 @@ xfs_map_at_offset(
614STATIC unsigned int 614STATIC unsigned int
615xfs_probe_page( 615xfs_probe_page(
616 struct page *page, 616 struct page *page,
617 unsigned int pg_offset, 617 unsigned int pg_offset)
618 int mapped)
619{ 618{
619 struct buffer_head *bh, *head;
620 int ret = 0; 620 int ret = 0;
621 621
622 if (PageWriteback(page)) 622 if (PageWriteback(page))
623 return 0; 623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
624 630
625 if (page->mapping && PageDirty(page)) { 631 bh = head = page_buffers(page);
626 if (page_has_buffers(page)) { 632 do {
627 struct buffer_head *bh, *head; 633 if (!buffer_uptodate(bh))
628 634 break;
629 bh = head = page_buffers(page); 635 if (!buffer_mapped(bh))
630 do { 636 break;
631 if (!buffer_uptodate(bh)) 637 ret += bh->b_size;
632 break; 638 if (ret >= pg_offset)
633 if (mapped != buffer_mapped(bh)) 639 break;
634 break; 640 } while ((bh = bh->b_this_page) != head);
635 ret += bh->b_size;
636 if (ret >= pg_offset)
637 break;
638 } while ((bh = bh->b_this_page) != head);
639 } else
640 ret = mapped ? 0 : PAGE_CACHE_SIZE;
641 }
642 641
643 return ret; 642 return ret;
644} 643}
@@ -648,8 +647,7 @@ xfs_probe_cluster(
648 struct inode *inode, 647 struct inode *inode,
649 struct page *startpage, 648 struct page *startpage,
650 struct buffer_head *bh, 649 struct buffer_head *bh,
651 struct buffer_head *head, 650 struct buffer_head *head)
652 int mapped)
653{ 651{
654 struct pagevec pvec; 652 struct pagevec pvec;
655 pgoff_t tindex, tlast, tloff; 653 pgoff_t tindex, tlast, tloff;
@@ -658,7 +656,7 @@ xfs_probe_cluster(
658 656
659 /* First sum forwards in this page */ 657 /* First sum forwards in this page */
660 do { 658 do {
661 if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) 659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
662 return total; 660 return total;
663 total += bh->b_size; 661 total += bh->b_size;
664 } while ((bh = bh->b_this_page) != head); 662 } while ((bh = bh->b_this_page) != head);
@@ -692,7 +690,7 @@ xfs_probe_cluster(
692 pg_offset = PAGE_CACHE_SIZE; 690 pg_offset = PAGE_CACHE_SIZE;
693 691
694 if (page->index == tindex && trylock_page(page)) { 692 if (page->index == tindex && trylock_page(page)) {
695 pg_len = xfs_probe_page(page, pg_offset, mapped); 693 pg_len = xfs_probe_page(page, pg_offset);
696 unlock_page(page); 694 unlock_page(page);
697 } 695 }
698 696
@@ -761,7 +759,6 @@ xfs_convert_page(
761 struct xfs_bmbt_irec *imap, 759 struct xfs_bmbt_irec *imap,
762 xfs_ioend_t **ioendp, 760 xfs_ioend_t **ioendp,
763 struct writeback_control *wbc, 761 struct writeback_control *wbc,
764 int startio,
765 int all_bh) 762 int all_bh)
766{ 763{
767 struct buffer_head *bh, *head; 764 struct buffer_head *bh, *head;
@@ -832,19 +829,14 @@ xfs_convert_page(
832 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
833 830
834 xfs_map_at_offset(inode, bh, imap, offset); 831 xfs_map_at_offset(inode, bh, imap, offset);
835 if (startio) { 832 xfs_add_to_ioend(inode, bh, offset, type,
836 xfs_add_to_ioend(inode, bh, offset, 833 ioendp, done);
837 type, ioendp, done); 834
838 } else {
839 set_buffer_dirty(bh);
840 unlock_buffer(bh);
841 mark_buffer_dirty(bh);
842 }
843 page_dirty--; 835 page_dirty--;
844 count++; 836 count++;
845 } else { 837 } else {
846 type = IO_NEW; 838 type = IO_NEW;
847 if (buffer_mapped(bh) && all_bh && startio) { 839 if (buffer_mapped(bh) && all_bh) {
848 lock_buffer(bh); 840 lock_buffer(bh);
849 xfs_add_to_ioend(inode, bh, offset, 841 xfs_add_to_ioend(inode, bh, offset,
850 type, ioendp, done); 842 type, ioendp, done);
@@ -859,14 +851,12 @@ xfs_convert_page(
859 if (uptodate && bh == head) 851 if (uptodate && bh == head)
860 SetPageUptodate(page); 852 SetPageUptodate(page);
861 853
862 if (startio) { 854 if (count) {
863 if (count) { 855 if (--wbc->nr_to_write <= 0 &&
864 wbc->nr_to_write--; 856 wbc->sync_mode == WB_SYNC_NONE)
865 if (wbc->nr_to_write <= 0) 857 done = 1;
866 done = 1;
867 }
868 xfs_start_page_writeback(page, !page_dirty, count);
869 } 858 }
859 xfs_start_page_writeback(page, !page_dirty, count);
870 860
871 return done; 861 return done;
872 fail_unlock_page: 862 fail_unlock_page:
@@ -886,7 +876,6 @@ xfs_cluster_write(
886 struct xfs_bmbt_irec *imap, 876 struct xfs_bmbt_irec *imap,
887 xfs_ioend_t **ioendp, 877 xfs_ioend_t **ioendp,
888 struct writeback_control *wbc, 878 struct writeback_control *wbc,
889 int startio,
890 int all_bh, 879 int all_bh,
891 pgoff_t tlast) 880 pgoff_t tlast)
892{ 881{
@@ -902,7 +891,7 @@ xfs_cluster_write(
902 891
903 for (i = 0; i < pagevec_count(&pvec); i++) { 892 for (i = 0; i < pagevec_count(&pvec); i++) {
904 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 893 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
905 imap, ioendp, wbc, startio, all_bh); 894 imap, ioendp, wbc, all_bh);
906 if (done) 895 if (done)
907 break; 896 break;
908 } 897 }
@@ -981,7 +970,7 @@ xfs_aops_discard_page(
981 */ 970 */
982 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
983 XFS_BMAPI_ENTIRE, NULL, 0, &imap, 972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
984 &nimaps, NULL, NULL); 973 &nimaps, NULL);
985 974
986 if (error) { 975 if (error) {
987 /* something screwed, just bail */ 976 /* something screwed, just bail */
@@ -1009,7 +998,7 @@ xfs_aops_discard_page(
1009 */ 998 */
1010 xfs_bmap_init(&flist, &firstblock); 999 xfs_bmap_init(&flist, &firstblock);
1011 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, 1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
1012 &flist, NULL, &done); 1001 &flist, &done);
1013 1002
1014 ASSERT(!flist.xbf_count && !flist.xbf_first); 1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
1015 if (error) { 1004 if (error) {
@@ -1032,50 +1021,66 @@ out_invalidate:
1032} 1021}
1033 1022
1034/* 1023/*
1035 * Calling this without startio set means we are being asked to make a dirty 1024 * Write out a dirty page.
1036 * page ready for freeing it's buffers. When called with startio set then 1025 *
1037 * we are coming from writepage. 1026 * For delalloc space on the page we need to allocate space and flush it.
1027 * For unwritten space on the page we need to start the conversion to
1028 * regular allocated space.
1029 * For any other dirty buffer heads on the page we should flush them.
1038 * 1030 *
1039 * When called with startio set it is important that we write the WHOLE 1031 * If we detect that a transaction would be required to flush the page, we
1040 * page if possible. 1032 * have to check the process flags first: if we are already in a transaction
1041 * The bh->b_state's cannot know if any of the blocks or which block for 1033 * or disk I/O during allocations is off, we need to fail the writepage and
1042 * that matter are dirty due to mmap writes, and therefore bh uptodate is 1034 * redirty the page.
1043 * only valid if the page itself isn't completely uptodate. Some layers
1044 * may clear the page dirty flag prior to calling write page, under the
1045 * assumption the entire page will be written out; by not writing out the
1046 * whole page the page can be reused before all valid dirty data is
1047 * written out. Note: in the case of a page that has been dirty'd by
1048 * mapwrite and but partially setup by block_prepare_write the
1049 * bh->b_states's will not agree and only ones setup by BPW/BCW will have
1050 * valid state, thus the whole page must be written out thing.
1051 */ 1035 */
1052
1053STATIC int 1036STATIC int
1054xfs_page_state_convert( 1037xfs_vm_writepage(
1055 struct inode *inode, 1038 struct page *page,
1056 struct page *page, 1039 struct writeback_control *wbc)
1057 struct writeback_control *wbc,
1058 int startio,
1059 int unmapped) /* also implies page uptodate */
1060{ 1040{
1041 struct inode *inode = page->mapping->host;
1042 int delalloc, unwritten;
1061 struct buffer_head *bh, *head; 1043 struct buffer_head *bh, *head;
1062 struct xfs_bmbt_irec imap; 1044 struct xfs_bmbt_irec imap;
1063 xfs_ioend_t *ioend = NULL, *iohead = NULL; 1045 xfs_ioend_t *ioend = NULL, *iohead = NULL;
1064 loff_t offset; 1046 loff_t offset;
1065 unsigned long p_offset = 0;
1066 unsigned int type; 1047 unsigned int type;
1067 __uint64_t end_offset; 1048 __uint64_t end_offset;
1068 pgoff_t end_index, last_index; 1049 pgoff_t end_index, last_index;
1069 ssize_t size, len; 1050 ssize_t size, len;
1070 int flags, err, imap_valid = 0, uptodate = 1; 1051 int flags, err, imap_valid = 0, uptodate = 1;
1071 int page_dirty, count = 0; 1052 int count = 0;
1072 int trylock = 0; 1053 int all_bh = 0;
1073 int all_bh = unmapped;
1074 1054
1075 if (startio) { 1055 trace_xfs_writepage(inode, page, 0);
1076 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) 1056
1077 trylock |= BMAPI_TRYLOCK; 1057 ASSERT(page_has_buffers(page));
1078 } 1058
1059 /*
1060 * Refuse to write the page out if we are called from reclaim context.
1061 *
1062 * This avoids stack overflows when called from deep call stacks in
1063 * random callers for direct reclaim or memcg reclaim. We explicitly
1064 * allow reclaim from kswapd as the stack usage there is relatively low.
1065 *
1066 * This should really be done by the core VM, but until that happens
1067 * filesystems like XFS, btrfs and ext4 have to take care of this
1068 * by themselves.
1069 */
1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
1071 goto redirty;
1072
1073 /*
1074 * We need a transaction if there are delalloc or unwritten buffers
1075 * on the page.
1076 *
1077 * If we need a transaction and the process flags say we are already
1078 * in a transaction, or no IO is allowed then mark the page dirty
1079 * again and leave the page as is.
1080 */
1081 xfs_count_page_state(page, &delalloc, &unwritten);
1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
1083 goto redirty;
1079 1084
1080 /* Is this page beyond the end of the file? */ 1085 /* Is this page beyond the end of the file? */
1081 offset = i_size_read(inode); 1086 offset = i_size_read(inode);
@@ -1084,50 +1089,33 @@ xfs_page_state_convert(
1084 if (page->index >= end_index) { 1089 if (page->index >= end_index) {
1085 if ((page->index >= end_index + 1) || 1090 if ((page->index >= end_index + 1) ||
1086 !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { 1091 !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
1087 if (startio) 1092 unlock_page(page);
1088 unlock_page(page);
1089 return 0; 1093 return 0;
1090 } 1094 }
1091 } 1095 }
1092 1096
1093 /*
1094 * page_dirty is initially a count of buffers on the page before
1095 * EOF and is decremented as we move each into a cleanable state.
1096 *
1097 * Derivation:
1098 *
1099 * End offset is the highest offset that this page should represent.
1100 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
1101 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
1102 * hence give us the correct page_dirty count. On any other page,
1103 * it will be zero and in that case we need page_dirty to be the
1104 * count of buffers on the page.
1105 */
1106 end_offset = min_t(unsigned long long, 1097 end_offset = min_t(unsigned long long,
1107 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); 1098 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
1099 offset);
1108 len = 1 << inode->i_blkbits; 1100 len = 1 << inode->i_blkbits;
1109 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
1110 PAGE_CACHE_SIZE);
1111 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
1112 page_dirty = p_offset / len;
1113 1101
1114 bh = head = page_buffers(page); 1102 bh = head = page_buffers(page);
1115 offset = page_offset(page); 1103 offset = page_offset(page);
1116 flags = BMAPI_READ; 1104 flags = BMAPI_READ;
1117 type = IO_NEW; 1105 type = IO_NEW;
1118 1106
1119 /* TODO: cleanup count and page_dirty */
1120
1121 do { 1107 do {
1122 if (offset >= end_offset) 1108 if (offset >= end_offset)
1123 break; 1109 break;
1124 if (!buffer_uptodate(bh)) 1110 if (!buffer_uptodate(bh))
1125 uptodate = 0; 1111 uptodate = 0;
1126 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { 1112
1127 /* 1113 /*
1128 * the iomap is actually still valid, but the ioend 1114 * A hole may still be marked uptodate because discard_buffer
1129 * isn't. shouldn't happen too often. 1115 * leaves the flag set.
1130 */ 1116 */
1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1131 imap_valid = 0; 1119 imap_valid = 0;
1132 continue; 1120 continue;
1133 } 1121 }
@@ -1135,19 +1123,7 @@ xfs_page_state_convert(
1135 if (imap_valid) 1123 if (imap_valid)
1136 imap_valid = xfs_imap_valid(inode, &imap, offset); 1124 imap_valid = xfs_imap_valid(inode, &imap, offset);
1137 1125
1138 /* 1126 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1139 * First case, map an unwritten extent and prepare for
1140 * extent state conversion transaction on completion.
1141 *
1142 * Second case, allocate space for a delalloc buffer.
1143 * We can return EAGAIN here in the release page case.
1144 *
1145 * Third case, an unmapped buffer was found, and we are
1146 * in a path where we need to write the whole page out.
1147 */
1148 if (buffer_unwritten(bh) || buffer_delay(bh) ||
1149 ((buffer_uptodate(bh) || PageUptodate(page)) &&
1150 !buffer_mapped(bh) && (unmapped || startio))) {
1151 int new_ioend = 0; 1127 int new_ioend = 0;
1152 1128
1153 /* 1129 /*
@@ -1161,15 +1137,15 @@ xfs_page_state_convert(
1161 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 1137 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1162 } else if (buffer_delay(bh)) { 1138 } else if (buffer_delay(bh)) {
1163 type = IO_DELAY; 1139 type = IO_DELAY;
1164 flags = BMAPI_ALLOCATE | trylock; 1140 flags = BMAPI_ALLOCATE;
1165 } else { 1141
1166 type = IO_NEW; 1142 if (wbc->sync_mode == WB_SYNC_NONE)
1167 flags = BMAPI_WRITE | BMAPI_MMAP; 1143 flags |= BMAPI_TRYLOCK;
1168 } 1144 }
1169 1145
1170 if (!imap_valid) { 1146 if (!imap_valid) {
1171 /* 1147 /*
1172 * if we didn't have a valid mapping then we 1148 * If we didn't have a valid mapping then we
1173 * need to ensure that we put the new mapping 1149 * need to ensure that we put the new mapping
1174 * in a new ioend structure. This needs to be 1150 * in a new ioend structure. This needs to be
1175 * done to ensure that the ioends correctly 1151 * done to ensure that the ioends correctly
@@ -1177,14 +1153,7 @@ xfs_page_state_convert(
1177 * for unwritten extent conversion. 1153 * for unwritten extent conversion.
1178 */ 1154 */
1179 new_ioend = 1; 1155 new_ioend = 1;
1180 if (type == IO_NEW) { 1156 err = xfs_map_blocks(inode, offset, len,
1181 size = xfs_probe_cluster(inode,
1182 page, bh, head, 0);
1183 } else {
1184 size = len;
1185 }
1186
1187 err = xfs_map_blocks(inode, offset, size,
1188 &imap, flags); 1157 &imap, flags);
1189 if (err) 1158 if (err)
1190 goto error; 1159 goto error;
@@ -1193,19 +1162,11 @@ xfs_page_state_convert(
1193 } 1162 }
1194 if (imap_valid) { 1163 if (imap_valid) {
1195 xfs_map_at_offset(inode, bh, &imap, offset); 1164 xfs_map_at_offset(inode, bh, &imap, offset);
1196 if (startio) { 1165 xfs_add_to_ioend(inode, bh, offset, type,
1197 xfs_add_to_ioend(inode, bh, offset, 1166 &ioend, new_ioend);
1198 type, &ioend,
1199 new_ioend);
1200 } else {
1201 set_buffer_dirty(bh);
1202 unlock_buffer(bh);
1203 mark_buffer_dirty(bh);
1204 }
1205 page_dirty--;
1206 count++; 1167 count++;
1207 } 1168 }
1208 } else if (buffer_uptodate(bh) && startio) { 1169 } else if (buffer_uptodate(bh)) {
1209 /* 1170 /*
1210 * we got here because the buffer is already mapped. 1171 * we got here because the buffer is already mapped.
1211 * That means it must already have extents allocated 1172 * That means it must already have extents allocated
@@ -1213,8 +1174,7 @@ xfs_page_state_convert(
1213 */ 1174 */
1214 if (!imap_valid || flags != BMAPI_READ) { 1175 if (!imap_valid || flags != BMAPI_READ) {
1215 flags = BMAPI_READ; 1176 flags = BMAPI_READ;
1216 size = xfs_probe_cluster(inode, page, bh, 1177 size = xfs_probe_cluster(inode, page, bh, head);
1217 head, 1);
1218 err = xfs_map_blocks(inode, offset, size, 1178 err = xfs_map_blocks(inode, offset, size,
1219 &imap, flags); 1179 &imap, flags);
1220 if (err) 1180 if (err)
@@ -1233,18 +1193,16 @@ xfs_page_state_convert(
1233 */ 1193 */
1234 type = IO_NEW; 1194 type = IO_NEW;
1235 if (trylock_buffer(bh)) { 1195 if (trylock_buffer(bh)) {
1236 ASSERT(buffer_mapped(bh));
1237 if (imap_valid) 1196 if (imap_valid)
1238 all_bh = 1; 1197 all_bh = 1;
1239 xfs_add_to_ioend(inode, bh, offset, type, 1198 xfs_add_to_ioend(inode, bh, offset, type,
1240 &ioend, !imap_valid); 1199 &ioend, !imap_valid);
1241 page_dirty--;
1242 count++; 1200 count++;
1243 } else { 1201 } else {
1244 imap_valid = 0; 1202 imap_valid = 0;
1245 } 1203 }
1246 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 1204 } else if (PageUptodate(page)) {
1247 (unmapped || startio)) { 1205 ASSERT(buffer_mapped(bh));
1248 imap_valid = 0; 1206 imap_valid = 0;
1249 } 1207 }
1250 1208
@@ -1256,8 +1214,7 @@ xfs_page_state_convert(
1256 if (uptodate && bh == head) 1214 if (uptodate && bh == head)
1257 SetPageUptodate(page); 1215 SetPageUptodate(page);
1258 1216
1259 if (startio) 1217 xfs_start_page_writeback(page, 1, count);
1260 xfs_start_page_writeback(page, 1, count);
1261 1218
1262 if (ioend && imap_valid) { 1219 if (ioend && imap_valid) {
1263 xfs_off_t end_index; 1220 xfs_off_t end_index;
@@ -1275,131 +1232,30 @@ xfs_page_state_convert(
1275 end_index = last_index; 1232 end_index = last_index;
1276 1233
1277 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1234 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1278 wbc, startio, all_bh, end_index); 1235 wbc, all_bh, end_index);
1279 } 1236 }
1280 1237
1281 if (iohead) 1238 if (iohead)
1282 xfs_submit_ioend(wbc, iohead); 1239 xfs_submit_ioend(wbc, iohead);
1283 1240
1284 return page_dirty; 1241 return 0;
1285 1242
1286error: 1243error:
1287 if (iohead) 1244 if (iohead)
1288 xfs_cancel_ioend(iohead); 1245 xfs_cancel_ioend(iohead);
1289 1246
1290 /* 1247 if (err == -EAGAIN)
1291 * If it's delalloc and we have nowhere to put it, 1248 goto redirty;
1292 * throw it away, unless the lower layers told
1293 * us to try again.
1294 */
1295 if (err != -EAGAIN) {
1296 if (!unmapped)
1297 xfs_aops_discard_page(page);
1298 ClearPageUptodate(page);
1299 }
1300 return err;
1301}
1302 1249
1303/* 1250 xfs_aops_discard_page(page);
1304 * writepage: Called from one of two places: 1251 ClearPageUptodate(page);
1305 * 1252 unlock_page(page);
1306 * 1. we are flushing a delalloc buffer head. 1253 return err;
1307 *
1308 * 2. we are writing out a dirty page. Typically the page dirty
1309 * state is cleared before we get here. In this case is it
1310 * conceivable we have no buffer heads.
1311 *
1312 * For delalloc space on the page we need to allocate space and
1313 * flush it. For unmapped buffer heads on the page we should
1314 * allocate space if the page is uptodate. For any other dirty
1315 * buffer heads on the page we should flush them.
1316 *
1317 * If we detect that a transaction would be required to flush
1318 * the page, we have to check the process flags first, if we
1319 * are already in a transaction or disk I/O during allocations
1320 * is off, we need to fail the writepage and redirty the page.
1321 */
1322
1323STATIC int
1324xfs_vm_writepage(
1325 struct page *page,
1326 struct writeback_control *wbc)
1327{
1328 int error;
1329 int need_trans;
1330 int delalloc, unmapped, unwritten;
1331 struct inode *inode = page->mapping->host;
1332
1333 trace_xfs_writepage(inode, page, 0);
1334
1335 /*
1336 * Refuse to write the page out if we are called from reclaim context.
1337 *
1338 * This is primarily to avoid stack overflows when called from deep
1339 * used stacks in random callers for direct reclaim, but disabling
1340 * reclaim for kswap is a nice side-effect as kswapd causes rather
1341 * suboptimal I/O patterns, too.
1342 *
1343 * This should really be done by the core VM, but until that happens
1344 * filesystems like XFS, btrfs and ext4 have to take care of this
1345 * by themselves.
1346 */
1347 if (current->flags & PF_MEMALLOC)
1348 goto out_fail;
1349
1350 /*
1351 * We need a transaction if:
1352 * 1. There are delalloc buffers on the page
1353 * 2. The page is uptodate and we have unmapped buffers
1354 * 3. The page is uptodate and we have no buffers
1355 * 4. There are unwritten buffers on the page
1356 */
1357
1358 if (!page_has_buffers(page)) {
1359 unmapped = 1;
1360 need_trans = 1;
1361 } else {
1362 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1363 if (!PageUptodate(page))
1364 unmapped = 0;
1365 need_trans = delalloc + unmapped + unwritten;
1366 }
1367
1368 /*
1369 * If we need a transaction and the process flags say
1370 * we are already in a transaction, or no IO is allowed
1371 * then mark the page dirty again and leave the page
1372 * as is.
1373 */
1374 if (current_test_flags(PF_FSTRANS) && need_trans)
1375 goto out_fail;
1376
1377 /*
1378 * Delay hooking up buffer heads until we have
1379 * made our go/no-go decision.
1380 */
1381 if (!page_has_buffers(page))
1382 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1383
1384 /*
1385 * Convert delayed allocate, unwritten or unmapped space
1386 * to real space and flush out to disk.
1387 */
1388 error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
1389 if (error == -EAGAIN)
1390 goto out_fail;
1391 if (unlikely(error < 0))
1392 goto out_unlock;
1393
1394 return 0;
1395 1254
1396out_fail: 1255redirty:
1397 redirty_page_for_writepage(wbc, page); 1256 redirty_page_for_writepage(wbc, page);
1398 unlock_page(page); 1257 unlock_page(page);
1399 return 0; 1258 return 0;
1400out_unlock:
1401 unlock_page(page);
1402 return error;
1403} 1259}
1404 1260
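With the startio and unmapped paths gone, xfs_vm_writepage() is a plain writeback path behind two early guards: refuse writeout from direct reclaim (PF_MEMALLOC without PF_KSWAPD) to bound stack depth, and refuse pages whose delalloc or unwritten buffers would need a transaction while PF_FSTRANS is already set. Those guards in isolation, as a sketch (needs_transaction stands in for the xfs_count_page_state() result):

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/sched.h>
    #include <linux/writeback.h>

    static int myfs_writepage_guards(struct page *page,
                                     struct writeback_control *wbc,
                                     int needs_transaction)
    {
            /*
             * Direct reclaim (PF_MEMALLOC without PF_KSWAPD) can arrive on
             * an arbitrarily deep call stack; writing out from there risks
             * stack overflow, so redirty and let ordinary writeback retry.
             */
            if ((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == PF_MEMALLOC)
                    goto redirty;

            /* already inside a transaction: cannot start another one */
            if ((current->flags & PF_FSTRANS) && needs_transaction)
                    goto redirty;

            return 0;       /* proceed with real writeback */

    redirty:
            redirty_page_for_writepage(wbc, page);
            unlock_page(page);
            return 1;       /* handled: page redirtied, caller returns 0 */
    }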
1405STATIC int 1261STATIC int
@@ -1413,65 +1269,27 @@ xfs_vm_writepages(
1413 1269
1414/* 1270/*
1415 * Called to move a page into cleanable state - and from there 1271 * Called to move a page into cleanable state - and from there
1416 * to be released. Possibly the page is already clean. We always 1272 * to be released. The page should already be clean. We always
1417 * have buffer heads in this call. 1273 * have buffer heads in this call.
1418 * 1274 *
1419 * Returns 0 if the page is ok to release, 1 otherwise. 1275 * Returns 1 if the page is ok to release, 0 otherwise.
1420 *
1421 * Possible scenarios are:
1422 *
1423 * 1. We are being called to release a page which has been written
1424 * to via regular I/O. buffer heads will be dirty and possibly
1425 * delalloc. If no delalloc buffer heads in this case then we
1426 * can just return zero.
1427 *
1428 * 2. We are called to release a page which has been written via
1429 * mmap, all we need to do is ensure there is no delalloc
1430 * state in the buffer heads, if not we can let the caller
1431 * free them and we should come back later via writepage.
1432 */ 1276 */
1433STATIC int 1277STATIC int
1434xfs_vm_releasepage( 1278xfs_vm_releasepage(
1435 struct page *page, 1279 struct page *page,
1436 gfp_t gfp_mask) 1280 gfp_t gfp_mask)
1437{ 1281{
1438 struct inode *inode = page->mapping->host; 1282 int delalloc, unwritten;
1439 int dirty, delalloc, unmapped, unwritten;
1440 struct writeback_control wbc = {
1441 .sync_mode = WB_SYNC_ALL,
1442 .nr_to_write = 1,
1443 };
1444 1283
1445 trace_xfs_releasepage(inode, page, 0); 1284 trace_xfs_releasepage(page->mapping->host, page, 0);
1446 1285
1447 if (!page_has_buffers(page)) 1286 xfs_count_page_state(page, &delalloc, &unwritten);
1448 return 0;
1449
1450 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1451 if (!delalloc && !unwritten)
1452 goto free_buffers;
1453 1287
1454 if (!(gfp_mask & __GFP_FS)) 1288 if (WARN_ON(delalloc))
1455 return 0; 1289 return 0;
1456 1290 if (WARN_ON(unwritten))
1457 /* If we are already inside a transaction or the thread cannot
1458 * do I/O, we cannot release this page.
1459 */
1460 if (current_test_flags(PF_FSTRANS))
1461 return 0; 1291 return 0;
1462 1292
1463 /*
1464 * Convert delalloc space to real space, do not flush the
1465 * data out to disk, that will be done by the caller.
1466 * Never need to allocate space here - we will always
1467 * come back to writepage in that case.
1468 */
1469 dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
1470 if (dirty == 0 && !unwritten)
1471 goto free_buffers;
1472 return 0;
1473
1474free_buffers:
1475 return try_to_free_buffers(page); 1293 return try_to_free_buffers(page);
1476} 1294}
1477 1295
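
After this rewrite releasepage no longer converts anything itself; it only checks that no delalloc or unwritten state leaked through writeback. A compact sketch of the new contract, with stubs standing in for xfs_count_page_state() and try_to_free_buffers():

#include <stdio.h>

/* hypothetical stand-ins for the real helpers */
static void count_page_state(int *delalloc, int *unwritten)
{
	*delalloc = *unwritten = 0;
}
static int try_free_buffers(void) { return 1; }

static int releasepage(void)
{
	int delalloc, unwritten;

	count_page_state(&delalloc, &unwritten);
	if (delalloc || unwritten) {
		/* writeback should have consumed this state already */
		fprintf(stderr, "WARN: dirty state on releasepage\n");
		return 0;	/* refuse to release */
	}
	return try_free_buffers();	/* 1 == ok to release */
}
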
@@ -1481,9 +1299,9 @@ __xfs_get_blocks(
1481 sector_t iblock, 1299 sector_t iblock,
1482 struct buffer_head *bh_result, 1300 struct buffer_head *bh_result,
1483 int create, 1301 int create,
1484 int direct, 1302 int direct)
1485 bmapi_flags_t flags)
1486{ 1303{
1304 int flags = create ? BMAPI_WRITE : BMAPI_READ;
1487 struct xfs_bmbt_irec imap; 1305 struct xfs_bmbt_irec imap;
1488 xfs_off_t offset; 1306 xfs_off_t offset;
1489 ssize_t size; 1307 ssize_t size;
@@ -1498,8 +1316,11 @@ __xfs_get_blocks(
1498 if (!create && direct && offset >= i_size_read(inode)) 1316 if (!create && direct && offset >= i_size_read(inode))
1499 return 0; 1317 return 0;
1500 1318
1501 error = xfs_iomap(XFS_I(inode), offset, size, 1319 if (direct && create)
1502 create ? flags : BMAPI_READ, &imap, &nimap, &new); 1320 flags |= BMAPI_DIRECT;
1321
1322 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
1323 &new);
1503 if (error) 1324 if (error)
1504 return -error; 1325 return -error;
1505 if (nimap == 0) 1326 if (nimap == 0)
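
With the bmapi_flags_t parameter gone, the mapping flags are derived entirely from the create/direct pair. A sketch of that derivation; the flag values here are assumptions for illustration, the real BMAPI_* constants live in the XFS headers:

enum {
	BMAPI_READ   = 1 << 0,	/* assumed values */
	BMAPI_WRITE  = 1 << 1,
	BMAPI_DIRECT = 1 << 2,
};

static int get_blocks_flags(int create, int direct)
{
	int flags = create ? BMAPI_WRITE : BMAPI_READ;

	/* direct writes may need unwritten extents allocated */
	if (direct && create)
		flags |= BMAPI_DIRECT;
	return flags;
}
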
@@ -1579,8 +1400,7 @@ xfs_get_blocks(
1579 struct buffer_head *bh_result, 1400 struct buffer_head *bh_result,
1580 int create) 1401 int create)
1581{ 1402{
1582 return __xfs_get_blocks(inode, iblock, 1403 return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
1583 bh_result, create, 0, BMAPI_WRITE);
1584} 1404}
1585 1405
1586STATIC int 1406STATIC int
@@ -1590,61 +1410,59 @@ xfs_get_blocks_direct(
1590 struct buffer_head *bh_result, 1410 struct buffer_head *bh_result,
1591 int create) 1411 int create)
1592{ 1412{
1593 return __xfs_get_blocks(inode, iblock, 1413 return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
1594 bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
1595} 1414}
1596 1415
1416/*
1417 * Complete a direct I/O write request.
1418 *
1419 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1420 * need to issue a transaction to convert the range from unwritten to written
1421 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1422 * to do this and we are done. But in case this was a successful
1423 * request this handler is called from interrupt context, from which we
1424 * can't start transactions. In that case offload the I/O completion to
1425 * the workqueues we also use for buffered I/O completion.
1426 */
1597STATIC void 1427STATIC void
1598xfs_end_io_direct( 1428xfs_end_io_direct_write(
1599 struct kiocb *iocb, 1429 struct kiocb *iocb,
1600 loff_t offset, 1430 loff_t offset,
1601 ssize_t size, 1431 ssize_t size,
1602 void *private) 1432 void *private,
1433 int ret,
1434 bool is_async)
1603{ 1435{
1604 xfs_ioend_t *ioend = iocb->private; 1436 struct xfs_ioend *ioend = iocb->private;
1605 1437
1606 /* 1438 /*
1607 * Non-NULL private data means we need to issue a transaction to 1439 * blockdev_direct_IO can return an error even after the I/O
1608 * convert a range from unwritten to written extents. This needs 1440 * completion handler was called. Thus we need to protect
1609 * to happen from process context but aio+dio I/O completion 1441 * against double-freeing.
1610 * happens from irq context so we need to defer it to a workqueue.
1611 * This is not necessary for synchronous direct I/O, but we do
1612 * it anyway to keep the code uniform and simpler.
1613 *
1614 * Well, if only it were that simple. Because synchronous direct I/O
1615 * requires extent conversion to occur *before* we return to userspace,
1616 * we have to wait for extent conversion to complete. Look at the
1617 * iocb that has been passed to us to determine if this is AIO or
1618 * not. If it is synchronous, tell xfs_finish_ioend() to kick the
1619 * workqueue and wait for it to complete.
1620 *
1621 * The core direct I/O code might be changed to always call the
1622 * completion handler in the future, in which case all this can
1623 * go away.
1624 */ 1442 */
1443 iocb->private = NULL;
1444
1625 ioend->io_offset = offset; 1445 ioend->io_offset = offset;
1626 ioend->io_size = size; 1446 ioend->io_size = size;
1627 if (ioend->io_type == IO_READ) { 1447 if (private && size > 0)
1628 xfs_finish_ioend(ioend, 0); 1448 ioend->io_type = IO_UNWRITTEN;
1629 } else if (private && size > 0) { 1449
1630 xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); 1450 if (is_async) {
1631 } else {
1632 /* 1451 /*
1633 * A direct I/O write ioend starts its life in unwritten 1452 * If we are converting an unwritten extent we need to delay
1634 * state in case they map an unwritten extent. This write 1453 * the AIO completion until after the unwritten extent
1635 * didn't map an unwritten extent so switch its completion 1454 * conversion has completed, otherwise do it ASAP.
1636 * handler.
1637 */ 1455 */
1638 ioend->io_type = IO_NEW; 1456 if (ioend->io_type == IO_UNWRITTEN) {
1639 xfs_finish_ioend(ioend, 0); 1457 ioend->io_iocb = iocb;
1458 ioend->io_result = ret;
1459 } else {
1460 aio_complete(iocb, ret, 0);
1461 }
1462 xfs_finish_ioend(ioend);
1463 } else {
1464 xfs_finish_ioend_sync(ioend);
1640 } 1465 }
1641
1642 /*
1643 * blockdev_direct_IO can return an error even after the I/O
1644 * completion handler was called. Thus we need to protect
1645 * against double-freeing.
1646 */
1647 iocb->private = NULL;
1648} 1466}
1649 1467
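
The dispatch in xfs_end_io_direct_write() can be summarised in a few lines. A userspace sketch, with callbacks standing in for aio_complete() and for the workqueue/synchronous ioend finishers; all names here are hypothetical:

struct dio_end {
	int	unwritten;	/* extent conversion still needed */
	void	*iocb;		/* AIO completion deferred until conversion */
	int	result;
};

static void end_io_direct_write(struct dio_end *io, void *iocb, int ret,
				int is_async,
				void (*finish_queued)(struct dio_end *),
				void (*finish_sync)(struct dio_end *),
				void (*aio_done)(void *, int))
{
	if (is_async) {
		if (io->unwritten) {
			/* complete the AIO only after conversion is done */
			io->iocb = iocb;
			io->result = ret;
		} else {
			aio_done(iocb, ret);
		}
		finish_queued(io);	/* no transactions in irq context */
	} else {
		finish_sync(io);	/* sync I/O may convert inline */
	}
}
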
1650STATIC ssize_t 1468STATIC ssize_t
@@ -1655,26 +1473,45 @@ xfs_vm_direct_IO(
1655 loff_t offset, 1473 loff_t offset,
1656 unsigned long nr_segs) 1474 unsigned long nr_segs)
1657{ 1475{
1658 struct file *file = iocb->ki_filp; 1476 struct inode *inode = iocb->ki_filp->f_mapping->host;
1659 struct inode *inode = file->f_mapping->host; 1477 struct block_device *bdev = xfs_find_bdev_for_inode(inode);
1660 struct block_device *bdev; 1478 ssize_t ret;
1661 ssize_t ret;
1662
1663 bdev = xfs_find_bdev_for_inode(inode);
1664 1479
1665 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1480 if (rw & WRITE) {
1666 IO_UNWRITTEN : IO_READ); 1481 iocb->private = xfs_alloc_ioend(inode, IO_NEW);
1667 1482
1668 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1483 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1669 offset, nr_segs, 1484 offset, nr_segs,
1670 xfs_get_blocks_direct, 1485 xfs_get_blocks_direct,
1671 xfs_end_io_direct); 1486 xfs_end_io_direct_write, NULL, 0);
1487 if (ret != -EIOCBQUEUED && iocb->private)
1488 xfs_destroy_ioend(iocb->private);
1489 } else {
1490 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1491 offset, nr_segs,
1492 xfs_get_blocks_direct,
1493 NULL, NULL, 0);
1494 }
1672 1495
1673 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1674 xfs_destroy_ioend(iocb->private);
1675 return ret; 1496 return ret;
1676} 1497}
1677 1498
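
The split above follows a common direct I/O pattern: only writes carry a private ioend and a completion handler, and that ioend must be torn down if the request failed or completed before ever being queued. A condensed sketch with hypothetical names (529 is EIOCBQUEUED in Linux's errno.h):

#define EIOCBQUEUED 529			/* value from linux errno.h */

static long direct_io(int is_write, void **private,
		      long (*submit)(void *priv),
		      void *(*alloc_ioend)(void),
		      void (*destroy_ioend)(void *))
{
	long ret;

	if (!is_write)
		return submit(NULL);	/* reads need no completion state */

	*private = alloc_ioend();
	ret = submit(*private);
	/* never queued: the completion handler will not run, clean up */
	if (ret != -EIOCBQUEUED && *private)
		destroy_ioend(*private);
	return ret;
}
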
1499STATIC void
1500xfs_vm_write_failed(
1501 struct address_space *mapping,
1502 loff_t to)
1503{
1504 struct inode *inode = mapping->host;
1505
1506 if (to > inode->i_size) {
1507 struct iattr ia = {
1508 .ia_valid = ATTR_SIZE | ATTR_FORCE,
1509 .ia_size = inode->i_size,
1510 };
1511 xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
1512 }
1513}
1514
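
xfs_vm_write_failed() exists because block_write_begin() can allocate blocks past EOF before failing; truncating back to the old i_size throws those allocations away. A minimal model of the check, with a callback standing in for the ATTR_SIZE|ATTR_FORCE setattr:

struct small_inode {
	long long i_size;
};

/* Trim allocations past the old EOF after a failed or short write. */
static void write_failed(struct small_inode *inode, long long to,
			 void (*truncate_to)(struct small_inode *, long long))
{
	if (to > inode->i_size)
		truncate_to(inode, inode->i_size);
}
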
1678STATIC int 1515STATIC int
1679xfs_vm_write_begin( 1516xfs_vm_write_begin(
1680 struct file *file, 1517 struct file *file,
@@ -1685,9 +1522,31 @@ xfs_vm_write_begin(
1685 struct page **pagep, 1522 struct page **pagep,
1686 void **fsdata) 1523 void **fsdata)
1687{ 1524{
1688 *pagep = NULL; 1525 int ret;
1689 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1526
1690 xfs_get_blocks); 1527 ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
1528 pagep, xfs_get_blocks);
1529 if (unlikely(ret))
1530 xfs_vm_write_failed(mapping, pos + len);
1531 return ret;
1532}
1533
1534STATIC int
1535xfs_vm_write_end(
1536 struct file *file,
1537 struct address_space *mapping,
1538 loff_t pos,
1539 unsigned len,
1540 unsigned copied,
1541 struct page *page,
1542 void *fsdata)
1543{
1544 int ret;
1545
1546 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1547 if (unlikely(ret < len))
1548 xfs_vm_write_failed(mapping, pos + len);
1549 return ret;
1691} 1550}
1692 1551
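
Both hooks funnel into the same failure path: write_begin on any error, write_end whenever fewer bytes landed than were requested. The wrapper shape, reusing the write_failed() model sketched above:

static int write_begin_tail(long long pos, unsigned len, int err,
			    void (*failed)(long long to))
{
	if (err)
		failed(pos + len);	/* trim past-EOF allocations */
	return err;
}

static long write_end_tail(long long pos, unsigned len, long copied,
			   void (*failed)(long long to))
{
	if (copied < len)		/* short copy: blocks may dangle */
		failed(pos + len);
	return copied;
}
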
1693STATIC sector_t 1552STATIC sector_t
@@ -1698,7 +1557,7 @@ xfs_vm_bmap(
1698 struct inode *inode = (struct inode *)mapping->host; 1557 struct inode *inode = (struct inode *)mapping->host;
1699 struct xfs_inode *ip = XFS_I(inode); 1558 struct xfs_inode *ip = XFS_I(inode);
1700 1559
1701 xfs_itrace_entry(XFS_I(inode)); 1560 trace_xfs_vm_bmap(XFS_I(inode));
1702 xfs_ilock(ip, XFS_IOLOCK_SHARED); 1561 xfs_ilock(ip, XFS_IOLOCK_SHARED);
1703 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); 1562 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
1704 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 1563 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -1732,7 +1591,7 @@ const struct address_space_operations xfs_address_space_operations = {
1732 .releasepage = xfs_vm_releasepage, 1591 .releasepage = xfs_vm_releasepage,
1733 .invalidatepage = xfs_vm_invalidatepage, 1592 .invalidatepage = xfs_vm_invalidatepage,
1734 .write_begin = xfs_vm_write_begin, 1593 .write_begin = xfs_vm_write_begin,
1735 .write_end = generic_write_end, 1594 .write_end = xfs_vm_write_end,
1736 .bmap = xfs_vm_bmap, 1595 .bmap = xfs_vm_bmap,
1737 .direct_IO = xfs_vm_direct_IO, 1596 .direct_IO = xfs_vm_direct_IO,
1738 .migratepage = buffer_migrate_page, 1597 .migratepage = buffer_migrate_page,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 4cfc6ea87df..c5057fb6237 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -37,6 +37,8 @@ typedef struct xfs_ioend {
37 size_t io_size; /* size of the extent */ 37 size_t io_size; /* size of the extent */
38 xfs_off_t io_offset; /* offset in the file */ 38 xfs_off_t io_offset; /* offset in the file */
39 struct work_struct io_work; /* xfsdatad work queue */ 39 struct work_struct io_work; /* xfsdatad work queue */
40 struct kiocb *io_iocb;
41 int io_result;
40} xfs_ioend_t; 42} xfs_ioend_t;
41 43
42extern const struct address_space_operations xfs_address_space_operations; 44extern const struct address_space_operations xfs_address_space_operations;
@@ -45,6 +47,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
45extern void xfs_ioend_init(void); 47extern void xfs_ioend_init(void);
46extern void xfs_ioend_wait(struct xfs_inode *); 48extern void xfs_ioend_wait(struct xfs_inode *);
47 49
48extern void xfs_count_page_state(struct page *, int *, int *, int *); 50extern void xfs_count_page_state(struct page *, int *, int *);
49 51
50#endif /* __XFS_AOPS_H__ */ 52#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 2ee3f7a6016..63fd2c07cb5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -39,7 +39,6 @@
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h" 40#include "xfs_log.h"
41#include "xfs_ag.h" 41#include "xfs_ag.h"
42#include "xfs_dmapi.h"
43#include "xfs_mount.h" 42#include "xfs_mount.h"
44#include "xfs_trace.h" 43#include "xfs_trace.h"
45 44
@@ -189,8 +188,8 @@ _xfs_buf_initialize(
189 atomic_set(&bp->b_hold, 1); 188 atomic_set(&bp->b_hold, 1);
190 init_completion(&bp->b_iowait); 189 init_completion(&bp->b_iowait);
191 INIT_LIST_HEAD(&bp->b_list); 190 INIT_LIST_HEAD(&bp->b_list);
192 INIT_LIST_HEAD(&bp->b_hash_list); 191 RB_CLEAR_NODE(&bp->b_rbnode);
193 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 192 sema_init(&bp->b_sema, 0); /* held, no waiters */
194 XB_SET_OWNER(bp); 193 XB_SET_OWNER(bp);
195 bp->b_target = target; 194 bp->b_target = target;
196 bp->b_file_offset = range_base; 195 bp->b_file_offset = range_base;
@@ -263,8 +262,6 @@ xfs_buf_free(
263{ 262{
264 trace_xfs_buf_free(bp, _RET_IP_); 263 trace_xfs_buf_free(bp, _RET_IP_);
265 264
266 ASSERT(list_empty(&bp->b_hash_list));
267
268 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
269 uint i; 266 uint i;
270 267
@@ -423,8 +420,10 @@ _xfs_buf_find(
423{ 420{
424 xfs_off_t range_base; 421 xfs_off_t range_base;
425 size_t range_length; 422 size_t range_length;
426 xfs_bufhash_t *hash; 423 struct xfs_perag *pag;
427 xfs_buf_t *bp, *n; 424 struct rb_node **rbp;
425 struct rb_node *parent;
426 xfs_buf_t *bp;
428 427
429 range_base = (ioff << BBSHIFT); 428 range_base = (ioff << BBSHIFT);
430 range_length = (isize << BBSHIFT); 429 range_length = (isize << BBSHIFT);
@@ -433,20 +432,38 @@ _xfs_buf_find(
433 ASSERT(!(range_length < (1 << btp->bt_sshift))); 432 ASSERT(!(range_length < (1 << btp->bt_sshift)));
434 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 433 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
435 434
436 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 435 /* get tree root */
437 436 pag = xfs_perag_get(btp->bt_mount,
438 spin_lock(&hash->bh_lock); 437 xfs_daddr_to_agno(btp->bt_mount, ioff));
439 438
440 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 439 /* walk tree */
441 ASSERT(btp == bp->b_target); 440 spin_lock(&pag->pag_buf_lock);
442 if (bp->b_file_offset == range_base && 441 rbp = &pag->pag_buf_tree.rb_node;
443 bp->b_buffer_length == range_length) { 442 parent = NULL;
443 bp = NULL;
444 while (*rbp) {
445 parent = *rbp;
446 bp = rb_entry(parent, struct xfs_buf, b_rbnode);
447
448 if (range_base < bp->b_file_offset)
449 rbp = &(*rbp)->rb_left;
450 else if (range_base > bp->b_file_offset)
451 rbp = &(*rbp)->rb_right;
452 else {
444 /* 453 /*
445 * If we look at something, bring it to the 454 * found a block offset match. If the range doesn't
446 * front of the list for next time. 455 * match, the only way this is allowed is if the buffer
456 * in the cache is stale and the transaction that made
457 * it stale has not yet committed. i.e. we are
458 * reallocating a busy extent. Skip this buffer and
459 * continue searching to the right for an exact match.
447 */ 460 */
461 if (bp->b_buffer_length != range_length) {
462 ASSERT(bp->b_flags & XBF_STALE);
463 rbp = &(*rbp)->rb_right;
464 continue;
465 }
448 atomic_inc(&bp->b_hold); 466 atomic_inc(&bp->b_hold);
449 list_move(&bp->b_hash_list, &hash->bh_list);
450 goto found; 467 goto found;
451 } 468 }
452 } 469 }
@@ -455,17 +472,21 @@ _xfs_buf_find(
455 if (new_bp) { 472 if (new_bp) {
456 _xfs_buf_initialize(new_bp, btp, range_base, 473 _xfs_buf_initialize(new_bp, btp, range_base,
457 range_length, flags); 474 range_length, flags);
458 new_bp->b_hash = hash; 475 rb_link_node(&new_bp->b_rbnode, parent, rbp);
459 list_add(&new_bp->b_hash_list, &hash->bh_list); 476 rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
477 /* the buffer keeps the perag reference until it is freed */
478 new_bp->b_pag = pag;
479 spin_unlock(&pag->pag_buf_lock);
460 } else { 480 } else {
461 XFS_STATS_INC(xb_miss_locked); 481 XFS_STATS_INC(xb_miss_locked);
482 spin_unlock(&pag->pag_buf_lock);
483 xfs_perag_put(pag);
462 } 484 }
463
464 spin_unlock(&hash->bh_lock);
465 return new_bp; 485 return new_bp;
466 486
467found: 487found:
468 spin_unlock(&hash->bh_lock); 488 spin_unlock(&pag->pag_buf_lock);
489 xfs_perag_put(pag);
469 490
470 /* Attempt to get the semaphore without sleeping, 491 /* Attempt to get the semaphore without sleeping,
471 * if this does not work then we need to drop the 492 * if this does not work then we need to drop the
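
The new lookup is an ordered walk keyed on block offset, with one subtlety called out in the comment above: a stale buffer at the same offset but a different length must be skipped to the right. A simplified plain-BST sketch of the walk (the kernel version uses rb_node, the per-AG lock, and hold counts):

#include <stddef.h>

struct buf {
	long long	offset;		/* lookup key */
	size_t		length;
	struct buf	*left, *right;
};

static struct buf *buf_find(struct buf *root, long long offset, size_t length)
{
	while (root) {
		if (offset < root->offset)
			root = root->left;
		else if (offset > root->offset)
			root = root->right;
		else if (root->length != length)
			/* stale buffer from a reallocated busy extent;
			 * an exact match can only lie to the right */
			root = root->right;
		else
			return root;
	}
	return NULL;	/* miss: caller links a new node in here */
}
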
@@ -579,9 +600,9 @@ _xfs_buf_read(
579 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 600 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
580 601
581 status = xfs_buf_iorequest(bp); 602 status = xfs_buf_iorequest(bp);
582 if (!status && !(flags & XBF_ASYNC)) 603 if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC))
583 status = xfs_buf_iowait(bp); 604 return status;
584 return status; 605 return xfs_buf_iowait(bp);
585} 606}
586 607
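
The reworked tail of _xfs_buf_read() inverts the old test so that submission errors, pre-existing buffer errors, and async requests all return early, and only a clean synchronous read blocks. The shape of the change as a sketch:

static int buf_read_tail(int status, int buf_error, int async,
			 int (*iowait)(void))
{
	/* only a clean, synchronous submission is worth waiting for */
	if (status || buf_error || async)
		return status;
	return iowait();
}
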
587xfs_buf_t * 608xfs_buf_t *
@@ -631,8 +652,7 @@ void
631xfs_buf_readahead( 652xfs_buf_readahead(
632 xfs_buftarg_t *target, 653 xfs_buftarg_t *target,
633 xfs_off_t ioff, 654 xfs_off_t ioff,
634 size_t isize, 655 size_t isize)
635 xfs_buf_flags_t flags)
636{ 656{
637 struct backing_dev_info *bdi; 657 struct backing_dev_info *bdi;
638 658
@@ -640,8 +660,42 @@ xfs_buf_readahead(
640 if (bdi_read_congested(bdi)) 660 if (bdi_read_congested(bdi))
641 return; 661 return;
642 662
643 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 663 xfs_buf_read(target, ioff, isize,
644 xfs_buf_read(target, ioff, isize, flags); 664 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
665}
666
667/*
668 * Read an uncached buffer from disk. Allocates and returns a locked
669 * buffer containing the disk contents, or NULL on failure.
670 */
671struct xfs_buf *
672xfs_buf_read_uncached(
673 struct xfs_mount *mp,
674 struct xfs_buftarg *target,
675 xfs_daddr_t daddr,
676 size_t length,
677 int flags)
678{
679 xfs_buf_t *bp;
680 int error;
681
682 bp = xfs_buf_get_uncached(target, length, flags);
683 if (!bp)
684 return NULL;
685
686 /* set up the buffer for a read IO */
687 xfs_buf_lock(bp);
688 XFS_BUF_SET_ADDR(bp, daddr);
689 XFS_BUF_READ(bp);
690 XFS_BUF_BUSY(bp);
691
692 xfsbdstrat(mp, bp);
693 error = xfs_buf_iowait(bp);
694 if (error || bp->b_error) {
695 xfs_buf_relse(bp);
696 return NULL;
697 }
698 return bp;
645} 699}
646 700
647xfs_buf_t * 701xfs_buf_t *
@@ -713,9 +767,10 @@ xfs_buf_associate_memory(
713} 767}
714 768
715xfs_buf_t * 769xfs_buf_t *
716xfs_buf_get_noaddr( 770xfs_buf_get_uncached(
771 struct xfs_buftarg *target,
717 size_t len, 772 size_t len,
718 xfs_buftarg_t *target) 773 int flags)
719{ 774{
720 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 775 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
721 int error, i; 776 int error, i;
@@ -731,7 +786,7 @@ xfs_buf_get_noaddr(
731 goto fail_free_buf; 786 goto fail_free_buf;
732 787
733 for (i = 0; i < page_count; i++) { 788 for (i = 0; i < page_count; i++) {
734 bp->b_pages[i] = alloc_page(GFP_KERNEL); 789 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
735 if (!bp->b_pages[i]) 790 if (!bp->b_pages[i])
736 goto fail_free_mem; 791 goto fail_free_mem;
737 } 792 }
@@ -746,7 +801,7 @@ xfs_buf_get_noaddr(
746 801
747 xfs_buf_unlock(bp); 802 xfs_buf_unlock(bp);
748 803
749 trace_xfs_buf_get_noaddr(bp, _RET_IP_); 804 trace_xfs_buf_get_uncached(bp, _RET_IP_);
750 return bp; 805 return bp;
751 806
752 fail_free_mem: 807 fail_free_mem:
@@ -780,29 +835,30 @@ void
780xfs_buf_rele( 835xfs_buf_rele(
781 xfs_buf_t *bp) 836 xfs_buf_t *bp)
782{ 837{
783 xfs_bufhash_t *hash = bp->b_hash; 838 struct xfs_perag *pag = bp->b_pag;
784 839
785 trace_xfs_buf_rele(bp, _RET_IP_); 840 trace_xfs_buf_rele(bp, _RET_IP_);
786 841
787 if (unlikely(!hash)) { 842 if (!pag) {
788 ASSERT(!bp->b_relse); 843 ASSERT(!bp->b_relse);
844 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
789 if (atomic_dec_and_test(&bp->b_hold)) 845 if (atomic_dec_and_test(&bp->b_hold))
790 xfs_buf_free(bp); 846 xfs_buf_free(bp);
791 return; 847 return;
792 } 848 }
793 849
850 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
794 ASSERT(atomic_read(&bp->b_hold) > 0); 851 ASSERT(atomic_read(&bp->b_hold) > 0);
795 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 852 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
796 if (bp->b_relse) { 853 if (bp->b_relse) {
797 atomic_inc(&bp->b_hold); 854 atomic_inc(&bp->b_hold);
798 spin_unlock(&hash->bh_lock); 855 spin_unlock(&pag->pag_buf_lock);
799 (*(bp->b_relse)) (bp); 856 bp->b_relse(bp);
800 } else if (bp->b_flags & XBF_FS_MANAGED) {
801 spin_unlock(&hash->bh_lock);
802 } else { 857 } else {
803 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 858 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
804 list_del_init(&bp->b_hash_list); 859 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
805 spin_unlock(&hash->bh_lock); 860 spin_unlock(&pag->pag_buf_lock);
861 xfs_perag_put(pag);
806 xfs_buf_free(bp); 862 xfs_buf_free(bp);
807 } 863 }
808 } 864 }
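
The release path leans on atomic_dec_and_lock(): drop the count lock-free unless it might reach zero, and only then take the per-AG lock so the rbtree erase cannot race with a concurrent lookup. A userspace emulation of that primitive with C11 atomics and pthreads:

#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>

/* Returns true with *lk held iff the count dropped to zero. */
static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lk)
{
	int old = atomic_load(cnt);

	/* fast path: not the last reference, no lock taken */
	while (old > 1) {
		if (atomic_compare_exchange_weak(cnt, &old, old - 1))
			return false;
	}
	/* slow path: may be the final put, serialise with lookups */
	pthread_mutex_lock(lk);
	if (atomic_fetch_sub(cnt, 1) == 1)
		return true;
	pthread_mutex_unlock(lk);
	return false;
}
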
@@ -865,7 +921,7 @@ xfs_buf_lock(
865 trace_xfs_buf_lock(bp, _RET_IP_); 921 trace_xfs_buf_lock(bp, _RET_IP_);
866 922
867 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 923 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
868 xfs_log_force(bp->b_mount, 0); 924 xfs_log_force(bp->b_target->bt_mount, 0);
869 if (atomic_read(&bp->b_io_remaining)) 925 if (atomic_read(&bp->b_io_remaining))
870 blk_run_address_space(bp->b_target->bt_mapping); 926 blk_run_address_space(bp->b_target->bt_mapping);
871 down(&bp->b_sema); 927 down(&bp->b_sema);
@@ -897,36 +953,6 @@ xfs_buf_unlock(
897 trace_xfs_buf_unlock(bp, _RET_IP_); 953 trace_xfs_buf_unlock(bp, _RET_IP_);
898} 954}
899 955
900
901/*
902 * Pinning Buffer Storage in Memory
903 * Ensure that no attempt to force a buffer to disk will succeed.
904 */
905void
906xfs_buf_pin(
907 xfs_buf_t *bp)
908{
909 trace_xfs_buf_pin(bp, _RET_IP_);
910 atomic_inc(&bp->b_pin_count);
911}
912
913void
914xfs_buf_unpin(
915 xfs_buf_t *bp)
916{
917 trace_xfs_buf_unpin(bp, _RET_IP_);
918
919 if (atomic_dec_and_test(&bp->b_pin_count))
920 wake_up_all(&bp->b_waiters);
921}
922
923int
924xfs_buf_ispin(
925 xfs_buf_t *bp)
926{
927 return atomic_read(&bp->b_pin_count);
928}
929
930STATIC void 956STATIC void
931xfs_buf_wait_unpin( 957xfs_buf_wait_unpin(
932 xfs_buf_t *bp) 958 xfs_buf_t *bp)
@@ -960,19 +986,7 @@ xfs_buf_iodone_work(
960 xfs_buf_t *bp = 986 xfs_buf_t *bp =
961 container_of(work, xfs_buf_t, b_iodone_work); 987 container_of(work, xfs_buf_t, b_iodone_work);
962 988
963 /* 989 if (bp->b_iodone)
964 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
965 * ordered flag and reissue them. Because we can't tell the higher
966 * layers directly that they should not issue ordered I/O anymore, they
967 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
968 */
969 if ((bp->b_error == EOPNOTSUPP) &&
970 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
971 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
972 bp->b_flags &= ~XBF_ORDERED;
973 bp->b_flags |= _XFS_BARRIER_FAILED;
974 xfs_buf_iorequest(bp);
975 } else if (bp->b_iodone)
976 (*(bp->b_iodone))(bp); 990 (*(bp->b_iodone))(bp);
977 else if (bp->b_flags & XBF_ASYNC) 991 else if (bp->b_flags & XBF_ASYNC)
978 xfs_buf_relse(bp); 992 xfs_buf_relse(bp);
@@ -1018,13 +1032,11 @@ xfs_bwrite(
1018{ 1032{
1019 int error; 1033 int error;
1020 1034
1021 bp->b_strat = xfs_bdstrat_cb;
1022 bp->b_mount = mp;
1023 bp->b_flags |= XBF_WRITE; 1035 bp->b_flags |= XBF_WRITE;
1024 bp->b_flags &= ~(XBF_ASYNC | XBF_READ); 1036 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1025 1037
1026 xfs_buf_delwri_dequeue(bp); 1038 xfs_buf_delwri_dequeue(bp);
1027 xfs_buf_iostrategy(bp); 1039 xfs_bdstrat_cb(bp);
1028 1040
1029 error = xfs_buf_iowait(bp); 1041 error = xfs_buf_iowait(bp);
1030 if (error) 1042 if (error)
@@ -1040,9 +1052,6 @@ xfs_bdwrite(
1040{ 1052{
1041 trace_xfs_buf_bdwrite(bp, _RET_IP_); 1053 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1042 1054
1043 bp->b_strat = xfs_bdstrat_cb;
1044 bp->b_mount = mp;
1045
1046 bp->b_flags &= ~XBF_READ; 1055 bp->b_flags &= ~XBF_READ;
1047 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); 1056 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1048 1057
@@ -1051,7 +1060,7 @@ xfs_bdwrite(
1051 1060
1052/* 1061/*
1053 * Called when we want to stop a buffer from getting written or read. 1062 * Called when we want to stop a buffer from getting written or read.
1054 * We attach the EIO error, muck with its flags, and call biodone 1063 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1055 * so that the proper iodone callbacks get called. 1064 * so that the proper iodone callbacks get called.
1056 */ 1065 */
1057STATIC int 1066STATIC int
@@ -1068,22 +1077,21 @@ xfs_bioerror(
1068 XFS_BUF_ERROR(bp, EIO); 1077 XFS_BUF_ERROR(bp, EIO);
1069 1078
1070 /* 1079 /*
1071 * We're calling biodone, so delete XBF_DONE flag. 1080 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1072 */ 1081 */
1073 XFS_BUF_UNREAD(bp); 1082 XFS_BUF_UNREAD(bp);
1074 XFS_BUF_UNDELAYWRITE(bp); 1083 XFS_BUF_UNDELAYWRITE(bp);
1075 XFS_BUF_UNDONE(bp); 1084 XFS_BUF_UNDONE(bp);
1076 XFS_BUF_STALE(bp); 1085 XFS_BUF_STALE(bp);
1077 1086
1078 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 1087 xfs_buf_ioend(bp, 0);
1079 xfs_biodone(bp);
1080 1088
1081 return EIO; 1089 return EIO;
1082} 1090}
1083 1091
1084/* 1092/*
1085 * Same as xfs_bioerror, except that we are releasing the buffer 1093 * Same as xfs_bioerror, except that we are releasing the buffer
1086 * here ourselves, and avoiding the biodone call. 1094 * here ourselves, and avoiding the xfs_buf_ioend call.
1087 * This is meant for userdata errors; metadata bufs come with 1095 * This is meant for userdata errors; metadata bufs come with
1088 * iodone functions attached, so that we can track down errors. 1096 * iodone functions attached, so that we can track down errors.
1089 */ 1097 */
@@ -1105,7 +1113,6 @@ xfs_bioerror_relse(
1105 XFS_BUF_DONE(bp); 1113 XFS_BUF_DONE(bp);
1106 XFS_BUF_STALE(bp); 1114 XFS_BUF_STALE(bp);
1107 XFS_BUF_CLR_IODONE_FUNC(bp); 1115 XFS_BUF_CLR_IODONE_FUNC(bp);
1108 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1109 if (!(fl & XBF_ASYNC)) { 1116 if (!(fl & XBF_ASYNC)) {
1110 /* 1117 /*
1111 * Mark b_error and B_ERROR _both_. 1118 * Mark b_error and B_ERROR _both_.
@@ -1133,7 +1140,7 @@ int
1133xfs_bdstrat_cb( 1140xfs_bdstrat_cb(
1134 struct xfs_buf *bp) 1141 struct xfs_buf *bp)
1135{ 1142{
1136 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { 1143 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1137 trace_xfs_bdstrat_shut(bp, _RET_IP_); 1144 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1138 /* 1145 /*
1139 * Metadata write that didn't get logged but 1146 * Metadata write that didn't get logged but
@@ -1235,7 +1242,7 @@ _xfs_buf_ioapply(
1235 1242
1236 if (bp->b_flags & XBF_ORDERED) { 1243 if (bp->b_flags & XBF_ORDERED) {
1237 ASSERT(!(bp->b_flags & XBF_READ)); 1244 ASSERT(!(bp->b_flags & XBF_READ));
1238 rw = WRITE_BARRIER; 1245 rw = WRITE_FLUSH_FUA;
1239 } else if (bp->b_flags & XBF_LOG_BUFFER) { 1246 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1240 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1247 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1241 bp->b_flags &= ~_XBF_RUN_QUEUES; 1248 bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1311,8 +1318,19 @@ submit_io:
1311 if (size) 1318 if (size)
1312 goto next_chunk; 1319 goto next_chunk;
1313 } else { 1320 } else {
1314 bio_put(bio); 1321 /*
1322 * If we get here, no pages were added to the bio. However,
1323 * we can't just error out here - if the pages are locked then
1324 * we have to unlock them, otherwise we can hang on a later
1325 * access to the page.
1326 */
1315 xfs_buf_ioerror(bp, EIO); 1327 xfs_buf_ioerror(bp, EIO);
1328 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1329 int i;
1330 for (i = 0; i < bp->b_page_count; i++)
1331 unlock_page(bp->b_pages[i]);
1332 }
1333 bio_put(bio);
1316 } 1334 }
1317} 1335}
1318 1336
@@ -1428,63 +1446,24 @@ xfs_buf_iomove(
1428 */ 1446 */
1429void 1447void
1430xfs_wait_buftarg( 1448xfs_wait_buftarg(
1431 xfs_buftarg_t *btp) 1449 struct xfs_buftarg *btp)
1432{
1433 xfs_buf_t *bp, *n;
1434 xfs_bufhash_t *hash;
1435 uint i;
1436
1437 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1438 hash = &btp->bt_hash[i];
1439again:
1440 spin_lock(&hash->bh_lock);
1441 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
1442 ASSERT(btp == bp->b_target);
1443 if (!(bp->b_flags & XBF_FS_MANAGED)) {
1444 spin_unlock(&hash->bh_lock);
1445 /*
1446 * Catch superblock reference count leaks
1447 * immediately
1448 */
1449 BUG_ON(bp->b_bn == 0);
1450 delay(100);
1451 goto again;
1452 }
1453 }
1454 spin_unlock(&hash->bh_lock);
1455 }
1456}
1457
1458/*
1459 * Allocate buffer hash table for a given target.
1460 * For devices containing metadata (i.e. not the log/realtime devices)
1461 * we need to allocate a much larger hash table.
1462 */
1463STATIC void
1464xfs_alloc_bufhash(
1465 xfs_buftarg_t *btp,
1466 int external)
1467{ 1450{
1468 unsigned int i; 1451 struct xfs_perag *pag;
1452 uint i;
1469 1453
1470 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1454 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
1471 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1455 pag = xfs_perag_get(btp->bt_mount, i);
1472 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1456 spin_lock(&pag->pag_buf_lock);
1473 sizeof(xfs_bufhash_t)); 1457 while (rb_first(&pag->pag_buf_tree)) {
1474 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1458 spin_unlock(&pag->pag_buf_lock);
1475 spin_lock_init(&btp->bt_hash[i].bh_lock); 1459 delay(100);
1476 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1460 spin_lock(&pag->pag_buf_lock);
1461 }
1462 spin_unlock(&pag->pag_buf_lock);
1463 xfs_perag_put(pag);
1477 } 1464 }
1478} 1465}
1479 1466
1480STATIC void
1481xfs_free_bufhash(
1482 xfs_buftarg_t *btp)
1483{
1484 kmem_free_large(btp->bt_hash);
1485 btp->bt_hash = NULL;
1486}
1487
1488/* 1467/*
1489 * buftarg list for delwrite queue processing 1468 * buftarg list for delwrite queue processing
1490 */ 1469 */
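
The rewritten xfs_wait_buftarg() above simply polls each per-AG tree until it drains, dropping the lock between samples so releases can make progress. A compact pthread analogue of the drain loop:

#include <pthread.h>
#include <unistd.h>

struct node;			/* opaque tree node */

static void wait_drained(pthread_mutex_t *lk, struct node *const *root)
{
	pthread_mutex_lock(lk);
	while (*root) {		/* buffers still cached in this tree */
		pthread_mutex_unlock(lk);
		usleep(100000);	/* delay(100) stand-in */
		pthread_mutex_lock(lk);
	}
	pthread_mutex_unlock(lk);
}
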
@@ -1517,7 +1496,6 @@ xfs_free_buftarg(
1517 xfs_flush_buftarg(btp, 1); 1496 xfs_flush_buftarg(btp, 1);
1518 if (mp->m_flags & XFS_MOUNT_BARRIER) 1497 if (mp->m_flags & XFS_MOUNT_BARRIER)
1519 xfs_blkdev_issue_flush(btp); 1498 xfs_blkdev_issue_flush(btp);
1520 xfs_free_bufhash(btp);
1521 iput(btp->bt_mapping->host); 1499 iput(btp->bt_mapping->host);
1522 1500
1523 /* Unregister the buftarg first so that we don't get a 1501 /* Unregister the buftarg first so that we don't get a
@@ -1602,6 +1580,7 @@ xfs_mapping_buftarg(
1602 XFS_BUFTARG_NAME(btp)); 1580 XFS_BUFTARG_NAME(btp));
1603 return ENOMEM; 1581 return ENOMEM;
1604 } 1582 }
1583 inode->i_ino = get_next_ino();
1605 inode->i_mode = S_IFBLK; 1584 inode->i_mode = S_IFBLK;
1606 inode->i_bdev = bdev; 1585 inode->i_bdev = bdev;
1607 inode->i_rdev = bdev->bd_dev; 1586 inode->i_rdev = bdev->bd_dev;
@@ -1639,6 +1618,7 @@ out_error:
1639 1618
1640xfs_buftarg_t * 1619xfs_buftarg_t *
1641xfs_alloc_buftarg( 1620xfs_alloc_buftarg(
1621 struct xfs_mount *mp,
1642 struct block_device *bdev, 1622 struct block_device *bdev,
1643 int external, 1623 int external,
1644 const char *fsname) 1624 const char *fsname)
@@ -1647,6 +1627,7 @@ xfs_alloc_buftarg(
1647 1627
1648 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1628 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1649 1629
1630 btp->bt_mount = mp;
1650 btp->bt_dev = bdev->bd_dev; 1631 btp->bt_dev = bdev->bd_dev;
1651 btp->bt_bdev = bdev; 1632 btp->bt_bdev = bdev;
1652 if (xfs_setsize_buftarg_early(btp, bdev)) 1633 if (xfs_setsize_buftarg_early(btp, bdev))
@@ -1655,7 +1636,6 @@ xfs_alloc_buftarg(
1655 goto error; 1636 goto error;
1656 if (xfs_alloc_delwrite_queue(btp, fsname)) 1637 if (xfs_alloc_delwrite_queue(btp, fsname))
1657 goto error; 1638 goto error;
1658 xfs_alloc_bufhash(btp, external);
1659 return btp; 1639 return btp;
1660 1640
1661error: 1641error:
@@ -1804,7 +1784,7 @@ xfs_buf_delwri_split(
1804 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1784 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1805 ASSERT(bp->b_flags & XBF_DELWRI); 1785 ASSERT(bp->b_flags & XBF_DELWRI);
1806 1786
1807 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { 1787 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
1808 if (!force && 1788 if (!force &&
1809 time_before(jiffies, bp->b_queuetime + age)) { 1789 time_before(jiffies, bp->b_queuetime + age)) {
1810 xfs_buf_unlock(bp); 1790 xfs_buf_unlock(bp);
@@ -1889,7 +1869,7 @@ xfsbufd(
1889 struct xfs_buf *bp; 1869 struct xfs_buf *bp;
1890 bp = list_first_entry(&tmp, struct xfs_buf, b_list); 1870 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1891 list_del_init(&bp->b_list); 1871 list_del_init(&bp->b_list);
1892 xfs_buf_iostrategy(bp); 1872 xfs_bdstrat_cb(bp);
1893 count++; 1873 count++;
1894 } 1874 }
1895 if (count) 1875 if (count)
@@ -1936,7 +1916,7 @@ xfs_flush_buftarg(
1936 bp->b_flags &= ~XBF_ASYNC; 1916 bp->b_flags &= ~XBF_ASYNC;
1937 list_add(&bp->b_list, &wait_list); 1917 list_add(&bp->b_list, &wait_list);
1938 } 1918 }
1939 xfs_buf_iostrategy(bp); 1919 xfs_bdstrat_cb(bp);
1940 } 1920 }
1941 1921
1942 if (wait) { 1922 if (wait) {
@@ -1946,7 +1926,7 @@ xfs_flush_buftarg(
1946 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1926 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1947 1927
1948 list_del_init(&bp->b_list); 1928 list_del_init(&bp->b_list);
1949 xfs_iowait(bp); 1929 xfs_buf_iowait(bp);
1950 xfs_buf_relse(bp); 1930 xfs_buf_relse(bp);
1951 } 1931 }
1952 } 1932 }
@@ -1962,7 +1942,8 @@ xfs_buf_init(void)
1962 if (!xfs_buf_zone) 1942 if (!xfs_buf_zone)
1963 goto out; 1943 goto out;
1964 1944
1965 xfslogd_workqueue = create_workqueue("xfslogd"); 1945 xfslogd_workqueue = alloc_workqueue("xfslogd",
1946 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1966 if (!xfslogd_workqueue) 1947 if (!xfslogd_workqueue)
1967 goto out_free_buf_zone; 1948 goto out_free_buf_zone;
1968 1949
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 5fbecefa5df..383a3f37cf9 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -44,57 +44,48 @@ typedef enum {
44 XBRW_ZERO = 3, /* Zero target memory */ 44 XBRW_ZERO = 3, /* Zero target memory */
45} xfs_buf_rw_t; 45} xfs_buf_rw_t;
46 46
47typedef enum { 47#define XBF_READ (1 << 0) /* buffer intended for reading from device */
48 XBF_READ = (1 << 0), /* buffer intended for reading from device */ 48#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
49 XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ 49#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */
50 XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ 50#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
51 XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52 XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ 52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
53 XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ 53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54 XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ 54#define XBF_ORDERED (1 << 11)/* use ordered writes */
55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ 55#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
56 XBF_ORDERED = (1 << 11), /* use ordered writes */ 56#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ 57
58 XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */ 58/* flags used only as arguments to access routines */
59 59#define XBF_LOCK (1 << 14)/* lock requested */
60 /* flags used only as arguments to access routines */ 60#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */
61 XBF_LOCK = (1 << 14), /* lock requested */ 61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
62 XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ 62
63 XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ 63/* flags used only internally */
64 64#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
65 /* flags used only internally */ 65#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
66 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ 66#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
67 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ 67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
68 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
69 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
70 68
71 /* 69/*
72 * Special flag for supporting metadata blocks smaller than a FSB. 70 * Special flag for supporting metadata blocks smaller than a FSB.
73 * 71 *
74 * In this case we can have multiple xfs_buf_t on a single page and 72 * In this case we can have multiple xfs_buf_t on a single page and
75 * need to lock out concurrent xfs_buf_t readers as they only 73 * need to lock out concurrent xfs_buf_t readers as they only
76 * serialise access to the buffer. 74 * serialise access to the buffer.
77 * 75 *
78 * In the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation 76 * In the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
79 * between reads of the page. Hence we can have one thread read the 77 * between reads of the page. Hence we can have one thread read the
80 * page and modify it, but then race with another thread that thinks 78 * page and modify it, but then race with another thread that thinks
81 * the page is not up-to-date and hence reads it again. 79 * the page is not up-to-date and hence reads it again.
82 * 80 *
83 * The result is that the first modification to the page is lost. 81 * The result is that the first modification to the page is lost.
84 * This sort of AGF/AGI reading race can happen when unlinking inodes 82 * This sort of AGF/AGI reading race can happen when unlinking inodes
85 * that require truncation and results in the AGI unlinked list 83 * that require truncation and results in the AGI unlinked list
86 * modifications being lost. 84 * modifications being lost.
87 */ 85 */
88 _XBF_PAGE_LOCKED = (1 << 22), 86#define _XBF_PAGE_LOCKED (1 << 22)
89 87
90 /* 88typedef unsigned int xfs_buf_flags_t;
91 * If we try a barrier write, but it fails we have to communicate
92 * this to the upper layers. Unfortunately b_error gets overwritten
93 * when the buffer is re-issued so we have to add another flag to
94 * keep this information.
95 */
96 _XFS_BARRIER_FAILED = (1 << 23),
97} xfs_buf_flags_t;
98 89
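
One likely motivation for moving the flags from an enum to #defines over a plain unsigned typedef: the bitwise OR of two enumerators is an int that is not itself a declared enumeration value, which some static checkers flag when it is stored back into the enum type. A two-line illustration:

enum old_flags { OF_READ = 1 << 0, OF_WRITE = 1 << 1 };

typedef unsigned int new_flags_t;
#define NF_READ		(1u << 0)
#define NF_WRITE	(1u << 1)

enum old_flags e = OF_READ | OF_WRITE;	/* 3 is no enumerator: checkers warn */
new_flags_t    f = NF_READ | NF_WRITE;	/* plain integer maths, no complaint */
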
99#define XFS_BUF_FLAGS \ 90#define XFS_BUF_FLAGS \
100 { XBF_READ, "READ" }, \ 91 { XBF_READ, "READ" }, \
@@ -104,7 +95,6 @@ typedef enum {
104 { XBF_DONE, "DONE" }, \ 95 { XBF_DONE, "DONE" }, \
105 { XBF_DELWRI, "DELWRI" }, \ 96 { XBF_DELWRI, "DELWRI" }, \
106 { XBF_STALE, "STALE" }, \ 97 { XBF_STALE, "STALE" }, \
107 { XBF_FS_MANAGED, "FS_MANAGED" }, \
108 { XBF_ORDERED, "ORDERED" }, \ 98 { XBF_ORDERED, "ORDERED" }, \
109 { XBF_READ_AHEAD, "READ_AHEAD" }, \ 99 { XBF_READ_AHEAD, "READ_AHEAD" }, \
110 { XBF_LOCK, "LOCK" }, /* should never be set */\ 100 { XBF_LOCK, "LOCK" }, /* should never be set */\
@@ -114,8 +104,7 @@ typedef enum {
114 { _XBF_PAGES, "PAGES" }, \ 104 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ 107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119 108
120 109
121typedef enum { 110typedef enum {
@@ -132,15 +121,11 @@ typedef struct xfs_buftarg {
132 dev_t bt_dev; 121 dev_t bt_dev;
133 struct block_device *bt_bdev; 122 struct block_device *bt_bdev;
134 struct address_space *bt_mapping; 123 struct address_space *bt_mapping;
124 struct xfs_mount *bt_mount;
135 unsigned int bt_bsize; 125 unsigned int bt_bsize;
136 unsigned int bt_sshift; 126 unsigned int bt_sshift;
137 size_t bt_smask; 127 size_t bt_smask;
138 128
139 /* per device buffer hash table */
140 uint bt_hashmask;
141 uint bt_hashshift;
142 xfs_bufhash_t *bt_hash;
143
144 /* per device delwri queue */ 129 /* per device delwri queue */
145 struct task_struct *bt_task; 130 struct task_struct *bt_task;
146 struct list_head bt_list; 131 struct list_head bt_list;
@@ -168,35 +153,41 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
168#define XB_PAGES 2 153#define XB_PAGES 2
169 154
170typedef struct xfs_buf { 155typedef struct xfs_buf {
156 /*
157 * first cacheline holds all the fields needed for an uncontended cache
158 * hit to be fully processed. The semaphore straddles the cacheline
159 * boundary, but the counter and lock sit on the first cacheline,
160 * which is the only bit that is touched if we hit the semaphore
161 * fast-path on locking.
162 */
163 struct rb_node b_rbnode; /* rbtree node */
164 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */
167 xfs_buf_flags_t b_flags; /* status flags */
171 struct semaphore b_sema; /* semaphore for lockables */ 168 struct semaphore b_sema; /* semaphore for lockables */
172 unsigned long b_queuetime; /* time buffer was queued */ 169
173 atomic_t b_pin_count; /* pin count */
174 wait_queue_head_t b_waiters; /* unpin waiters */ 170 wait_queue_head_t b_waiters; /* unpin waiters */
175 struct list_head b_list; 171 struct list_head b_list;
176 xfs_buf_flags_t b_flags; /* status flags */ 172 struct xfs_perag *b_pag; /* contains rbtree root */
177 struct list_head b_hash_list; /* hash table list */
178 xfs_bufhash_t *b_hash; /* hash table list start */
179 xfs_buftarg_t *b_target; /* buffer target (device) */ 173 xfs_buftarg_t *b_target; /* buffer target (device) */
180 atomic_t b_hold; /* reference count */
181 xfs_daddr_t b_bn; /* block number for I/O */ 174 xfs_daddr_t b_bn; /* block number for I/O */
182 xfs_off_t b_file_offset; /* offset in file */
183 size_t b_buffer_length;/* size of buffer in bytes */
184 size_t b_count_desired;/* desired transfer size */ 175 size_t b_count_desired;/* desired transfer size */
185 void *b_addr; /* virtual address of buffer */ 176 void *b_addr; /* virtual address of buffer */
186 struct work_struct b_iodone_work; 177 struct work_struct b_iodone_work;
187 atomic_t b_io_remaining; /* #outstanding I/O requests */
188 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 178 xfs_buf_iodone_t b_iodone; /* I/O completion function */
189 xfs_buf_relse_t b_relse; /* releasing function */ 179 xfs_buf_relse_t b_relse; /* releasing function */
190 xfs_buf_bdstrat_t b_strat; /* pre-write function */
191 struct completion b_iowait; /* queue for I/O waiters */ 180 struct completion b_iowait; /* queue for I/O waiters */
192 void *b_fspriv; 181 void *b_fspriv;
193 void *b_fspriv2; 182 void *b_fspriv2;
194 struct xfs_mount *b_mount;
195 unsigned short b_error; /* error code on I/O */
196 unsigned int b_page_count; /* size of page array */
197 unsigned int b_offset; /* page offset in first page */
198 struct page **b_pages; /* array of page pointers */ 183 struct page **b_pages; /* array of page pointers */
199 struct page *b_page_array[XB_PAGES]; /* inline pages */ 184 struct page *b_page_array[XB_PAGES]; /* inline pages */
185 unsigned long b_queuetime; /* time buffer was queued */
186 atomic_t b_pin_count; /* pin count */
187 atomic_t b_io_remaining; /* #outstanding I/O requests */
188 unsigned int b_page_count; /* size of page array */
189 unsigned int b_offset; /* page offset in first page */
190 unsigned short b_error; /* error code on I/O */
200#ifdef XFS_BUF_LOCK_TRACKING 191#ifdef XFS_BUF_LOCK_TRACKING
201 int b_last_holder; 192 int b_last_holder;
202#endif 193#endif
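
The comment at the top of the reordered struct states the intent: everything an uncontended cache hit touches should live in the first cacheline. That kind of layout intent can be pinned down at compile time; a hypothetical example (the field names and the 64-byte line size are assumptions):

#include <stddef.h>
#include <assert.h>

struct hot_cold {
	/* hot: lookup key, length, refcount, flags */
	long long	key;
	size_t		length;
	int		hold;
	unsigned int	flags;
	/* cold: completion state, page arrays, error code, ... */
	char		cold[128];
};

/* offsetof() is an integer constant expression, so this costs nothing */
static_assert(offsetof(struct hot_cold, cold) <= 64,
	      "hot fields must fit in one cache line");
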
@@ -215,11 +206,13 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
215 xfs_buf_flags_t); 206 xfs_buf_flags_t);
216 207
217extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 208extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
218extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 209extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
219extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 210extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
220extern void xfs_buf_hold(xfs_buf_t *); 211extern void xfs_buf_hold(xfs_buf_t *);
221extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, 212extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
222 xfs_buf_flags_t); 213struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
214 struct xfs_buftarg *target,
215 xfs_daddr_t daddr, size_t length, int flags);
223 216
224/* Releasing Buffers */ 217/* Releasing Buffers */
225extern void xfs_buf_free(xfs_buf_t *); 218extern void xfs_buf_free(xfs_buf_t *);
@@ -244,11 +237,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
244extern int xfs_buf_iowait(xfs_buf_t *); 237extern int xfs_buf_iowait(xfs_buf_t *);
245extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 238extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
246 xfs_buf_rw_t); 239 xfs_buf_rw_t);
247 240#define xfs_buf_zero(bp, off, len) \
248static inline int xfs_buf_iostrategy(xfs_buf_t *bp) 241 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
249{
250 return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
251}
252 242
253static inline int xfs_buf_geterror(xfs_buf_t *bp) 243static inline int xfs_buf_geterror(xfs_buf_t *bp)
254{ 244{
@@ -258,11 +248,6 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
258/* Buffer Utility Routines */ 248/* Buffer Utility Routines */
259extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 249extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
260 250
261/* Pinning Buffer Storage in Memory */
262extern void xfs_buf_pin(xfs_buf_t *);
263extern void xfs_buf_unpin(xfs_buf_t *);
264extern int xfs_buf_ispin(xfs_buf_t *);
265
266/* Delayed Write Buffer Routines */ 251/* Delayed Write Buffer Routines */
267extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 252extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
268extern void xfs_buf_delwri_promote(xfs_buf_t *); 253extern void xfs_buf_delwri_promote(xfs_buf_t *);
@@ -288,8 +273,6 @@ extern void xfs_buf_terminate(void);
288 XFS_BUF_DONE(bp); \ 273 XFS_BUF_DONE(bp); \
289 } while (0) 274 } while (0)
290 275
291#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
292
293#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 276#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
294#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 277#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
295#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 278#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
@@ -326,8 +309,6 @@ extern void xfs_buf_terminate(void);
326#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) 309#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone)
327#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) 310#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func))
328#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) 311#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL)
329#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func))
330#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL)
331 312
332#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) 313#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv)
333#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) 314#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
@@ -351,7 +332,7 @@ extern void xfs_buf_terminate(void);
351#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 332#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
352#define XFS_BUF_SET_REF(bp, ref) do { } while (0) 333#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
353 334
354#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) 335#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
355 336
356#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) 337#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp)
357#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) 338#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
@@ -370,27 +351,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
370 xfs_buf_rele(bp); 351 xfs_buf_rele(bp);
371} 352}
372 353
373#define xfs_bpin(bp) xfs_buf_pin(bp)
374#define xfs_bunpin(bp) xfs_buf_unpin(bp)
375#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
376
377#define xfs_biomove(bp, off, len, data, rw) \
378 xfs_buf_iomove((bp), (off), (len), (data), \
379 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
380
381#define xfs_biozero(bp, off, len) \
382 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
383
384#define xfs_iowait(bp) xfs_buf_iowait(bp)
385
386#define xfs_baread(target, rablkno, ralen) \
387 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
388
389
390/* 354/*
391 * Handling of buftargs. 355 * Handling of buftargs.
392 */ 356 */
393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); 357extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
358 struct block_device *, int, const char *);
394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 359extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 360extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 361extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b609..00000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CRED_H__
19#define __XFS_CRED_H__
20
21#include <linux/capability.h>
22
23/*
24 * Credentials
25 */
26typedef const struct cred cred_t;
27
28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_dmapi_priv.h b/fs/xfs/linux-2.6/xfs_dmapi_priv.h
deleted file mode 100644
index a8b0b1685ee..00000000000
--- a/fs/xfs/linux-2.6/xfs_dmapi_priv.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DMAPI_PRIV_H__
19#define __XFS_DMAPI_PRIV_H__
20
21/*
22 * Based on IO_ISDIRECT, decide which i_ flag is set.
23 */
24#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
25 DM_FLAGS_IMUX : 0)
26#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
27
28#endif /*__XFS_DMAPI_PRIV_H__*/
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index e7839ee49e4..3764d74790e 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -23,13 +23,13 @@
23#include "xfs_sb.h" 23#include "xfs_sb.h"
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dir2.h" 25#include "xfs_dir2.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_export.h" 27#include "xfs_export.h"
29#include "xfs_vnodeops.h" 28#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 30#include "xfs_inode.h"
32#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_trace.h"
33 33
34/* 34/*
35 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
@@ -132,8 +132,7 @@ xfs_nfs_get_inode(
132 * fine and not an indication of a corrupted filesystem as clients can 132 * fine and not an indication of a corrupted filesystem as clients can
133 * send invalid file handles and we have to handle it gracefully.. 133 * send invalid file handles and we have to handle it gracefully..
134 */ 134 */
135 error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 135 error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
136 XFS_ILOCK_SHARED, &ip);
137 if (error) { 136 if (error) {
138 /* 137 /*
139 * EINVAL means the inode cluster doesn't exist anymore. 138 * EINVAL means the inode cluster doesn't exist anymore.
@@ -148,11 +147,10 @@ xfs_nfs_get_inode(
148 } 147 }
149 148
150 if (ip->i_d.di_gen != generation) { 149 if (ip->i_d.di_gen != generation) {
151 xfs_iput_new(ip, XFS_ILOCK_SHARED); 150 IRELE(ip);
152 return ERR_PTR(-ENOENT); 151 return ERR_PTR(-ENOENT);
153 } 152 }
154 153
155 xfs_iunlock(ip, XFS_ILOCK_SHARED);
156 return VFS_I(ip); 154 return VFS_I(ip);
157} 155}
158 156
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 257a56b127c..ba8ad422a16 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -22,23 +22,15 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_trans.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
 #include "xfs_ioctl.h"
@@ -108,7 +100,7 @@ xfs_file_fsync(
 	int			error = 0;
 	int			log_flushed = 0;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_file_fsync(ip);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -XFS_ERROR(EIO);
@@ -166,8 +158,7 @@ xfs_file_fsync(
 	 * transaction. So we play it safe and fire off the
 	 * transaction anyway.
 	 */
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	xfs_trans_set_sync(tp);
 	error = _xfs_trans_commit(tp, 0, &log_flushed);
@@ -275,20 +266,6 @@ xfs_file_aio_read(
 	mutex_lock(&inode->i_mutex);
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
-		int iolock = XFS_IOLOCK_SHARED;
-
-		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
-					dmflags, &iolock);
-		if (ret) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			if (unlikely(ioflags & IO_ISDIRECT))
-				mutex_unlock(&inode->i_mutex);
-			return ret;
-		}
-	}
-
 	if (unlikely(ioflags & IO_ISDIRECT)) {
 		if (inode->i_mapping->nrpages) {
 			ret = -xfs_flushinval_pages(ip,
@@ -321,7 +298,6 @@ xfs_file_splice_read(
 	unsigned int		flags)
 {
 	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
-	struct xfs_mount	*mp = ip->i_mount;
 	int			ioflags = 0;
 	ssize_t			ret;
 
@@ -335,18 +311,6 @@ xfs_file_splice_read(
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		int iolock = XFS_IOLOCK_SHARED;
-		int error;
-
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
-					FILP_DELAY_FLAG(infilp), &iolock);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return -error;
-		}
-	}
-
 	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
 	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
@@ -367,7 +331,6 @@ xfs_file_splice_write(
 {
 	struct inode		*inode = outfilp->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fsize_t		isize, new_size;
 	int			ioflags = 0;
 	ssize_t			ret;
@@ -382,18 +345,6 @@ xfs_file_splice_write(
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
-		int iolock = XFS_IOLOCK_EXCL;
-		int error;
-
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
-					FILP_DELAY_FLAG(outfilp), &iolock);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-			return -error;
-		}
-	}
-
 	new_size = *ppos + count;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -463,7 +414,7 @@ xfs_zero_last_block(
 	last_fsb = XFS_B_TO_FSBT(mp, isize);
 	nimaps = 1;
 	error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
-			  &nimaps, NULL, NULL);
+			  &nimaps, NULL);
 	if (error) {
 		return error;
 	}
@@ -558,7 +509,7 @@ xfs_zero_eof(
 	nimaps = 1;
 	zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
 	error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
-			  0, NULL, 0, &imap, &nimaps, NULL, NULL);
+			  0, NULL, 0, &imap, &nimaps, NULL);
 	if (error) {
 		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
 		return error;
@@ -627,7 +578,6 @@ xfs_file_aio_write(
 	int			ioflags = 0;
 	xfs_fsize_t		isize, new_size;
 	int			iolock;
-	int			eventsent = 0;
 	size_t			ocount = 0, count;
 	int			need_i_mutex;
 
@@ -673,33 +623,6 @@ start:
 		goto out_unlock_mutex;
 	}
 
-	if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
-	    !(ioflags & IO_INVIS) && !eventsent)) {
-		int		dmflags = FILP_DELAY_FLAG(file);
-
-		if (need_i_mutex)
-			dmflags |= DM_FLAGS_IMUX;
-
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
-				      pos, count, dmflags, &iolock);
-		if (error) {
-			goto out_unlock_internal;
-		}
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		eventsent = 1;
-
-		/*
-		 * The iolock was dropped and reacquired in XFS_SEND_DATA
-		 * so we have to recheck the size when appending.
-		 * We will only "goto start;" once, since having sent the
-		 * event prevents another call to XFS_SEND_DATA, which is
-		 * what allows the size to change in the first place.
-		 */
-		if ((file->f_flags & O_APPEND) && pos != ip->i_size)
-			goto start;
-	}
-
 	if (ioflags & IO_ISDIRECT) {
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
@@ -830,22 +753,6 @@ write_retry:
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
-	if (ret == -ENOSPC &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
-		xfs_iunlock(ip, iolock);
-		if (need_i_mutex)
-			mutex_unlock(&inode->i_mutex);
-		error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
-				DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
-				0, 0, 0); /* Delay flag intentionally unused */
-		if (need_i_mutex)
-			mutex_lock(&inode->i_mutex);
-		xfs_ilock(ip, iolock);
-		if (error)
-			goto out_unlock_internal;
-		goto start;
-	}
-
 	error = -ret;
 	if (ret <= 0)
 		goto out_unlock_internal;
@@ -1014,9 +921,6 @@ const struct file_operations xfs_file_operations = {
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
-#ifdef HAVE_FOP_OPEN_EXEC
-	.open_exec	= xfs_file_open_exec,
-#endif
 };
 
 const struct file_operations xfs_dir_file_operations = {
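Several hunks in this series replace the xfs_trans_ijoin()/xfs_trans_ihold() pair with a single two-argument xfs_trans_ijoin() call. A sketch of the idiom before and after, as it appears in the fsync path above; the two-argument form leaves lock and reference management with the caller, which is what the removed xfs_trans_ihold() used to arrange explicitly:

	/* old: join the locked inode, then take an extra transaction hold */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);

	/* new: one call; the caller keeps its lock and reference across commit */
	xfs_trans_ijoin(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);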
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index b6918d76bc7..ed88ed16811 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -21,10 +21,6 @@
 #include "xfs_inode.h"
 #include "xfs_trace.h"
 
-int  fs_noerr(void) { return 0; }
-int  fs_nosys(void) { return ENOSYS; }
-void fs_noval(void) { return; }
-
 /*
  * note: all filemap functions return negative error codes. These
  * need to be inverted before returning to the xfs core functions.
@@ -36,10 +32,9 @@ xfs_tosspages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-
-	if (mapping->nrpages)
-		truncate_inode_pages(mapping, first);
+	/* can't toss partial tail pages, so mask them out */
+	last &= ~(PAGE_SIZE - 1);
+	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
 }
 
 int
@@ -54,12 +49,11 @@ xfs_flushinval_pages(
 
 	trace_xfs_pagecache_inval(ip, first, last);
 
-	if (mapping->nrpages) {
-		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_write_and_wait(mapping);
-		if (!ret)
-			truncate_inode_pages(mapping, first);
-	}
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	ret = filemap_write_and_wait_range(mapping, first,
+			last == -1 ? LLONG_MAX : last);
+	if (!ret)
+		truncate_inode_pages_range(mapping, first, last);
 	return -ret;
 }
 
@@ -75,10 +69,9 @@ xfs_flush_pages(
 	int		ret = 0;
 	int		ret2;
 
-	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = -filemap_fdatawrite(mapping);
-	}
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	ret = -filemap_fdatawrite_range(mapping, first,
+			last == -1 ? LLONG_MAX : last);
 	if (flags & XBF_ASYNC)
 		return ret;
 	ret2 = xfs_wait_on_pages(ip, first, last);
@@ -95,7 +88,9 @@ xfs_wait_on_pages(
 {
 	struct address_space *mapping = VFS_I(ip)->i_mapping;
 
-	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
-		return -filemap_fdatawait(mapping);
+	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+		return -filemap_fdatawait_range(mapping, first,
+				last == -1 ? ip->i_size - 1 : last);
+	}
 	return 0;
 }
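The new xfs_tosspages() rounds the end offset down to a page boundary before truncating, because a partial tail page cannot be tossed. A worked example of the masking, assuming 4 KiB pages:

	/* PAGE_SIZE = 4096, so the mask is ~0xfff */
	xfs_off_t last = 10000;		/* byte 10000 sits in page index 2 */
	last &= ~(PAGE_SIZE - 1);	/* 10000 -> 8192 */
	/* range [first, 8191] covers whole pages only; the partial page
	 * holding bytes 8192..12287 stays in the cache */
	truncate_inode_pages_range(mapping, first, last - 1);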
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
deleted file mode 100644
index 82bb19b2599..00000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_FS_SUBR_H__
-#define __XFS_FS_SUBR_H__
-
-extern int  fs_noerr(void);
-extern int  fs_nosys(void);
-extern void fs_noval(void);
-
-#endif	/* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02..76e81cff70b 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_cred.h"
 #include "xfs_sysctl.h"
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061..00000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_GLOBALS_H__
-#define __XFS_GLOBALS_H__
-
-extern uint64_t xfs_panic_mask;		/* set to cause more panics */
-
-#endif	/* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index e59a8106283..2ea238f6d38 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -23,24 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ioctl.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
 #include "xfs_buf_item.h"
@@ -794,10 +785,12 @@ xfs_ioc_fsgetxattr(
 {
 	struct fsxattr		fa;
 
+	memset(&fa, 0, sizeof(struct fsxattr));
+
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	fa.fsx_xflags = xfs_ip2xflags(ip);
 	fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
-	fa.fsx_projid = ip->i_d.di_projid;
+	fa.fsx_projid = xfs_get_projid(ip);
 
 	if (attr) {
 		if (ip->i_afp) {
@@ -908,7 +901,7 @@ xfs_ioctl_setattr(
 	struct xfs_dquot	*olddquot = NULL;
 	int			code;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_ioctl_setattr(ip);
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return XFS_ERROR(EROFS);
@@ -916,6 +909,13 @@ xfs_ioctl_setattr(
 		return XFS_ERROR(EIO);
 
 	/*
+	 * Disallow 32bit project ids when projid32bit feature is not enabled.
+	 */
+	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
+	    !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+		return XFS_ERROR(EINVAL);
+
+	/*
 	 * If disk quotas is on, we make sure that the dquots do exist on disk,
 	 * before we start any other transactions. Trying to do this later
 	 * is messy. We don't care to take a readlock to look at the ids
@@ -961,7 +961,7 @@ xfs_ioctl_setattr(
 	if (mask & FSX_PROJID) {
 		if (XFS_IS_QUOTA_RUNNING(mp) &&
 		    XFS_IS_PQUOTA_ON(mp) &&
-		    ip->i_d.di_projid != fa->fsx_projid) {
+		    xfs_get_projid(ip) != fa->fsx_projid) {
 			ASSERT(tp);
 			code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 						capable(CAP_FOWNER) ?
@@ -1043,8 +1043,7 @@ xfs_ioctl_setattr(
 		}
 	}
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	/*
 	 * Change file ownership.  Must be the owner or privileged.
@@ -1064,12 +1063,12 @@ xfs_ioctl_setattr(
 	 * Change the ownerships and register quota modifications
 	 * in the transaction.
 	 */
-	if (ip->i_d.di_projid != fa->fsx_projid) {
+	if (xfs_get_projid(ip) != fa->fsx_projid) {
 		if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
 			olddquot = xfs_qm_vop_chown(tp, ip,
 						&ip->i_gdquot, gdqp);
 		}
-		ip->i_d.di_projid = fa->fsx_projid;
+		xfs_set_projid(ip, fa->fsx_projid);
 
 		/*
 		 * We may have to rev the inode as well as
@@ -1089,8 +1088,8 @@ xfs_ioctl_setattr(
 		xfs_diflags_to_linux(ip);
 	}
 
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
 
 	XFS_STATS_INC(xs_ig_attrchg);
 
@@ -1116,16 +1115,7 @@ xfs_ioctl_setattr(
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
 
-	if (code)
-		return code;
-
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
-				NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
-				(mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
-	}
-
-	return 0;
+	return code;
 
  error_return:
 	xfs_qm_dqrele(udqp);
@@ -1301,7 +1291,7 @@ xfs_file_ioctl(
 	if (filp->f_mode & FMODE_NOCMTIME)
 		ioflags |= IO_INVIS;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_file_ioctl(ip);
 
 	switch (cmd) {
 	case XFS_IOC_ALLOCSP:
@@ -1311,7 +1301,8 @@ xfs_file_ioctl(
 	case XFS_IOC_ALLOCSP64:
 	case XFS_IOC_FREESP64:
 	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP64: {
+	case XFS_IOC_UNRESVSP64:
+	case XFS_IOC_ZERO_RANGE: {
 		xfs_flock64_t		bf;
 
 		if (copy_from_user(&bf, arg, sizeof(bf)))
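The new validation in xfs_ioctl_setattr() above relies on (__uint16_t)-1 evaluating to 0xffff: project ids above 65535 only fit on disk when the projid32bit superblock feature is enabled, otherwise the high 16 bits would be silently lost. The check, repeated here with that arithmetic spelled out:

	/* (__uint16_t)-1 == 0xffff == 65535, the largest 16-bit project id */
	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
	    !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
		return XFS_ERROR(EINVAL);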
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 52ed49e6465..b3486dfa552 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -28,12 +28,8 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_vnode.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -168,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
 	    get_user(bstat->bs_extsize,	&bstat32->bs_extsize)	||
 	    get_user(bstat->bs_extents,	&bstat32->bs_extents)	||
 	    get_user(bstat->bs_gen,	&bstat32->bs_gen)	||
-	    get_user(bstat->bs_projid,	&bstat32->bs_projid)	||
+	    get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
+	    get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
 	    get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask)	||
 	    get_user(bstat->bs_dmstate,	&bstat32->bs_dmstate)	||
 	    get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -222,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
 	    put_user(buffer->bs_extents,	&p32->bs_extents)	||
 	    put_user(buffer->bs_gen,		&p32->bs_gen)		||
 	    put_user(buffer->bs_projid,		&p32->bs_projid)	||
+	    put_user(buffer->bs_projid_hi,	&p32->bs_projid_hi)	||
 	    put_user(buffer->bs_dmevmask,	&p32->bs_dmevmask)	||
 	    put_user(buffer->bs_dmstate,	&p32->bs_dmstate)	||
 	    put_user(buffer->bs_aextents,	&p32->bs_aextents))
@@ -544,7 +542,7 @@ xfs_file_compat_ioctl(
 	if (filp->f_mode & FMODE_NOCMTIME)
 		ioflags |= IO_INVIS;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_file_compat_ioctl(ip);
 
 	switch (cmd) {
 	/* No size or alignment issues on any arch */
@@ -578,6 +576,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_FSGEOMETRY_V1:
 	case XFS_IOC_FSGROWFSDATA:
 	case XFS_IOC_FSGROWFSRT:
+	case XFS_IOC_ZERO_RANGE:
 		return xfs_file_ioctl(filp, cmd, p);
 #else
 	case XFS_IOC_ALLOCSP_32:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0..08b605792a9 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
 	__s32		bs_extsize;	/* extent size */
 	__s32		bs_extents;	/* number of extents */
 	__u32		bs_gen;		/* generation count */
-	__u16		bs_projid;	/* project id */
-	unsigned char	bs_pad[14];	/* pad space, unused */
+	__u16		bs_projid_lo;	/* lower part of project id */
+#define	bs_projid	bs_projid_lo	/* (previously just bs_projid) */
+	__u16		bs_projid_hi;	/* high part of project id */
+	unsigned char	bs_pad[12];	/* pad space, unused */
 	__u32		bs_dmevmask;	/* DMIG event mask */
 	__u16		bs_dmstate;	/* DMIG state info */
 	__u16		bs_aextents;	/* attribute number of extents */
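The compat bstat layout above carves the 32-bit project id out of the old pad space: bs_projid keeps its offset as the low 16 bits and bs_projid_hi takes two bytes from the pad (14 -> 12), so the structure size and the placement of every following field are unchanged. A hypothetical helper (not part of the patch; the kernel proper uses xfs_get_projid()/xfs_set_projid()) showing how the halves combine:

	static inline __u32 compat_bstat_projid(const struct compat_xfs_bstat *bs)
	{
		/* assemble the full project id from the split fields */
		return ((__u32)bs->bs_projid_hi << 16) | bs->bs_projid_lo;
	}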
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 44f0b2de153..96107efc0c6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -24,21 +24,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
@@ -88,7 +80,7 @@ xfs_mark_inode_dirty_sync(
 {
 	struct inode	*inode = VFS_I(ip);
 
-	if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
+	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty_sync(inode);
 }
 
@@ -98,46 +90,11 @@ xfs_mark_inode_dirty(
 {
 	struct inode	*inode = VFS_I(ip);
 
-	if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
+	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty(inode);
 }
 
 /*
- * Change the requested timestamp in the given inode.
- * We don't lock across timestamp updates, and we don't log them but
- * we do record the fact that there is dirty information in core.
- */
-void
-xfs_ichgtime(
-	xfs_inode_t	*ip,
-	int		flags)
-{
-	struct inode	*inode = VFS_I(ip);
-	timespec_t	tv;
-	int		sync_it = 0;
-
-	tv = current_fs_time(inode->i_sb);
-
-	if ((flags & XFS_ICHGTIME_MOD) &&
-	    !timespec_equal(&inode->i_mtime, &tv)) {
-		inode->i_mtime = tv;
-		sync_it = 1;
-	}
-	if ((flags & XFS_ICHGTIME_CHG) &&
-	    !timespec_equal(&inode->i_ctime, &tv)) {
-		inode->i_ctime = tv;
-		sync_it = 1;
-	}
-
-	/*
-	 * Update complete - now make sure everyone knows that the inode
-	 * is dirty.
-	 */
-	if (sync_it)
-		xfs_mark_inode_dirty_sync(ip);
-}
-
-/*
  * Hook in SELinux.  This is not quite correct yet, what we really need
  * here (as we do for default ACLs) is a mechanism by which creation of
  * these attrs can be journalled at inode creation time (along with the
@@ -232,7 +189,7 @@ xfs_vn_mknod(
 	}
 
 	xfs_dentry_to_name(&name, dentry);
-	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
+	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
 	if (unlikely(error))
 		goto out_free_acl;
 
@@ -360,7 +317,7 @@ xfs_vn_link(
 	if (unlikely(error))
 		return -error;
 
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(dentry, inode);
 	return 0;
 }
@@ -405,7 +362,7 @@ xfs_vn_symlink(
 		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
 	xfs_dentry_to_name(&name, dentry);
 
-	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
+	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
 	if (unlikely(error))
 		goto out;
 
@@ -496,7 +453,7 @@ xfs_vn_getattr(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_getattr(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -548,21 +505,6 @@ xfs_vn_setattr(
 	return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
 }
 
-/*
- * block_truncate_page can return an error, but we can't propagate it
- * at all here. Leave a complaint + stack trace in the syslog because
- * this could be bad. If it is bad, we need to propagate the error further.
- */
-STATIC void
-xfs_vn_truncate(
-	struct inode	*inode)
-{
-	int error;
-	error = block_truncate_page(inode->i_mapping, inode->i_size,
-						xfs_get_blocks);
-	WARN_ON(error);
-}
-
 STATIC long
 xfs_vn_fallocate(
 	struct inode	*inode,
@@ -687,7 +629,7 @@ xfs_vn_fiemap(
 			fieinfo->fi_extents_max + 1;
 	bm.bmv_count = min_t(__s32, bm.bmv_count,
 			(PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-	bm.bmv_iflags = BMV_IF_PREALLOC;
+	bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
 		bm.bmv_iflags |= BMV_IF_ATTRFORK;
 	if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
@@ -702,7 +644,6 @@ xfs_vn_fiemap(
 
 static const struct inode_operations xfs_inode_operations = {
 	.check_acl		= xfs_check_acl,
-	.truncate		= xfs_vn_truncate,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
@@ -819,7 +760,9 @@ xfs_setup_inode(
 
 	inode->i_ino = ip->i_ino;
 	inode->i_state = I_NEW;
-	inode_add_to_lists(ip->i_mount->m_super, inode);
+
+	inode_sb_list_add(inode);
+	insert_inode_hash(inode);
 
 	inode->i_mode	= ip->i_d.di_mode;
 	inode->i_nlink	= ip->i_d.di_nlink;
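With xfs_ichgtime() removed from xfs_iops.c, callers that modify an inode inside a transaction now update timestamps through xfs_trans_ichgtime() before logging the inode core, as the xfs_ioctl_setattr() hunk earlier in this diff shows; the timestamp then rides in the journalled inode item instead of being left as unlogged dirty state:

	/* inside an open transaction, with the inode already joined to it */
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);	/* bump ctime */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);	/* journal the inode core */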
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index facfb323a70..214ddd71ff7 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -71,6 +71,7 @@
 #include <linux/random.h>
 #include <linux/ctype.h>
 #include <linux/writeback.h>
+#include <linux/capability.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
@@ -79,15 +80,12 @@
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
-#include <xfs_cred.h>
 #include <xfs_vnode.h>
 #include <xfs_stats.h>
 #include <xfs_sysctl.h>
 #include <xfs_iops.h>
 #include <xfs_aops.h>
 #include <xfs_super.h>
-#include <xfs_globals.h>
-#include <xfs_fs_subr.h>
 #include <xfs_buf.h>
 
 /*
@@ -145,7 +143,7 @@
 #define SYNCHRONIZE()	barrier()
 #define __return_address __builtin_return_address(0)
 
-#define dfltprid	0
+#define XFS_PROJID_DEFAULT	0
 #define MAXPATHLEN	1024
 
 #define MIN(a,b)	(min(a,b))
@@ -157,8 +155,6 @@
  */
 #define xfs_sort(a,n,s,fn)	sort(a,n,s,fn,NULL)
 #define xfs_stack_trace()	dump_stack()
-#define xfs_itruncate_data(ip, off)	\
-	(-vmtruncate(VFS_I(ip), (off)))
 
 
 /* Move the kernel do_div definition off to one side */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 067cafbfc63..29b9d642e93 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -16,7 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_dmapi.h"
 #include "xfs_sb.h"
 #include "xfs_inum.h"
 #include "xfs_log.h"
@@ -69,15 +68,15 @@ xfs_fs_set_xstate(
 	if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
 		return -ENOSYS;
 
-	if (uflags & XFS_QUOTA_UDQ_ACCT)
+	if (uflags & FS_QUOTA_UDQ_ACCT)
 		flags |= XFS_UQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_PDQ_ACCT)
+	if (uflags & FS_QUOTA_PDQ_ACCT)
 		flags |= XFS_PQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_GDQ_ACCT)
+	if (uflags & FS_QUOTA_GDQ_ACCT)
 		flags |= XFS_GQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_UDQ_ENFD)
+	if (uflags & FS_QUOTA_UDQ_ENFD)
 		flags |= XFS_UQUOTA_ENFD;
-	if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
+	if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
 		flags |= XFS_OQUOTA_ENFD;
 
 	switch (op) {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 80938c736c2..9f3a78fe6ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -25,14 +25,11 @@
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -43,12 +40,10 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_fsops.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
 #include "xfs_vnodeops.h"
-#include "xfs_version.h"
 #include "xfs_log_priv.h"
 #include "xfs_trans_priv.h"
 #include "xfs_filestream.h"
@@ -94,7 +89,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_BARRIER	"barrier"	/* use writer barriers for log write and
 					 * unwritten extent conversion */
 #define MNTOPT_NOBARRIER "nobarrier"	/* .. disable */
-#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
 #define MNTOPT_64BITINODE   "inode64"	/* inodes can be allocated anywhere */
 #define MNTOPT_IKEEP	"ikeep"		/* do not free empty inode clusters */
 #define MNTOPT_NOIKEEP	"noikeep"	/* free empty inode clusters */
@@ -116,9 +110,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
 #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
 #define MNTOPT_QUOTANOENF  "qnoenforce"	/* same as uqnoenforce */
-#define MNTOPT_DMAPI	"dmapi"		/* DMI enabled (DMAPI / XDSM) */
-#define MNTOPT_XDSM	"xdsm"		/* DMI enabled (DMAPI / XDSM) */
-#define MNTOPT_DMI	"dmi"		/* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_DELAYLOG   "delaylog"	/* Delayed loging enabled */
 #define MNTOPT_NODELAYLOG "nodelaylog"	/* Delayed loging disabled */
 
@@ -172,15 +163,13 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
 STATIC int
 xfs_parseargs(
 	struct xfs_mount	*mp,
-	char			*options,
-	char			**mtpt)
+	char			*options)
 {
 	struct super_block	*sb = mp->m_super;
 	char			*this_char, *value, *eov;
 	int			dsunit = 0;
 	int			dswidth = 0;
 	int			iosize = 0;
-	int			dmapi_implies_ikeep = 1;
 	__uint8_t		iosizelog = 0;
 
 	/*
@@ -243,15 +232,10 @@ xfs_parseargs(
 			if (!mp->m_logname)
 				return ENOMEM;
 		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
-			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
-					this_char);
-				return EINVAL;
-			}
-			*mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
-			if (!*mtpt)
-				return ENOMEM;
+			cmn_err(CE_WARN,
+				"XFS: %s option not allowed on this system",
+				this_char);
+			return EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
 			if (!value || !*value) {
 				cmn_err(CE_WARN,
@@ -288,8 +272,6 @@ xfs_parseargs(
 			mp->m_flags &= ~XFS_MOUNT_GRPID;
 		} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
 			mp->m_flags |= XFS_MOUNT_WSYNC;
-		} else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
-			mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
 		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
 			mp->m_flags |= XFS_MOUNT_NORECOVERY;
 		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
@@ -329,7 +311,6 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_IKEEP)) {
 			mp->m_flags |= XFS_MOUNT_IKEEP;
 		} else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
-			dmapi_implies_ikeep = 0;
 			mp->m_flags &= ~XFS_MOUNT_IKEEP;
 		} else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
 			mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
@@ -370,12 +351,6 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
 			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
 			mp->m_qflags &= ~XFS_OQUOTA_ENFD;
-		} else if (!strcmp(this_char, MNTOPT_DMAPI)) {
-			mp->m_flags |= XFS_MOUNT_DMAPI;
-		} else if (!strcmp(this_char, MNTOPT_XDSM)) {
-			mp->m_flags |= XFS_MOUNT_DMAPI;
-		} else if (!strcmp(this_char, MNTOPT_DMI)) {
-			mp->m_flags |= XFS_MOUNT_DMAPI;
 		} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
 			mp->m_flags |= XFS_MOUNT_DELAYLOG;
 			cmn_err(CE_WARN,
@@ -387,9 +362,11 @@ xfs_parseargs(
 			cmn_err(CE_WARN,
 	"XFS: ihashsize no longer used, option is deprecated.");
 		} else if (!strcmp(this_char, "osyncisdsync")) {
-			/* no-op, this is now the default */
 			cmn_err(CE_WARN,
-	"XFS: osyncisdsync is now the default, option is deprecated.");
+	"XFS: osyncisdsync has no effect, option is deprecated.");
+		} else if (!strcmp(this_char, "osyncisosync")) {
+			cmn_err(CE_WARN,
+	"XFS: osyncisosync has no effect, option is deprecated.");
 		} else if (!strcmp(this_char, "irixsgid")) {
 			cmn_err(CE_WARN,
 	"XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
@@ -430,12 +407,6 @@ xfs_parseargs(
 		return EINVAL;
 	}
 
-	if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
-		printk("XFS: %s option needs the mount point option as well\n",
-			MNTOPT_DMAPI);
-		return EINVAL;
-	}
-
 	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
 		cmn_err(CE_WARN,
 			"XFS: sunit and swidth must be specified together");
@@ -449,18 +420,6 @@ xfs_parseargs(
 		return EINVAL;
 	}
 
-	/*
-	 * Applications using DMI filesystems often expect the
-	 * inode generation number to be monotonically increasing.
-	 * If we delete inode chunks we break this assumption, so
-	 * keep unused inode chunks on disk for DMI filesystems
-	 * until we come up with a better solution.
-	 * Note that if "ikeep" or "noikeep" mount options are
-	 * supplied, then they are honored.
-	 */
-	if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
-		mp->m_flags |= XFS_MOUNT_IKEEP;
-
 done:
 	if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
 		/*
@@ -539,10 +498,8 @@ xfs_showargs(
 		{ XFS_MOUNT_SWALLOC,		"," MNTOPT_SWALLOC },
 		{ XFS_MOUNT_NOUUID,		"," MNTOPT_NOUUID },
 		{ XFS_MOUNT_NORECOVERY,		"," MNTOPT_NORECOVERY },
-		{ XFS_MOUNT_OSYNCISOSYNC,	"," MNTOPT_OSYNCISOSYNC },
 		{ XFS_MOUNT_ATTR2,		"," MNTOPT_ATTR2 },
 		{ XFS_MOUNT_FILESTREAMS,	"," MNTOPT_FILESTREAM },
-		{ XFS_MOUNT_DMAPI,		"," MNTOPT_DMAPI },
 		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
 		{ XFS_MOUNT_DELAYLOG,		"," MNTOPT_DELAYLOG },
 		{ 0, NULL }
@@ -619,7 +576,7 @@ xfs_max_file_offset(
 
 	/* Figure out maximum filesize, on Linux this can depend on
 	 * the filesystem blocksize (on 32 bit platforms).
-	 * __block_prepare_write does this in an [unsigned] long...
+	 * __block_write_begin does this in an [unsigned] long...
 	 *      page->index << (PAGE_CACHE_SHIFT - bbits)
 	 * So, for page sized blocks (4K on 32 bit platforms),
 	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -687,7 +644,7 @@ xfs_barrier_test(
 	XFS_BUF_ORDERED(sbp);
 
 	xfsbdstrat(mp, sbp);
-	error = xfs_iowait(sbp);
+	error = xfs_buf_iowait(sbp);
 
 	/*
 	 * Clear all the flags we set and possible error state in the
@@ -735,8 +692,7 @@ void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
-	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
-		BLKDEV_IFL_WAIT);
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
 }
 
 STATIC void
@@ -800,18 +756,20 @@ xfs_open_devices(
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
+	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
 
 	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
+		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
+							mp->m_fsname);
 		if (!mp->m_rtdev_targp)
 			goto out_free_ddev_targ;
 	}
 
 	if (logdev && logdev != ddev) {
-		mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
+		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
+							mp->m_fsname);
 		if (!mp->m_logdev_targp)
 			goto out_free_rtdev_targ;
 	} else {
@@ -947,7 +905,7 @@ xfs_fs_destroy_inode(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 
-	xfs_itrace_entry(ip);
+	trace_xfs_destroy_inode(ip);
 
 	XFS_STATS_INC(vn_reclaim);
 
@@ -1014,12 +972,7 @@ xfs_fs_inode_init_once(
 
 /*
  * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
- * we catch unlogged VFS level updates to the inode. Care must be taken
- * here - the transaction code calls mark_inode_dirty_sync() to mark the
- * VFS inode dirty in a transaction and clears the i_update_core field;
- * it must clear the field after calling mark_inode_dirty_sync() to
- * correctly indicate that the dirty state has been propagated into the
- * inode log item.
+ * we catch unlogged VFS level updates to the inode.
 *
 * We need the barrier() to maintain correct ordering between unlogged
 * updates and the transaction commit code that clears the i_update_core
@@ -1063,10 +1016,8 @@ xfs_log_inode(
 	 * an inode in another recent transaction.  So we play it safe and
 	 * fire off the transaction anyway.
 	 */
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
 	error = xfs_trans_commit(tp, 0);
 	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
 
@@ -1082,27 +1033,18 @@ xfs_fs_write_inode(
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error = EAGAIN;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_write_inode(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
 	if (wbc->sync_mode == WB_SYNC_ALL) {
 		/*
-		 * Make sure the inode has hit stable storage. By using the
-		 * log and the fsync transactions we reduce the IOs we have
-		 * to do here from two (log and inode) to just the log.
-		 *
-		 * Note: We still need to do a delwri write of the inode after
-		 * this to flush it to the backing buffer so that bulkstat
-		 * works properly if this is the first time the inode has been
-		 * written. Because we hold the ilock atomically over the
-		 * transaction commit and the inode flush we are guaranteed
-		 * that the inode is not pinned when it returns. If the flush
-		 * lock is already held, then the inode has already been
-		 * flushed once and we don't need to flush it again. Hence
-		 * the code will only flush the inode if it isn't already
-		 * being flushed.
+		 * Make sure the inode has made it it into the log.  Instead
+		 * of forcing it all the way to stable storage using a
+		 * synchronous transaction we let the log force inside the
+		 * ->sync_fs call do that for thus, which reduces the number
+		 * of synchronous log foces dramatically.
 		 */
 		xfs_ioend_wait(ip);
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -1116,27 +1058,29 @@ xfs_fs_write_inode(
 		 * We make this non-blocking if the inode is contended, return
 		 * EAGAIN to indicate to the caller that they did not succeed.
 		 * This prevents the flush path from blocking on inodes inside
-		 * another operation right now, they get caught later by xfs_sync.
+		 * another operation right now, they get caught later by
+		 * xfs_sync.
 		 */
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 			goto out;
-	}
 
-	if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
-		goto out_unlock;
+		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+			goto out_unlock;
 
-	/*
-	 * Now we have the flush lock and the inode is not pinned, we can check
-	 * if the inode is really clean as we know that there are no pending
-	 * transaction completions, it is not waiting on the delayed write
-	 * queue and there is no IO in progress.
-	 */
-	if (xfs_inode_clean(ip)) {
-		xfs_ifunlock(ip);
-		error = 0;
-		goto out_unlock;
+		/*
+		 * Now we have the flush lock and the inode is not pinned, we
+		 * can check if the inode is really clean as we know that
+		 * there are no pending transaction completions, it is not
+		 * waiting on the delayed write queue and there is no IO in
+		 * progress.
+		 */
+		if (xfs_inode_clean(ip)) {
+			xfs_ifunlock(ip);
+			error = 0;
+			goto out_unlock;
+		}
+		error = xfs_iflush(ip, 0);
 	}
-	error = xfs_iflush(ip, 0);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1151,12 +1095,15 @@ xfs_fs_write_inode(
 }
 
 STATIC void
-xfs_fs_clear_inode(
+xfs_fs_evict_inode(
 	struct inode		*inode)
 {
 	xfs_inode_t		*ip = XFS_I(inode);
 
-	xfs_itrace_entry(ip);
+	trace_xfs_evict_inode(ip);
+
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	XFS_STATS_INC(vn_rele);
 	XFS_STATS_INC(vn_remove);
 	XFS_STATS_DEC(vn_active);
@@ -1193,22 +1140,13 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	/*
+	 * Unregister the memory shrinker before we tear down the mount
+	 * structure so we don't have memory reclaim racing with us here.
+	 */
+	xfs_inode_shrinker_unregister(mp);
 	xfs_syncd_stop(mp);
 
-	if (!(sb->s_flags & MS_RDONLY)) {
-		/*
-		 * XXX(hch): this should be SYNC_WAIT.
-		 *
-		 * Or more likely not needed at all because the VFS is already
-		 * calling ->sync_fs after shutting down all filestem
-		 * operations and just before calling ->put_super.
-		 */
-		xfs_sync_data(mp, 0);
-		xfs_sync_attr(mp, 0);
-	}
-
-	XFS_SEND_PREUNMOUNT(mp);
-
 	/*
 	 * Blow away any referenced inode in the filestreams cache.
 	 * This can and will cause log traffic as inodes go inactive
@@ -1218,14 +1156,10 @@ xfs_fs_put_super(
 
 	XFS_bflush(mp->m_ddev_targp);
 
-	XFS_SEND_UNMOUNT(mp);
-
 	xfs_unmountfs(mp);
 	xfs_freesb(mp);
-	xfs_inode_shrinker_unregister(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_close_devices(mp);
-	xfs_dmops_put(mp);
 	xfs_free_fsname(mp);
 	kfree(mp);
 }
@@ -1287,6 +1221,7 @@ xfs_fs_statfs(
 	struct xfs_inode	*ip = XFS_I(dentry->d_inode);
 	__uint64_t		fakeinos, id;
 	xfs_extlen_t		lsize;
+	__int64_t		ffree;
 
 	statp->f_type = XFS_SB_MAGIC;
 	statp->f_namelen = MAXNAMELEN - 1;
@@ -1310,7 +1245,11 @@ xfs_fs_statfs(
 	statp->f_files = min_t(typeof(statp->f_files),
 					statp->f_files,
 					mp->m_maxicount);
-	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+
+	/* make sure statp->f_ffree does not underflow */
+	ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+	statp->f_ffree = max_t(__int64_t, ffree, 0);
+
 	spin_unlock(&mp->m_sb_lock);
 
 	if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
@@ -1463,7 +1402,7 @@ xfs_fs_freeze(
 
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
-	return -xfs_fs_log_dummy(mp);
+	return -xfs_fs_log_dummy(mp, SYNC_WAIT);
 }
 
 STATIC int
@@ -1543,7 +1482,6 @@ xfs_fs_fill_super(
 	struct inode		*root;
 	struct xfs_mount	*mp = NULL;
 	int			flags = 0, error = ENOMEM;
-	char			*mtpt = NULL;
 
 	mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
 	if (!mp)
@@ -1559,7 +1497,7 @@ xfs_fs_fill_super(
 	mp->m_super = sb;
 	sb->s_fs_info = mp;
 
-	error = xfs_parseargs(mp, (char *)data, &mtpt);
+	error = xfs_parseargs(mp, (char *)data);
 	if (error)
 		goto out_free_fsname;
 
@@ -1571,19 +1509,16 @@ xfs_fs_fill_super(
 #endif
 	sb->s_op = &xfs_super_operations;
 
-	error = xfs_dmops_get(mp);
-	if (error)
-		goto out_free_fsname;
-
 	if (silent)
 		flags |= XFS_MFSI_QUIET;
 
 	error = xfs_open_devices(mp);
 	if (error)
-		goto out_put_dmops;
+		goto out_free_fsname;
 
-	if (xfs_icsb_init_counters(mp))
-		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
+	error = xfs_icsb_init_counters(mp);
+	if (error)
+		goto out_close_devices;
 
 	error = xfs_readsb(mp, flags);
 	if (error)
@@ -1608,8 +1543,6 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_filestream_unmount;
 
-	XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
-
 	sb->s_magic = XFS_SB_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
 	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1638,7 +1571,6 @@ xfs_fs_fill_super(
 
 	xfs_inode_shrinker_register(mp);
 
-	kfree(mtpt);
 	return 0;
 
  out_filestream_unmount:
@@ -1647,12 +1579,10 @@ xfs_fs_fill_super(
 	xfs_freesb(mp);
  out_destroy_counters:
 	xfs_icsb_destroy_counters(mp);
+ out_close_devices:
 	xfs_close_devices(mp);
- out_put_dmops:
-	xfs_dmops_put(mp);
  out_free_fsname:
 	xfs_free_fsname(mp);
-	kfree(mtpt);
 	kfree(mp);
  out:
 	return -error;
@@ -1679,16 +1609,14 @@ xfs_fs_fill_super(
 	goto out_free_sb;
 }
 
-STATIC int
-xfs_fs_get_sb(
+STATIC struct dentry *
+xfs_fs_mount(
 	struct file_system_type	*fs_type,
 	int			flags,
 	const char		*dev_name,
-	void			*data,
-	struct vfsmount		*mnt)
+	void			*data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
 }
 
 static const struct super_operations xfs_super_operations = {
@@ -1696,7 +1624,7 @@ static const struct super_operations xfs_super_operations = {
 	.destroy_inode		= xfs_fs_destroy_inode,
 	.dirty_inode		= xfs_fs_dirty_inode,
 	.write_inode		= xfs_fs_write_inode,
-	.clear_inode		= xfs_fs_clear_inode,
+	.evict_inode		= xfs_fs_evict_inode,
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
@@ -1709,7 +1637,7 @@ static const struct super_operations xfs_super_operations = {
 static struct file_system_type xfs_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= "xfs",
-	.get_sb			= xfs_fs_get_sb,
+	.mount			= xfs_fs_mount,
 	.kill_sb		= kill_block_super,
 	.fs_flags		= FS_REQUIRES_DEV,
 };
@@ -1759,6 +1687,12 @@ xfs_init_zones(void)
 	if (!xfs_trans_zone)
 		goto out_destroy_ifork_zone;
 
+	xfs_log_item_desc_zone =
+		kmem_zone_init(sizeof(struct xfs_log_item_desc),
+			       "xfs_log_item_desc");
+	if (!xfs_log_item_desc_zone)
+		goto out_destroy_trans_zone;
+
 	/*
 	 * The size of the zone allocated buf log item is the maximum
 	 * size possible under XFS. This wastes a little bit of memory,
@@ -1768,7 +1702,7 @@ xfs_init_zones(void)
 			(((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
 			  NBWORD) * sizeof(int))), "xfs_buf_item");
 	if (!xfs_buf_item_zone)
-		goto out_destroy_trans_zone;
+		goto out_destroy_log_item_desc_zone;
 
 	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
 			((XFS_EFD_MAX_FAST_EXTENTS - 1) *
@@ -1805,6 +1739,8 @@ xfs_init_zones(void)
 	kmem_zone_destroy(xfs_efd_zone);
  out_destroy_buf_item_zone:
 	kmem_zone_destroy(xfs_buf_item_zone);
+ out_destroy_log_item_desc_zone:
+	kmem_zone_destroy(xfs_log_item_desc_zone);
  out_destroy_trans_zone:
 	kmem_zone_destroy(xfs_trans_zone);
 out_destroy_ifork_zone:
@@ -1835,6 +1771,7 @@ xfs_destroy_zones(void)
1835 kmem_zone_destroy(xfs_efi_zone); 1771 kmem_zone_destroy(xfs_efi_zone);
1836 kmem_zone_destroy(xfs_efd_zone); 1772 kmem_zone_destroy(xfs_efd_zone);
1837 kmem_zone_destroy(xfs_buf_item_zone); 1773 kmem_zone_destroy(xfs_buf_item_zone);
1774 kmem_zone_destroy(xfs_log_item_desc_zone);
1838 kmem_zone_destroy(xfs_trans_zone); 1775 kmem_zone_destroy(xfs_trans_zone);
1839 kmem_zone_destroy(xfs_ifork_zone); 1776 kmem_zone_destroy(xfs_ifork_zone);
1840 kmem_zone_destroy(xfs_dabuf_zone); 1777 kmem_zone_destroy(xfs_dabuf_zone);
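The xfs_init_zones() hunks above thread a new allocation (xfs_log_item_desc_zone) into the existing goto-unwind chain, with the matching destroy call added in reverse order. A minimal standalone sketch of that error-unwind pattern, with hypothetical names standing in for the kmem zones:

#include <stdlib.h>

static void *zone_a, *zone_b, *zone_c;

static int init_zones(void)
{
	zone_a = malloc(32);
	if (!zone_a)
		goto out;
	zone_b = malloc(64);
	if (!zone_b)
		goto out_free_a;
	zone_c = malloc(128);		/* the newly threaded-in allocation */
	if (!zone_c)
		goto out_free_b;	/* unwind labels stay in reverse order */
	return 0;

out_free_b:
	free(zone_b);
out_free_a:
	free(zone_a);
out:
	return -1;
}

int main(void)
{
	return init_zones() ? 1 : 0;
}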
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 519618e9279..50a3266c999 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -56,23 +56,17 @@ extern void xfs_qm_exit(void);
56# define XFS_BIGFS_STRING 56# define XFS_BIGFS_STRING
57#endif 57#endif
58 58
59#ifdef CONFIG_XFS_DMAPI
60# define XFS_DMAPI_STRING "dmapi support, "
61#else
62# define XFS_DMAPI_STRING
63#endif
64
65#ifdef DEBUG 59#ifdef DEBUG
66# define XFS_DBG_STRING "debug" 60# define XFS_DBG_STRING "debug"
67#else 61#else
68# define XFS_DBG_STRING "no debug" 62# define XFS_DBG_STRING "no debug"
69#endif 63#endif
70 64
65#define XFS_VERSION_STRING "SGI XFS"
71#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 66#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
72 XFS_SECURITY_STRING \ 67 XFS_SECURITY_STRING \
73 XFS_REALTIME_STRING \ 68 XFS_REALTIME_STRING \
74 XFS_BIGFS_STRING \ 69 XFS_BIGFS_STRING \
75 XFS_DMAPI_STRING \
76 XFS_DBG_STRING /* DBG must be last */ 70 XFS_DBG_STRING /* DBG must be last */
77 71
78struct xfs_inode; 72struct xfs_inode;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a51a07c3a70..37d33254981 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -24,67 +24,54 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_inode.h" 29#include "xfs_inode.h"
37#include "xfs_dinode.h" 30#include "xfs_dinode.h"
38#include "xfs_error.h" 31#include "xfs_error.h"
39#include "xfs_mru_cache.h"
40#include "xfs_filestream.h" 32#include "xfs_filestream.h"
41#include "xfs_vnodeops.h" 33#include "xfs_vnodeops.h"
42#include "xfs_utils.h"
43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
45#include "xfs_rw.h"
46#include "xfs_quota.h" 35#include "xfs_quota.h"
47#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
48 38
49#include <linux/kthread.h> 39#include <linux/kthread.h>
50#include <linux/freezer.h> 40#include <linux/freezer.h>
51 41
42/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between
45 * lookup reduction and stack usage. This is in the reclaim path, so we can't
46 * be too greedy.
47 */
48#define XFS_LOOKUP_BATCH 32
52 49
53STATIC xfs_inode_t * 50STATIC int
54xfs_inode_ag_lookup( 51xfs_inode_ag_walk_grab(
55 struct xfs_mount *mp, 52 struct xfs_inode *ip)
56 struct xfs_perag *pag,
57 uint32_t *first_index,
58 int tag)
59{ 53{
60 int nr_found; 54 struct inode *inode = VFS_I(ip);
61 struct xfs_inode *ip;
62 55
63 /* 56 /* nothing to sync during shutdown */
64 * use a gang lookup to find the next inode in the tree 57 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
65 * as the tree is sparse and a gang lookup walks to find 58 return EFSCORRUPTED;
66 * the number of objects requested. 59
67 */ 60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
68 if (tag == XFS_ICI_NO_TAG) { 61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
69 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 62 return ENOENT;
70 (void **)&ip, *first_index, 1); 63
 71 	} else { 64 	/* If we can't grab the inode, it must be on its way to reclaim. */
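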
72 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 65 if (!igrab(inode))
73 (void **)&ip, *first_index, 1, tag); 66 return ENOENT;
67
68 if (is_bad_inode(inode)) {
69 IRELE(ip);
70 return ENOENT;
74 } 71 }
75 if (!nr_found)
76 return NULL;
77 72
78 /* 73 /* inode is valid */
79 * Update the index for the next lookup. Catch overflows 74 return 0;
80 * into the next AG range which can occur if we have inodes
81 * in the last block of the AG and we are currently
82 * pointing to the last inode.
83 */
84 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
85 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
86 return NULL;
87 return ip;
88} 75}
89 76
90STATIC int 77STATIC int
@@ -93,49 +80,75 @@ xfs_inode_ag_walk(
93 struct xfs_perag *pag, 80 struct xfs_perag *pag,
94 int (*execute)(struct xfs_inode *ip, 81 int (*execute)(struct xfs_inode *ip,
95 struct xfs_perag *pag, int flags), 82 struct xfs_perag *pag, int flags),
96 int flags, 83 int flags)
97 int tag,
98 int exclusive,
99 int *nr_to_scan)
100{ 84{
101 uint32_t first_index; 85 uint32_t first_index;
102 int last_error = 0; 86 int last_error = 0;
103 int skipped; 87 int skipped;
88 int done;
89 int nr_found;
104 90
105restart: 91restart:
92 done = 0;
106 skipped = 0; 93 skipped = 0;
107 first_index = 0; 94 first_index = 0;
95 nr_found = 0;
108 do { 96 do {
97 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
109 int error = 0; 98 int error = 0;
110 xfs_inode_t *ip; 99 int i;
111 100
112 if (exclusive) 101 read_lock(&pag->pag_ici_lock);
113 write_lock(&pag->pag_ici_lock); 102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
114 else 103 (void **)batch, first_index,
115 read_lock(&pag->pag_ici_lock); 104 XFS_LOOKUP_BATCH);
116 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 105 if (!nr_found) {
117 if (!ip) { 106 read_unlock(&pag->pag_ici_lock);
118 if (exclusive)
119 write_unlock(&pag->pag_ici_lock);
120 else
121 read_unlock(&pag->pag_ici_lock);
122 break; 107 break;
123 } 108 }
124 109
125 /* execute releases pag->pag_ici_lock */ 110 /*
126 		error = execute(ip, pag, flags); 111 		 * Grab the inodes before we drop the lock. If we found
127 if (error == EAGAIN) { 112 * nothing, nr == 0 and the loop will be skipped.
128 skipped++; 113 */
129 continue; 114 for (i = 0; i < nr_found; i++) {
115 struct xfs_inode *ip = batch[i];
116
117 if (done || xfs_inode_ag_walk_grab(ip))
118 batch[i] = NULL;
119
120 /*
121 * Update the index for the next lookup. Catch overflows
122 * into the next AG range which can occur if we have inodes
123 * in the last block of the AG and we are currently
124 * pointing to the last inode.
125 */
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1;
129 }
130
131 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock);
133
134 for (i = 0; i < nr_found; i++) {
135 if (!batch[i])
136 continue;
137 error = execute(batch[i], pag, flags);
138 IRELE(batch[i]);
139 if (error == EAGAIN) {
140 skipped++;
141 continue;
142 }
143 if (error && last_error != EFSCORRUPTED)
144 last_error = error;
130 } 145 }
131 if (error)
132 last_error = error;
133 146
134 /* bail out if the filesystem is corrupted. */ 147 /* bail out if the filesystem is corrupted. */
135 if (error == EFSCORRUPTED) 148 if (error == EFSCORRUPTED)
136 break; 149 break;
137 150
138 } while ((*nr_to_scan)--); 151 } while (nr_found && !done);
139 152
140 if (skipped) { 153 if (skipped) {
141 delay(1); 154 delay(1);
@@ -144,110 +157,32 @@ restart:
144 return last_error; 157 return last_error;
145} 158}
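The rewritten walk above pulls XFS_LOOKUP_BATCH inodes per radix tree gang lookup, grabs references while the lock is held, then processes the batch after dropping it. A rough userspace analogue of that cursor-plus-batch loop (a sorted array stands in for the radix tree; all names invented):

#include <pthread.h>
#include <stdio.h>

#define LOOKUP_BATCH 4	/* stand-in for XFS_LOOKUP_BATCH */

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned int keys[] = { 1, 3, 7, 8, 12, 20, 21, 33, 40 };
static const int nkeys = sizeof(keys) / sizeof(keys[0]);

/* gang-lookup stand-in: copy up to 'want' keys >= cursor into batch */
static int gang_lookup(unsigned int cursor, unsigned int *batch, int want)
{
	int found = 0;

	for (int i = 0; i < nkeys && found < want; i++)
		if (keys[i] >= cursor)
			batch[found++] = keys[i];
	return found;
}

int main(void)
{
	unsigned int cursor = 0;
	int nr_found;

	do {
		unsigned int batch[LOOKUP_BATCH];

		pthread_rwlock_rdlock(&tree_lock);
		nr_found = gang_lookup(cursor, batch, LOOKUP_BATCH);
		if (nr_found)	/* advance the cursor past this batch */
			cursor = batch[nr_found - 1] + 1;
		pthread_rwlock_unlock(&tree_lock);

		/*
		 * Process outside the lock, as the walk above does; the
		 * kernel version also sets 'done' on index wraparound.
		 */
		for (int i = 0; i < nr_found; i++)
			printf("visit %u\n", batch[i]);
	} while (nr_found);
	return 0;
}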
146 159
147/*
148 * Select the next per-ag structure to iterate during the walk. The reclaim
149 * walk is optimised only to walk AGs with reclaimable inodes in them.
150 */
151static struct xfs_perag *
152xfs_inode_ag_iter_next_pag(
153 struct xfs_mount *mp,
154 xfs_agnumber_t *first,
155 int tag)
156{
157 struct xfs_perag *pag = NULL;
158
159 if (tag == XFS_ICI_RECLAIM_TAG) {
160 int found;
161 int ref;
162
163 spin_lock(&mp->m_perag_lock);
164 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
165 (void **)&pag, *first, 1, tag);
166 if (found <= 0) {
167 spin_unlock(&mp->m_perag_lock);
168 return NULL;
169 }
170 *first = pag->pag_agno + 1;
171 /* open coded pag reference increment */
172 ref = atomic_inc_return(&pag->pag_ref);
173 spin_unlock(&mp->m_perag_lock);
174 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
175 } else {
176 pag = xfs_perag_get(mp, *first);
177 (*first)++;
178 }
179 return pag;
180}
181
182int 160int
183xfs_inode_ag_iterator( 161xfs_inode_ag_iterator(
184 struct xfs_mount *mp, 162 struct xfs_mount *mp,
185 int (*execute)(struct xfs_inode *ip, 163 int (*execute)(struct xfs_inode *ip,
186 struct xfs_perag *pag, int flags), 164 struct xfs_perag *pag, int flags),
187 int flags, 165 int flags)
188 int tag,
189 int exclusive,
190 int *nr_to_scan)
191{ 166{
192 struct xfs_perag *pag; 167 struct xfs_perag *pag;
193 int error = 0; 168 int error = 0;
194 int last_error = 0; 169 int last_error = 0;
195 xfs_agnumber_t ag; 170 xfs_agnumber_t ag;
196 int nr;
197 171
198 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
199 ag = 0; 172 ag = 0;
200 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { 173 while ((pag = xfs_perag_get(mp, ag))) {
201 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 174 ag = pag->pag_agno + 1;
202 exclusive, &nr); 175 error = xfs_inode_ag_walk(mp, pag, execute, flags);
203 xfs_perag_put(pag); 176 xfs_perag_put(pag);
204 if (error) { 177 if (error) {
205 last_error = error; 178 last_error = error;
206 if (error == EFSCORRUPTED) 179 if (error == EFSCORRUPTED)
207 break; 180 break;
208 } 181 }
209 if (nr <= 0)
210 break;
211 } 182 }
212 if (nr_to_scan)
213 *nr_to_scan = nr;
214 return XFS_ERROR(last_error); 183 return XFS_ERROR(last_error);
215} 184}
216 185
217/* must be called with pag_ici_lock held and releases it */
218int
219xfs_sync_inode_valid(
220 struct xfs_inode *ip,
221 struct xfs_perag *pag)
222{
223 struct inode *inode = VFS_I(ip);
224 int error = EFSCORRUPTED;
225
226 /* nothing to sync during shutdown */
227 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
228 goto out_unlock;
229
230 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
231 error = ENOENT;
232 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
233 goto out_unlock;
234
235 	/* If we can't grab the inode, it must be on its way to reclaim. */
236 if (!igrab(inode))
237 goto out_unlock;
238
239 if (is_bad_inode(inode)) {
240 IRELE(ip);
241 goto out_unlock;
242 }
243
244 /* inode is valid */
245 error = 0;
246out_unlock:
247 read_unlock(&pag->pag_ici_lock);
248 return error;
249}
250
251STATIC int 186STATIC int
252xfs_sync_inode_data( 187xfs_sync_inode_data(
253 struct xfs_inode *ip, 188 struct xfs_inode *ip,
@@ -258,10 +193,6 @@ xfs_sync_inode_data(
258 struct address_space *mapping = inode->i_mapping; 193 struct address_space *mapping = inode->i_mapping;
259 int error = 0; 194 int error = 0;
260 195
261 error = xfs_sync_inode_valid(ip, pag);
262 if (error)
263 return error;
264
265 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 196 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
266 goto out_wait; 197 goto out_wait;
267 198
@@ -278,7 +209,6 @@ xfs_sync_inode_data(
278 out_wait: 209 out_wait:
279 if (flags & SYNC_WAIT) 210 if (flags & SYNC_WAIT)
280 xfs_ioend_wait(ip); 211 xfs_ioend_wait(ip);
281 IRELE(ip);
282 return error; 212 return error;
283} 213}
284 214
@@ -290,10 +220,6 @@ xfs_sync_inode_attr(
290{ 220{
291 int error = 0; 221 int error = 0;
292 222
293 error = xfs_sync_inode_valid(ip, pag);
294 if (error)
295 return error;
296
297 xfs_ilock(ip, XFS_ILOCK_SHARED); 223 xfs_ilock(ip, XFS_ILOCK_SHARED);
298 if (xfs_inode_clean(ip)) 224 if (xfs_inode_clean(ip))
299 goto out_unlock; 225 goto out_unlock;
@@ -312,14 +238,13 @@ xfs_sync_inode_attr(
312 238
313 out_unlock: 239 out_unlock:
314 xfs_iunlock(ip, XFS_ILOCK_SHARED); 240 xfs_iunlock(ip, XFS_ILOCK_SHARED);
315 IRELE(ip);
316 return error; 241 return error;
317} 242}
318 243
319/* 244/*
320 * Write out pagecache data for the whole filesystem. 245 * Write out pagecache data for the whole filesystem.
321 */ 246 */
322int 247STATIC int
323xfs_sync_data( 248xfs_sync_data(
324 struct xfs_mount *mp, 249 struct xfs_mount *mp,
325 int flags) 250 int flags)
@@ -328,8 +253,7 @@ xfs_sync_data(
328 253
329 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 254 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
330 255
331 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 256 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
332 XFS_ICI_NO_TAG, 0, NULL);
333 if (error) 257 if (error)
334 return XFS_ERROR(error); 258 return XFS_ERROR(error);
335 259
@@ -340,48 +264,14 @@ xfs_sync_data(
340/* 264/*
341 * Write out inode metadata (attributes) for the whole filesystem. 265 * Write out inode metadata (attributes) for the whole filesystem.
342 */ 266 */
343int 267STATIC int
344xfs_sync_attr( 268xfs_sync_attr(
345 struct xfs_mount *mp, 269 struct xfs_mount *mp,
346 int flags) 270 int flags)
347{ 271{
348 ASSERT((flags & ~SYNC_WAIT) == 0); 272 ASSERT((flags & ~SYNC_WAIT) == 0);
349 273
350 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 274 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
351 XFS_ICI_NO_TAG, 0, NULL);
352}
353
354STATIC int
355xfs_commit_dummy_trans(
356 struct xfs_mount *mp,
357 uint flags)
358{
359 struct xfs_inode *ip = mp->m_rootip;
360 struct xfs_trans *tp;
361 int error;
362
363 /*
364 * Put a dummy transaction in the log to tell recovery
365 * that all others are OK.
366 */
367 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
368 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
369 if (error) {
370 xfs_trans_cancel(tp, 0);
371 return error;
372 }
373
374 xfs_ilock(ip, XFS_ILOCK_EXCL);
375
376 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
377 xfs_trans_ihold(tp, ip);
378 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
379 error = xfs_trans_commit(tp, 0);
380 xfs_iunlock(ip, XFS_ILOCK_EXCL);
381
382 /* the log force ensures this transaction is pushed to disk */
383 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
384 return error;
385} 275}
386 276
387STATIC int 277STATIC int
@@ -444,7 +334,7 @@ xfs_quiesce_data(
444 334
445 /* mark the log as covered if needed */ 335 /* mark the log as covered if needed */
446 if (xfs_log_need_covered(mp)) 336 if (xfs_log_need_covered(mp))
447 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); 337 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
448 338
449 /* flush data-only devices */ 339 /* flush data-only devices */
450 if (mp->m_rtdev_targp) 340 if (mp->m_rtdev_targp)
@@ -575,7 +465,7 @@ xfs_flush_inodes(
575/* 465/*
576 * Every sync period we need to unpin all items, reclaim inodes and sync 466 * Every sync period we need to unpin all items, reclaim inodes and sync
577 * disk quotas. We might need to cover the log to indicate that the 467 * disk quotas. We might need to cover the log to indicate that the
578 * filesystem is idle. 468 * filesystem is idle and not frozen.
579 */ 469 */
580STATIC void 470STATIC void
581xfs_sync_worker( 471xfs_sync_worker(
@@ -589,8 +479,9 @@ xfs_sync_worker(
589 xfs_reclaim_inodes(mp, 0); 479 xfs_reclaim_inodes(mp, 0);
590 /* dgc: errors ignored here */ 480 /* dgc: errors ignored here */
591 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 481 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
592 if (xfs_log_need_covered(mp)) 482 if (mp->m_super->s_frozen == SB_UNFROZEN &&
593 error = xfs_commit_dummy_trans(mp, 0); 483 xfs_log_need_covered(mp))
484 error = xfs_fs_log_dummy(mp, 0);
594 } 485 }
595 mp->m_sync_seq++; 486 mp->m_sync_seq++;
596 wake_up(&mp->m_wait_single_sync_task); 487 wake_up(&mp->m_wait_single_sync_task);
@@ -710,14 +601,11 @@ xfs_inode_set_reclaim_tag(
710 xfs_perag_put(pag); 601 xfs_perag_put(pag);
711} 602}
712 603
713void 604STATIC void
714__xfs_inode_clear_reclaim_tag( 605__xfs_inode_clear_reclaim(
715 xfs_mount_t *mp,
716 xfs_perag_t *pag, 606 xfs_perag_t *pag,
717 xfs_inode_t *ip) 607 xfs_inode_t *ip)
718{ 608{
719 radix_tree_tag_clear(&pag->pag_ici_root,
720 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
721 pag->pag_ici_reclaimable--; 609 pag->pag_ici_reclaimable--;
722 if (!pag->pag_ici_reclaimable) { 610 if (!pag->pag_ici_reclaimable) {
723 /* clear the reclaim tag from the perag radix tree */ 611 /* clear the reclaim tag from the perag radix tree */
@@ -731,6 +619,54 @@ __xfs_inode_clear_reclaim_tag(
731 } 619 }
732} 620}
733 621
622void
623__xfs_inode_clear_reclaim_tag(
624 xfs_mount_t *mp,
625 xfs_perag_t *pag,
626 xfs_inode_t *ip)
627{
628 radix_tree_tag_clear(&pag->pag_ici_root,
629 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
630 __xfs_inode_clear_reclaim(pag, ip);
631}
632
633/*
634 * Grab the inode for reclaim exclusively.
635 * Return 0 if we grabbed it, non-zero otherwise.
636 */
637STATIC int
638xfs_reclaim_inode_grab(
639 struct xfs_inode *ip,
640 int flags)
641{
642
643 /*
644 	 * Do some unlocked checks first to avoid unnecessary lock traffic.
645 	 * The first is a flush lock check, the second is an already-in-reclaim
646 * check. Only do these checks if we are not going to block on locks.
647 */
648 if ((flags & SYNC_TRYLOCK) &&
649 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
650 return 1;
651 }
652
653 /*
654 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us.
657 */
658 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */
662 spin_unlock(&ip->i_flags_lock);
663 return 1;
664 }
665 __xfs_iflags_set(ip, XFS_IRECLAIM);
666 spin_unlock(&ip->i_flags_lock);
667 return 0;
668}
669
734/* 670/*
735 * Inodes in different states need to be treated differently, and the return 671 * Inodes in different states need to be treated differently, and the return
736 * value of xfs_iflush is not sufficient to get this right. The following table 672 * value of xfs_iflush is not sufficient to get this right. The following table
@@ -789,23 +725,6 @@ xfs_reclaim_inode(
789{ 725{
790 int error = 0; 726 int error = 0;
791 727
792 /*
793 * The radix tree lock here protects a thread in xfs_iget from racing
794 * with us starting reclaim on the inode. Once we have the
795 * XFS_IRECLAIM flag set it will not touch us.
796 */
797 spin_lock(&ip->i_flags_lock);
798 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
799 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
800 /* ignore as it is already under reclaim */
801 spin_unlock(&ip->i_flags_lock);
802 write_unlock(&pag->pag_ici_lock);
803 return 0;
804 }
805 __xfs_iflags_set(ip, XFS_IRECLAIM);
806 spin_unlock(&ip->i_flags_lock);
807 write_unlock(&pag->pag_ici_lock);
808
809 xfs_ilock(ip, XFS_ILOCK_EXCL); 728 xfs_ilock(ip, XFS_ILOCK_EXCL);
810 if (!xfs_iflock_nowait(ip)) { 729 if (!xfs_iflock_nowait(ip)) {
811 if (!(sync_mode & SYNC_WAIT)) 730 if (!(sync_mode & SYNC_WAIT))
@@ -867,18 +786,161 @@ out:
867reclaim: 786reclaim:
868 xfs_ifunlock(ip); 787 xfs_ifunlock(ip);
869 xfs_iunlock(ip, XFS_ILOCK_EXCL); 788 xfs_iunlock(ip, XFS_ILOCK_EXCL);
870 xfs_ireclaim(ip); 789
790 XFS_STATS_INC(xs_ig_reclaims);
791 /*
792 * Remove the inode from the per-AG radix tree.
793 *
794 * Because radix_tree_delete won't complain even if the item was never
795 * added to the tree assert that it's been there before to catch
796 * problems with the inode life time early on.
797 */
798 write_lock(&pag->pag_ici_lock);
799 if (!radix_tree_delete(&pag->pag_ici_root,
800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
801 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip);
803 write_unlock(&pag->pag_ici_lock);
804
805 /*
806 * Here we do an (almost) spurious inode lock in order to coordinate
807 * with inode cache radix tree lookups. This is because the lookup
808 * can reference the inodes in the cache without taking references.
809 *
810 * We make that OK here by ensuring that we wait until the inode is
811 * unlocked after the lookup before we go ahead and free it. We get
812 * both the ilock and the iolock because the code may need to drop the
 813 	 * ilock but will still hold the iolock.
814 */
815 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
816 xfs_qm_dqdetach(ip);
817 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
818
819 xfs_inode_free(ip);
871 return error; 820 return error;
872 821
873} 822}
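The "(almost) spurious inode lock" above is a drain: lockless lookups hold the lock while inspecting the inode, so cycling it once after the inode is unhashed guarantees no such reader is still inside before the free. A standalone sketch of the idiom (all names invented):

#include <pthread.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t lock;
	int payload;
};

/*
 * Lockless lookups promise to take obj->lock before dereferencing the
 * object; once it is unhashed, cycling the lock waits out any reader
 * that slipped in before removal, making the free safe.
 */
static void drain_and_free(struct obj *o)
{
	pthread_mutex_lock(&o->lock);
	pthread_mutex_unlock(&o->lock);
	pthread_mutex_destroy(&o->lock);
	free(o);
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return 1;
	pthread_mutex_init(&o->lock, NULL);
	drain_and_free(o);
	return 0;
}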
874 823
824/*
825 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
826 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 827 * then a shutdown during the filesystem unmount reclaim walk will leak all the
828 * unreclaimed inodes.
829 */
830int
831xfs_reclaim_inodes_ag(
832 struct xfs_mount *mp,
833 int flags,
834 int *nr_to_scan)
835{
836 struct xfs_perag *pag;
837 int error = 0;
838 int last_error = 0;
839 xfs_agnumber_t ag;
840 int trylock = flags & SYNC_TRYLOCK;
841 int skipped;
842
843restart:
844 ag = 0;
845 skipped = 0;
846 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
847 unsigned long first_index = 0;
848 int done = 0;
849 int nr_found = 0;
850
851 ag = pag->pag_agno + 1;
852
853 if (trylock) {
854 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
855 skipped++;
856 continue;
857 }
858 first_index = pag->pag_ici_reclaim_cursor;
859 } else
860 mutex_lock(&pag->pag_ici_reclaim_lock);
861
862 do {
863 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
864 int i;
865
866 write_lock(&pag->pag_ici_lock);
867 nr_found = radix_tree_gang_lookup_tag(
868 &pag->pag_ici_root,
869 (void **)batch, first_index,
870 XFS_LOOKUP_BATCH,
871 XFS_ICI_RECLAIM_TAG);
872 if (!nr_found) {
873 write_unlock(&pag->pag_ici_lock);
874 break;
875 }
876
877 /*
 878 			 * Grab the inodes before we drop the lock. If we found
879 * nothing, nr == 0 and the loop will be skipped.
880 */
881 for (i = 0; i < nr_found; i++) {
882 struct xfs_inode *ip = batch[i];
883
884 if (done || xfs_reclaim_inode_grab(ip, flags))
885 batch[i] = NULL;
886
887 /*
888 * Update the index for the next lookup. Catch
889 * overflows into the next AG range which can
890 * occur if we have inodes in the last block of
891 * the AG and we are currently pointing to the
892 * last inode.
893 */
894 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
895 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
896 done = 1;
897 }
898
899 /* unlock now we've grabbed the inodes. */
900 write_unlock(&pag->pag_ici_lock);
901
902 for (i = 0; i < nr_found; i++) {
903 if (!batch[i])
904 continue;
905 error = xfs_reclaim_inode(batch[i], pag, flags);
906 if (error && last_error != EFSCORRUPTED)
907 last_error = error;
908 }
909
910 *nr_to_scan -= XFS_LOOKUP_BATCH;
911
912 } while (nr_found && !done && *nr_to_scan > 0);
913
914 if (trylock && !done)
915 pag->pag_ici_reclaim_cursor = first_index;
916 else
917 pag->pag_ici_reclaim_cursor = 0;
918 mutex_unlock(&pag->pag_ici_reclaim_lock);
919 xfs_perag_put(pag);
920 }
921
922 /*
 923 	 * If we skipped any AG, and we still have scan count remaining, do
 924 	 * another pass, this time using blocking reclaim semantics (i.e.
 925 	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
 926 	 * ensures that when we get more reclaimers than AGs we block rather
927 * than spin trying to execute reclaim.
928 */
929 if (trylock && skipped && *nr_to_scan > 0) {
930 trylock = 0;
931 goto restart;
932 }
933 return XFS_ERROR(last_error);
934}
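xfs_reclaim_inodes_ag() above makes a first pass with trylock semantics and per-AG cursors, then restarts with blocking semantics if anything was skipped and scan budget remains. A compressed sketch of that two-pass strategy, with hypothetical group locks in place of the per-AG reclaim mutexes:

#include <pthread.h>

#define NR_GROUPS 4	/* stands in for the per-AG structures */

static pthread_mutex_t grp_lock[NR_GROUPS];

static void scan_group(int g, int blocking)
{
	(void)g; (void)blocking;	/* reclaim work elided */
}

static void reclaim_all(int nr_to_scan)
{
	int trylock = 1;
	int skipped;

restart:
	skipped = 0;
	for (int g = 0; g < NR_GROUPS && nr_to_scan > 0; g++) {
		if (trylock) {
			if (pthread_mutex_trylock(&grp_lock[g])) {
				skipped++;	/* busy: another reclaimer owns it */
				continue;
			}
		} else {
			pthread_mutex_lock(&grp_lock[g]);
		}
		scan_group(g, !trylock);
		nr_to_scan--;
		pthread_mutex_unlock(&grp_lock[g]);
	}

	/* second pass blocks instead of spinning when reclaimers outnumber groups */
	if (trylock && skipped && nr_to_scan > 0) {
		trylock = 0;
		goto restart;
	}
}

int main(void)
{
	for (int g = 0; g < NR_GROUPS; g++)
		pthread_mutex_init(&grp_lock[g], NULL);
	reclaim_all(8);
	return 0;
}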
935
875int 936int
876xfs_reclaim_inodes( 937xfs_reclaim_inodes(
877 xfs_mount_t *mp, 938 xfs_mount_t *mp,
878 int mode) 939 int mode)
879{ 940{
880 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 941 int nr_to_scan = INT_MAX;
881 XFS_ICI_RECLAIM_TAG, 1, NULL); 942
943 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
882} 944}
883 945
884/* 946/*
@@ -900,17 +962,16 @@ xfs_reclaim_inode_shrink(
900 if (!(gfp_mask & __GFP_FS)) 962 if (!(gfp_mask & __GFP_FS))
901 return -1; 963 return -1;
902 964
903 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, 965 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
904 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 966 /* terminate if we don't exhaust the scan */
905 /* if we don't exhaust the scan, don't bother coming back */
906 if (nr_to_scan > 0) 967 if (nr_to_scan > 0)
907 return -1; 968 return -1;
908 } 969 }
909 970
910 reclaimable = 0; 971 reclaimable = 0;
911 ag = 0; 972 ag = 0;
912 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, 973 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
913 XFS_ICI_RECLAIM_TAG))) { 974 ag = pag->pag_agno + 1;
914 reclaimable += pag->pag_ici_reclaimable; 975 reclaimable += pag->pag_ici_reclaimable;
915 xfs_perag_put(pag); 976 xfs_perag_put(pag);
916 } 977 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index e28139aaa4a..32ba6628290 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -35,9 +35,6 @@ typedef struct xfs_sync_work {
35int xfs_syncd_init(struct xfs_mount *mp); 35int xfs_syncd_init(struct xfs_mount *mp);
36void xfs_syncd_stop(struct xfs_mount *mp); 36void xfs_syncd_stop(struct xfs_mount *mp);
37 37
38int xfs_sync_attr(struct xfs_mount *mp, int flags);
39int xfs_sync_data(struct xfs_mount *mp, int flags);
40
41int xfs_quiesce_data(struct xfs_mount *mp); 38int xfs_quiesce_data(struct xfs_mount *mp);
42void xfs_quiesce_attr(struct xfs_mount *mp); 39void xfs_quiesce_attr(struct xfs_mount *mp);
43 40
@@ -50,10 +47,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
50void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
51 struct xfs_inode *ip); 48 struct xfs_inode *ip);
52 49
53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 50int xfs_sync_inode_grab(struct xfs_inode *ip);
54int xfs_inode_ag_iterator(struct xfs_mount *mp, 51int xfs_inode_ag_iterator(struct xfs_mount *mp,
55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
56 int flags, int tag, int write_lock, int *nr_to_scan); 53 int flags);
57 54
58void xfs_inode_shrinker_register(struct xfs_mount *mp); 55void xfs_inode_shrinker_register(struct xfs_mount *mp);
59void xfs_inode_shrinker_unregister(struct xfs_mount *mp); 56void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index d12be8470cb..88d25d4aa56 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -24,17 +24,13 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
32#include "xfs_dir2_sf.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dinode.h" 31#include "xfs_dinode.h"
35#include "xfs_inode.h" 32#include "xfs_inode.h"
36#include "xfs_btree.h" 33#include "xfs_btree.h"
37#include "xfs_dmapi.h"
38#include "xfs_mount.h" 34#include "xfs_mount.h"
39#include "xfs_ialloc.h" 35#include "xfs_ialloc.h"
40#include "xfs_itable.h" 36#include "xfs_itable.h"
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 30282069090..acef2e98c59 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \
124 unsigned long caller_ip), \ 124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip)) 125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get); 126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); 127DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
128DEFINE_PERAG_REF_EVENT(xfs_perag_put); 128DEFINE_PERAG_REF_EVENT(xfs_perag_put);
129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
@@ -317,8 +317,6 @@ DEFINE_BUF_EVENT(xfs_buf_init);
317DEFINE_BUF_EVENT(xfs_buf_free); 317DEFINE_BUF_EVENT(xfs_buf_free);
318DEFINE_BUF_EVENT(xfs_buf_hold); 318DEFINE_BUF_EVENT(xfs_buf_hold);
319DEFINE_BUF_EVENT(xfs_buf_rele); 319DEFINE_BUF_EVENT(xfs_buf_rele);
320DEFINE_BUF_EVENT(xfs_buf_pin);
321DEFINE_BUF_EVENT(xfs_buf_unpin);
322DEFINE_BUF_EVENT(xfs_buf_iodone); 320DEFINE_BUF_EVENT(xfs_buf_iodone);
323DEFINE_BUF_EVENT(xfs_buf_iorequest); 321DEFINE_BUF_EVENT(xfs_buf_iorequest);
324DEFINE_BUF_EVENT(xfs_buf_bawrite); 322DEFINE_BUF_EVENT(xfs_buf_bawrite);
@@ -327,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
327DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
328DEFINE_BUF_EVENT(xfs_buf_cond_lock); 326DEFINE_BUF_EVENT(xfs_buf_cond_lock);
329DEFINE_BUF_EVENT(xfs_buf_unlock); 327DEFINE_BUF_EVENT(xfs_buf_unlock);
330DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
331DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
332DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
333DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
334DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
335DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
336DEFINE_BUF_EVENT(xfs_buf_get_noaddr); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
337DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
338DEFINE_BUF_EVENT(xfs_buf_item_relse); 335DEFINE_BUF_EVENT(xfs_buf_item_relse);
339DEFINE_BUF_EVENT(xfs_buf_item_iodone); 336DEFINE_BUF_EVENT(xfs_buf_item_iodone);
@@ -541,7 +538,7 @@ DEFINE_LOCK_EVENT(xfs_ilock_nowait);
541DEFINE_LOCK_EVENT(xfs_ilock_demote); 538DEFINE_LOCK_EVENT(xfs_ilock_demote);
542DEFINE_LOCK_EVENT(xfs_iunlock); 539DEFINE_LOCK_EVENT(xfs_iunlock);
543 540
544DECLARE_EVENT_CLASS(xfs_iget_class, 541DECLARE_EVENT_CLASS(xfs_inode_class,
545 TP_PROTO(struct xfs_inode *ip), 542 TP_PROTO(struct xfs_inode *ip),
546 TP_ARGS(ip), 543 TP_ARGS(ip),
547 TP_STRUCT__entry( 544 TP_STRUCT__entry(
@@ -557,16 +554,38 @@ DECLARE_EVENT_CLASS(xfs_iget_class,
557 __entry->ino) 554 __entry->ino)
558) 555)
559 556
560#define DEFINE_IGET_EVENT(name) \ 557#define DEFINE_INODE_EVENT(name) \
561DEFINE_EVENT(xfs_iget_class, name, \ 558DEFINE_EVENT(xfs_inode_class, name, \
562 TP_PROTO(struct xfs_inode *ip), \ 559 TP_PROTO(struct xfs_inode *ip), \
563 TP_ARGS(ip)) 560 TP_ARGS(ip))
564DEFINE_IGET_EVENT(xfs_iget_skip); 561DEFINE_INODE_EVENT(xfs_iget_skip);
565DEFINE_IGET_EVENT(xfs_iget_reclaim); 562DEFINE_INODE_EVENT(xfs_iget_reclaim);
566DEFINE_IGET_EVENT(xfs_iget_found); 563DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
567DEFINE_IGET_EVENT(xfs_iget_alloc); 564DEFINE_INODE_EVENT(xfs_iget_hit);
568 565DEFINE_INODE_EVENT(xfs_iget_miss);
569DECLARE_EVENT_CLASS(xfs_inode_class, 566
567DEFINE_INODE_EVENT(xfs_getattr);
568DEFINE_INODE_EVENT(xfs_setattr);
569DEFINE_INODE_EVENT(xfs_readlink);
570DEFINE_INODE_EVENT(xfs_alloc_file_space);
571DEFINE_INODE_EVENT(xfs_free_file_space);
572DEFINE_INODE_EVENT(xfs_readdir);
573#ifdef CONFIG_XFS_POSIX_ACL
574DEFINE_INODE_EVENT(xfs_check_acl);
575#endif
576DEFINE_INODE_EVENT(xfs_vm_bmap);
577DEFINE_INODE_EVENT(xfs_file_ioctl);
578DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
579DEFINE_INODE_EVENT(xfs_ioctl_setattr);
580DEFINE_INODE_EVENT(xfs_file_fsync);
581DEFINE_INODE_EVENT(xfs_destroy_inode);
582DEFINE_INODE_EVENT(xfs_write_inode);
583DEFINE_INODE_EVENT(xfs_evict_inode);
584
585DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
586DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
587
588DECLARE_EVENT_CLASS(xfs_iref_class,
570 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 589 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
571 TP_ARGS(ip, caller_ip), 590 TP_ARGS(ip, caller_ip),
572 TP_STRUCT__entry( 591 TP_STRUCT__entry(
@@ -591,20 +610,71 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
591 (char *)__entry->caller_ip) 610 (char *)__entry->caller_ip)
592) 611)
593 612
594#define DEFINE_INODE_EVENT(name) \ 613#define DEFINE_IREF_EVENT(name) \
595DEFINE_EVENT(xfs_inode_class, name, \ 614DEFINE_EVENT(xfs_iref_class, name, \
596 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ 615 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
597 TP_ARGS(ip, caller_ip)) 616 TP_ARGS(ip, caller_ip))
598DEFINE_INODE_EVENT(xfs_ihold); 617DEFINE_IREF_EVENT(xfs_ihold);
599DEFINE_INODE_EVENT(xfs_irele); 618DEFINE_IREF_EVENT(xfs_irele);
600DEFINE_INODE_EVENT(xfs_inode_pin); 619DEFINE_IREF_EVENT(xfs_inode_pin);
601DEFINE_INODE_EVENT(xfs_inode_unpin); 620DEFINE_IREF_EVENT(xfs_inode_unpin);
602DEFINE_INODE_EVENT(xfs_inode_unpin_nowait); 621DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
622
623DECLARE_EVENT_CLASS(xfs_namespace_class,
624 TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
625 TP_ARGS(dp, name),
626 TP_STRUCT__entry(
627 __field(dev_t, dev)
628 __field(xfs_ino_t, dp_ino)
629 __dynamic_array(char, name, name->len)
630 ),
631 TP_fast_assign(
632 __entry->dev = VFS_I(dp)->i_sb->s_dev;
633 __entry->dp_ino = dp->i_ino;
634 memcpy(__get_str(name), name->name, name->len);
635 ),
636 TP_printk("dev %d:%d dp ino 0x%llx name %s",
637 MAJOR(__entry->dev), MINOR(__entry->dev),
638 __entry->dp_ino,
639 __get_str(name))
640)
603 641
604/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ 642#define DEFINE_NAMESPACE_EVENT(name) \
605DEFINE_INODE_EVENT(xfs_inode); 643DEFINE_EVENT(xfs_namespace_class, name, \
606#define xfs_itrace_entry(ip) \ 644 TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
607 trace_xfs_inode(ip, _THIS_IP_) 645 TP_ARGS(dp, name))
646DEFINE_NAMESPACE_EVENT(xfs_remove);
647DEFINE_NAMESPACE_EVENT(xfs_link);
648DEFINE_NAMESPACE_EVENT(xfs_lookup);
649DEFINE_NAMESPACE_EVENT(xfs_create);
650DEFINE_NAMESPACE_EVENT(xfs_symlink);
651
652TRACE_EVENT(xfs_rename,
653 TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
654 struct xfs_name *src_name, struct xfs_name *target_name),
655 TP_ARGS(src_dp, target_dp, src_name, target_name),
656 TP_STRUCT__entry(
657 __field(dev_t, dev)
658 __field(xfs_ino_t, src_dp_ino)
659 __field(xfs_ino_t, target_dp_ino)
660 __dynamic_array(char, src_name, src_name->len)
661 __dynamic_array(char, target_name, target_name->len)
662 ),
663 TP_fast_assign(
664 __entry->dev = VFS_I(src_dp)->i_sb->s_dev;
665 __entry->src_dp_ino = src_dp->i_ino;
666 __entry->target_dp_ino = target_dp->i_ino;
667 memcpy(__get_str(src_name), src_name->name, src_name->len);
668 memcpy(__get_str(target_name), target_name->name, target_name->len);
669 ),
670 TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
671 " src name %s target name %s",
672 MAJOR(__entry->dev), MINOR(__entry->dev),
673 __entry->src_dp_ino,
674 __entry->target_dp_ino,
675 __get_str(src_name),
676 __get_str(target_name))
677)
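For reference, TRACE_EVENT(xfs_rename) above generates a trace_xfs_rename() function matching the TP_PROTO signature; its call site is not in this hunk. A userspace stand-in mirroring the shape of the emitted record (struct and function here are invented for illustration):

#include <stdio.h>

struct xfs_name {
	const char *name;
	int len;
};

/* mirrors the TP_printk() format string of the event above */
static void trace_xfs_rename_sketch(unsigned long long src_dp_ino,
				    unsigned long long target_dp_ino,
				    const struct xfs_name *src,
				    const struct xfs_name *target)
{
	printf("xfs_rename: src dp ino 0x%llx target dp ino 0x%llx"
	       " src name %.*s target name %.*s\n",
	       src_dp_ino, target_dp_ino,
	       src->len, src->name, target->len, target->name);
}

int main(void)
{
	struct xfs_name a = { "old", 3 }, b = { "new", 3 };

	trace_xfs_rename_sketch(128, 256, &a, &b);
	return 0;
}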
608 678
609DECLARE_EVENT_CLASS(xfs_dquot_class, 679DECLARE_EVENT_CLASS(xfs_dquot_class,
610 TP_PROTO(struct xfs_dquot *dqp), 680 TP_PROTO(struct xfs_dquot *dqp),
@@ -684,9 +754,6 @@ DEFINE_DQUOT_EVENT(xfs_dqrele);
684DEFINE_DQUOT_EVENT(xfs_dqflush); 754DEFINE_DQUOT_EVENT(xfs_dqflush);
685DEFINE_DQUOT_EVENT(xfs_dqflush_force); 755DEFINE_DQUOT_EVENT(xfs_dqflush_force);
686DEFINE_DQUOT_EVENT(xfs_dqflush_done); 756DEFINE_DQUOT_EVENT(xfs_dqflush_done);
687/* not really iget events, but we re-use the format */
688DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
689DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
690 757
691DECLARE_EVENT_CLASS(xfs_loggrant_class, 758DECLARE_EVENT_CLASS(xfs_loggrant_class,
692 TP_PROTO(struct log *log, struct xlog_ticket *tic), 759 TP_PROTO(struct log *log, struct xlog_ticket *tic),
@@ -834,33 +901,29 @@ DECLARE_EVENT_CLASS(xfs_page_class,
834 __field(loff_t, size) 901 __field(loff_t, size)
835 __field(unsigned long, offset) 902 __field(unsigned long, offset)
836 __field(int, delalloc) 903 __field(int, delalloc)
837 __field(int, unmapped)
838 __field(int, unwritten) 904 __field(int, unwritten)
839 ), 905 ),
840 TP_fast_assign( 906 TP_fast_assign(
841 int delalloc = -1, unmapped = -1, unwritten = -1; 907 int delalloc = -1, unwritten = -1;
842 908
843 if (page_has_buffers(page)) 909 if (page_has_buffers(page))
844 xfs_count_page_state(page, &delalloc, 910 xfs_count_page_state(page, &delalloc, &unwritten);
845 &unmapped, &unwritten);
846 __entry->dev = inode->i_sb->s_dev; 911 __entry->dev = inode->i_sb->s_dev;
847 __entry->ino = XFS_I(inode)->i_ino; 912 __entry->ino = XFS_I(inode)->i_ino;
848 __entry->pgoff = page_offset(page); 913 __entry->pgoff = page_offset(page);
849 __entry->size = i_size_read(inode); 914 __entry->size = i_size_read(inode);
850 __entry->offset = off; 915 __entry->offset = off;
851 __entry->delalloc = delalloc; 916 __entry->delalloc = delalloc;
852 __entry->unmapped = unmapped;
853 __entry->unwritten = unwritten; 917 __entry->unwritten = unwritten;
854 ), 918 ),
855 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " 919 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
856 "delalloc %d unmapped %d unwritten %d", 920 "delalloc %d unwritten %d",
857 MAJOR(__entry->dev), MINOR(__entry->dev), 921 MAJOR(__entry->dev), MINOR(__entry->dev),
858 __entry->ino, 922 __entry->ino,
859 __entry->pgoff, 923 __entry->pgoff,
860 __entry->size, 924 __entry->size,
861 __entry->offset, 925 __entry->offset,
862 __entry->delalloc, 926 __entry->delalloc,
863 __entry->unmapped,
864 __entry->unwritten) 927 __entry->unwritten)
865) 928)
866 929
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563..00000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VERSION_H__
19#define __XFS_VERSION_H__
20
21/*
22 * Dummy file that can contain a timestamp to put into the
23 * XFS init string, to help users keep track of what they're
24 * running
25 */
26
27#define XFS_VERSION_STRING "SGI XFS"
28
29#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 585e7633dfc..faf8e1a83a1 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_bmap.h" 31#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 32#include "xfs_rtalloc.h"
42#include "xfs_error.h" 33#include "xfs_error.h"
43#include "xfs_itable.h" 34#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_trans_space.h" 37#include "xfs_trans_space.h"
@@ -64,8 +54,6 @@
64 flush lock - ditto. 54 flush lock - ditto.
65*/ 55*/
66 56
67STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
68
69#ifdef DEBUG 57#ifdef DEBUG
70xfs_buftarg_t *xfs_dqerror_target; 58xfs_buftarg_t *xfs_dqerror_target;
71int xfs_do_dqerror; 59int xfs_do_dqerror;
@@ -390,21 +378,14 @@ xfs_qm_dqalloc(
390 return (ESRCH); 378 return (ESRCH);
391 } 379 }
392 380
393 /* 381 xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
394 * xfs_trans_commit normally decrements the vnode ref count
395 * when it unlocks the inode. Since we want to keep the quota
396 * inode around, we bump the vnode ref count now.
397 */
398 IHOLD(quotip);
399
400 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
401 nmaps = 1; 382 nmaps = 1;
402 if ((error = xfs_bmapi(tp, quotip, 383 if ((error = xfs_bmapi(tp, quotip,
403 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, 384 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
404 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, 385 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
405 &firstblock, 386 &firstblock,
406 XFS_QM_DQALLOC_SPACE_RES(mp), 387 XFS_QM_DQALLOC_SPACE_RES(mp),
407 &map, &nmaps, &flist, NULL))) { 388 &map, &nmaps, &flist))) {
408 goto error0; 389 goto error0;
409 } 390 }
410 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); 391 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -482,87 +463,68 @@ xfs_qm_dqtobp(
482 uint flags) 463 uint flags)
483{ 464{
484 xfs_bmbt_irec_t map; 465 xfs_bmbt_irec_t map;
485 int nmaps, error; 466 int nmaps = 1, error;
486 xfs_buf_t *bp; 467 xfs_buf_t *bp;
487 xfs_inode_t *quotip; 468 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
488 xfs_mount_t *mp; 469 xfs_mount_t *mp = dqp->q_mount;
489 xfs_disk_dquot_t *ddq; 470 xfs_disk_dquot_t *ddq;
490 xfs_dqid_t id; 471 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
491 boolean_t newdquot;
492 xfs_trans_t *tp = (tpp ? *tpp : NULL); 472 xfs_trans_t *tp = (tpp ? *tpp : NULL);
493 473
494 mp = dqp->q_mount; 474 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
495 id = be32_to_cpu(dqp->q_core.d_id);
496 nmaps = 1;
497 newdquot = B_FALSE;
498 475
499 /* 476 xfs_ilock(quotip, XFS_ILOCK_SHARED);
500 * If we don't know where the dquot lives, find out. 477 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
501 */
502 if (dqp->q_blkno == (xfs_daddr_t) 0) {
503 /* We use the id as an index */
504 dqp->q_fileoffset = (xfs_fileoff_t)id /
505 mp->m_quotainfo->qi_dqperchunk;
506 nmaps = 1;
507 quotip = XFS_DQ_TO_QIP(dqp);
508 xfs_ilock(quotip, XFS_ILOCK_SHARED);
509 /* 478 /*
510 * Return if this type of quotas is turned off while we didn't 479 * Return if this type of quotas is turned off while we
511 * have an inode lock 480 * didn't have the quota inode lock.
512 */ 481 */
513 if (XFS_IS_THIS_QUOTA_OFF(dqp)) { 482 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
514 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 483 return ESRCH;
515 return (ESRCH); 484 }
516 } 485
486 /*
487 * Find the block map; no allocations yet
488 */
489 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
490 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
491 NULL, 0, &map, &nmaps, NULL);
492
493 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
494 if (error)
495 return error;
496
497 ASSERT(nmaps == 1);
498 ASSERT(map.br_blockcount == 1);
499
500 /*
501 * Offset of dquot in the (fixed sized) dquot chunk.
502 */
503 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
504 sizeof(xfs_dqblk_t);
505
506 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
507 if (map.br_startblock == HOLESTARTBLOCK) {
517 /* 508 /*
518 * Find the block map; no allocations yet 509 * We don't allocate unless we're asked to
519 */ 510 */
520 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 511 if (!(flags & XFS_QMOPT_DQALLOC))
521 XFS_DQUOT_CLUSTER_SIZE_FSB, 512 return ENOENT;
522 XFS_BMAPI_METADATA,
523 NULL, 0, &map, &nmaps, NULL, NULL);
524 513
525 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 514 ASSERT(tp);
515 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
516 dqp->q_fileoffset, &bp);
526 if (error) 517 if (error)
527 return (error); 518 return error;
528 ASSERT(nmaps == 1); 519 tp = *tpp;
529 ASSERT(map.br_blockcount == 1); 520 } else {
521 trace_xfs_dqtobp_read(dqp);
530 522
531 /* 523 /*
532 * offset of dquot in the (fixed sized) dquot chunk. 524 * store the blkno etc so that we don't have to do the
525 * mapping all the time
533 */ 526 */
534 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * 527 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
535 sizeof(xfs_dqblk_t);
536 if (map.br_startblock == HOLESTARTBLOCK) {
537 /*
538 * We don't allocate unless we're asked to
539 */
540 if (!(flags & XFS_QMOPT_DQALLOC))
541 return (ENOENT);
542
543 ASSERT(tp);
544 if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
545 dqp->q_fileoffset, &bp)))
546 return (error);
547 tp = *tpp;
548 newdquot = B_TRUE;
549 } else {
550 /*
551 * store the blkno etc so that we don't have to do the
552 * mapping all the time
553 */
554 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
555 }
556 }
557 ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
558 ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
559
560 /*
561 * Read in the buffer, unless we've just done the allocation
562 * (in which case we already have the buf).
563 */
564 if (!newdquot) {
565 trace_xfs_dqtobp_read(dqp);
566 528
567 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 529 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
568 dqp->q_blkno, 530 dqp->q_blkno,
@@ -571,13 +533,14 @@ xfs_qm_dqtobp(
571 if (error || !bp) 533 if (error || !bp)
572 return XFS_ERROR(error); 534 return XFS_ERROR(error);
573 } 535 }
536
574 ASSERT(XFS_BUF_ISBUSY(bp)); 537 ASSERT(XFS_BUF_ISBUSY(bp));
575 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 538 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
576 539
577 /* 540 /*
578 * calculate the location of the dquot inside the buffer. 541 * calculate the location of the dquot inside the buffer.
579 */ 542 */
580 ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); 543 ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
581 544
582 /* 545 /*
583 * A simple sanity check in case we got a corrupted dquot... 546 * A simple sanity check in case we got a corrupted dquot...
@@ -1141,6 +1104,46 @@ xfs_qm_dqrele(
1141 xfs_qm_dqput(dqp); 1104 xfs_qm_dqput(dqp);
1142} 1105}
1143 1106
1107/*
1108 * This is the dquot flushing I/O completion routine. It is called
1109 * from interrupt level when the buffer containing the dquot is
1110 * flushed to disk. It is responsible for removing the dquot logitem
1111 * from the AIL if it has not been re-logged, and unlocking the dquot's
1112 * flush lock. This behavior is very similar to that of inodes.
1113 */
1114STATIC void
1115xfs_qm_dqflush_done(
1116 struct xfs_buf *bp,
1117 struct xfs_log_item *lip)
1118{
1119 xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
1120 xfs_dquot_t *dqp = qip->qli_dquot;
1121 struct xfs_ail *ailp = lip->li_ailp;
1122
1123 /*
1124 * We only want to pull the item from the AIL if its
1125 * location in the log has not changed since we started the flush.
1126 * Thus, we only bother if the dquot's lsn has
1127 * not changed. First we check the lsn outside the lock
1128 * since it's cheaper, and then we recheck while
1129 * holding the lock before removing the dquot from the AIL.
1130 */
1131 if ((lip->li_flags & XFS_LI_IN_AIL) &&
1132 lip->li_lsn == qip->qli_flush_lsn) {
1133
1134 /* xfs_trans_ail_delete() drops the AIL lock. */
1135 spin_lock(&ailp->xa_lock);
1136 if (lip->li_lsn == qip->qli_flush_lsn)
1137 xfs_trans_ail_delete(ailp, lip);
1138 else
1139 spin_unlock(&ailp->xa_lock);
1140 }
1141
1142 /*
1143 * Release the dq's flush lock since we're done with it.
1144 */
1145 xfs_dqfunlock(dqp);
1146}
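The relocated xfs_qm_dqflush_done() keeps the cheap-check-then-recheck pattern on the flush LSN: peek outside the AIL lock, then recheck under it before delisting. A standalone sketch of that logic with simplified types (all names invented):

#include <pthread.h>
#include <stdint.h>

struct item {
	uint64_t lsn;		/* current log sequence number */
};

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;

static void ail_delete_locked(struct item *it)
{
	(void)it;		/* unlink from the AIL list, elided */
}

static void flush_done(struct item *it, uint64_t flush_lsn)
{
	if (it->lsn != flush_lsn)	/* cheap peek outside the lock */
		return;

	pthread_mutex_lock(&ail_lock);
	if (it->lsn == flush_lsn)	/* recheck: item may have been re-logged */
		ail_delete_locked(it);
	pthread_mutex_unlock(&ail_lock);
}

int main(void)
{
	struct item it = { .lsn = 7 };

	flush_done(&it, 7);	/* lsn unchanged since flush: delist it */
	return 0;
}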
1144 1147
1145/* 1148/*
1146 * Write a modified dquot to disk. 1149 * Write a modified dquot to disk.
@@ -1155,18 +1158,18 @@ xfs_qm_dqflush(
1155 xfs_dquot_t *dqp, 1158 xfs_dquot_t *dqp,
1156 uint flags) 1159 uint flags)
1157{ 1160{
1158 xfs_mount_t *mp; 1161 struct xfs_mount *mp = dqp->q_mount;
1159 xfs_buf_t *bp; 1162 struct xfs_buf *bp;
1160 xfs_disk_dquot_t *ddqp; 1163 struct xfs_disk_dquot *ddqp;
1161 int error; 1164 int error;
1162 1165
1163 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1166 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1164 ASSERT(!completion_done(&dqp->q_flush)); 1167 ASSERT(!completion_done(&dqp->q_flush));
1168
1165 trace_xfs_dqflush(dqp); 1169 trace_xfs_dqflush(dqp);
1166 1170
1167 /* 1171 /*
1168 * If not dirty, or it's pinned and we are not supposed to 1172 * If not dirty, or it's pinned and we are not supposed to block, nada.
1169 * block, nada.
1170 */ 1173 */
1171 if (!XFS_DQ_IS_DIRTY(dqp) || 1174 if (!XFS_DQ_IS_DIRTY(dqp) ||
1172 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { 1175 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
@@ -1180,40 +1183,46 @@ xfs_qm_dqflush(
1180 * down forcibly. If that's the case we must not write this dquot 1183 * down forcibly. If that's the case we must not write this dquot
1181 * to disk, because the log record didn't make it to disk! 1184 * to disk, because the log record didn't make it to disk!
1182 */ 1185 */
1183 if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { 1186 if (XFS_FORCED_SHUTDOWN(mp)) {
1184 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1187 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1185 xfs_dqfunlock(dqp); 1188 xfs_dqfunlock(dqp);
1186 return XFS_ERROR(EIO); 1189 return XFS_ERROR(EIO);
1187 } 1190 }
1188 1191
1189 /* 1192 /*
1190 * Get the buffer containing the on-disk dquot 1193 * Get the buffer containing the on-disk dquot
1191 * We don't need a transaction envelope because we know that the
1192 * the ondisk-dquot has already been allocated for.
1193 */ 1194 */
1194 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { 1195 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
1196 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1197 if (error) {
1195 ASSERT(error != ENOENT); 1198 ASSERT(error != ENOENT);
1196 /*
1197 * Quotas could have gotten turned off (ESRCH)
1198 */
1199 xfs_dqfunlock(dqp); 1199 xfs_dqfunlock(dqp);
1200 return (error); 1200 return error;
1201 } 1201 }
1202 1202
1203 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 1203 /*
1204 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1204 * Calculate the location of the dquot inside the buffer.
1205 xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE); 1205 */
1206 ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
1207
1208 /*
1209 * A simple sanity check in case we got a corrupted dquot..
1210 */
1211 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
1213 xfs_buf_relse(bp);
1214 xfs_dqfunlock(dqp);
1215 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1206 return XFS_ERROR(EIO); 1216 return XFS_ERROR(EIO);
1207 } 1217 }
1208 1218
1209 /* This is the only portion of data that needs to persist */ 1219 /* This is the only portion of data that needs to persist */
1210 memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); 1220 memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
1211 1221
1212 /* 1222 /*
1213 * Clear the dirty field and remember the flush lsn for later use. 1223 * Clear the dirty field and remember the flush lsn for later use.
1214 */ 1224 */
1215 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1225 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1216 mp = dqp->q_mount;
1217 1226
1218 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, 1227 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1219 &dqp->q_logitem.qli_item.li_lsn); 1228 &dqp->q_logitem.qli_item.li_lsn);
@@ -1222,8 +1231,9 @@ xfs_qm_dqflush(
1222 * Attach an iodone routine so that we can remove this dquot from the 1231 * Attach an iodone routine so that we can remove this dquot from the
1223 * AIL and release the flush lock once the dquot is synced to disk. 1232 * AIL and release the flush lock once the dquot is synced to disk.
1224 */ 1233 */
1225 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *)) 1234 xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
1226 xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item)); 1235 &dqp->q_logitem.qli_item);
1236
1227 /* 1237 /*
1228 * If the buffer is pinned then push on the log so we won't 1238 * If the buffer is pinned then push on the log so we won't
1229 * get stuck waiting in the write for too long. 1239 * get stuck waiting in the write for too long.
@@ -1247,50 +1257,6 @@ xfs_qm_dqflush(
1247 1257
1248} 1258}
1249 1259
1250/*
1251 * This is the dquot flushing I/O completion routine. It is called
1252 * from interrupt level when the buffer containing the dquot is
1253 * flushed to disk. It is responsible for removing the dquot logitem
1254 * from the AIL if it has not been re-logged, and unlocking the dquot's
1255 * flush lock. This behavior is very similar to that of inodes.
1256 */
1257/*ARGSUSED*/
1258STATIC void
1259xfs_qm_dqflush_done(
1260 xfs_buf_t *bp,
1261 xfs_dq_logitem_t *qip)
1262{
1263 xfs_dquot_t *dqp;
1264 struct xfs_ail *ailp;
1265
1266 dqp = qip->qli_dquot;
1267 ailp = qip->qli_item.li_ailp;
1268
1269 /*
1270 * We only want to pull the item from the AIL if its
1271 * location in the log has not changed since we started the flush.
1272 * Thus, we only bother if the dquot's lsn has
1273 * not changed. First we check the lsn outside the lock
1274 * since it's cheaper, and then we recheck while
1275 * holding the lock before removing the dquot from the AIL.
1276 */
1277 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
1278 qip->qli_item.li_lsn == qip->qli_flush_lsn) {
1279
1280 /* xfs_trans_ail_delete() drops the AIL lock. */
1281 spin_lock(&ailp->xa_lock);
1282 if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
1283 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
1284 else
1285 spin_unlock(&ailp->xa_lock);
1286 }
1287
1288 /*
1289 * Release the dq's flush lock since we're done with it.
1290 */
1291 xfs_dqfunlock(dqp);
1292}
1293
1294int 1260int
1295xfs_qm_dqlock_nowait( 1261xfs_qm_dqlock_nowait(
1296 xfs_dquot_t *dqp) 1262 xfs_dquot_t *dqp)
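The dqflush rework above drops the xfs_qm_dqtobp() round trip: the flush path now reads the dquot chunk buffer directly and locates the on-disk copy by the byte offset cached in the incore dquot (q_bufoffset). A minimal standalone sketch of that offset arithmetic follows; all names are illustrative plain-C stand-ins, not the real XFS structures.

	/* Sketch only: generic stand-ins, not the XFS types. */
	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	struct disk_record {
		uint32_t d_id;			/* stand-in for on-disk dquot fields */
	};

	struct incore_record {
		size_t q_bufoffset;		/* byte offset of our record in the chunk */
		struct disk_record q_core;	/* incore copy to flush */
	};

	/* Locate the on-disk record inside a multi-record chunk buffer. */
	static struct disk_record *
	record_in_chunk(char *chunk, size_t chunklen, const struct incore_record *r)
	{
		/* the cached offset must leave room for a whole record */
		assert(r->q_bufoffset + sizeof(struct disk_record) <= chunklen);
		return (struct disk_record *)(chunk + r->q_bufoffset);
	}

	int main(void)
	{
		char chunk[64] = { 0 };
		struct incore_record r = { .q_bufoffset = 16, .q_core = { .d_id = 7 } };

		/* flush: copy the incore record over its slot in the buffer */
		memcpy(record_in_chunk(chunk, sizeof(chunk), &r),
		       &r.q_core, sizeof(r.q_core));
		printf("flushed id %u at offset %zu\n", r.q_core.d_id, r.q_bufoffset);
		return 0;
	}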
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 8d89a24ae32..2a1f3dc10a0 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -23,42 +23,36 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_bmap.h" 31#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_rtalloc.h" 32#include "xfs_rtalloc.h"
42#include "xfs_error.h" 33#include "xfs_error.h"
43#include "xfs_itable.h" 34#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_trans_priv.h" 37#include "xfs_trans_priv.h"
48#include "xfs_qm.h" 38#include "xfs_qm.h"
49 39
40static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
41{
42 return container_of(lip, struct xfs_dq_logitem, qli_item);
43}
44
50/* 45/*
51 * returns the number of iovecs needed to log the given dquot item. 46 * returns the number of iovecs needed to log the given dquot item.
52 */ 47 */
53/* ARGSUSED */
54STATIC uint 48STATIC uint
55xfs_qm_dquot_logitem_size( 49xfs_qm_dquot_logitem_size(
56 xfs_dq_logitem_t *logitem) 50 struct xfs_log_item *lip)
57{ 51{
58 /* 52 /*
59 * we need only two iovecs, one for the format, one for the real thing 53 * we need only two iovecs, one for the format, one for the real thing
60 */ 54 */
61 return (2); 55 return 2;
62} 56}
63 57
64/* 58/*
@@ -66,22 +60,21 @@ xfs_qm_dquot_logitem_size(
66 */ 60 */
67STATIC void 61STATIC void
68xfs_qm_dquot_logitem_format( 62xfs_qm_dquot_logitem_format(
69 xfs_dq_logitem_t *logitem, 63 struct xfs_log_item *lip,
70 xfs_log_iovec_t *logvec) 64 struct xfs_log_iovec *logvec)
71{ 65{
72 ASSERT(logitem); 66 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
73 ASSERT(logitem->qli_dquot);
74 67
75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; 68 logvec->i_addr = &qlip->qli_format;
76 logvec->i_len = sizeof(xfs_dq_logformat_t); 69 logvec->i_len = sizeof(xfs_dq_logformat_t);
77 logvec->i_type = XLOG_REG_TYPE_QFORMAT; 70 logvec->i_type = XLOG_REG_TYPE_QFORMAT;
78 logvec++; 71 logvec++;
79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; 72 logvec->i_addr = &qlip->qli_dquot->q_core;
80 logvec->i_len = sizeof(xfs_disk_dquot_t); 73 logvec->i_len = sizeof(xfs_disk_dquot_t);
81 logvec->i_type = XLOG_REG_TYPE_DQUOT; 74 logvec->i_type = XLOG_REG_TYPE_DQUOT;
82 75
83 ASSERT(2 == logitem->qli_item.li_desc->lid_size); 76 ASSERT(2 == lip->li_desc->lid_size);
84 logitem->qli_format.qlf_size = 2; 77 qlip->qli_format.qlf_size = 2;
85 78
86} 79}
87 80
@@ -90,9 +83,9 @@ xfs_qm_dquot_logitem_format(
90 */ 83 */
91STATIC void 84STATIC void
92xfs_qm_dquot_logitem_pin( 85xfs_qm_dquot_logitem_pin(
93 xfs_dq_logitem_t *logitem) 86 struct xfs_log_item *lip)
94{ 87{
95 xfs_dquot_t *dqp = logitem->qli_dquot; 88 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
96 89
97 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 90 ASSERT(XFS_DQ_IS_LOCKED(dqp));
98 atomic_inc(&dqp->q_pincount); 91 atomic_inc(&dqp->q_pincount);
@@ -104,27 +97,18 @@ xfs_qm_dquot_logitem_pin(
104 * dquot must have been previously pinned with a call to 97 * dquot must have been previously pinned with a call to
105 * xfs_qm_dquot_logitem_pin(). 98 * xfs_qm_dquot_logitem_pin().
106 */ 99 */
107/* ARGSUSED */
108STATIC void 100STATIC void
109xfs_qm_dquot_logitem_unpin( 101xfs_qm_dquot_logitem_unpin(
110 xfs_dq_logitem_t *logitem) 102 struct xfs_log_item *lip,
103 int remove)
111{ 104{
112 xfs_dquot_t *dqp = logitem->qli_dquot; 105 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
113 106
114 ASSERT(atomic_read(&dqp->q_pincount) > 0); 107 ASSERT(atomic_read(&dqp->q_pincount) > 0);
115 if (atomic_dec_and_test(&dqp->q_pincount)) 108 if (atomic_dec_and_test(&dqp->q_pincount))
116 wake_up(&dqp->q_pinwait); 109 wake_up(&dqp->q_pinwait);
117} 110}
118 111
119/* ARGSUSED */
120STATIC void
121xfs_qm_dquot_logitem_unpin_remove(
122 xfs_dq_logitem_t *logitem,
123 xfs_trans_t *tp)
124{
125 xfs_qm_dquot_logitem_unpin(logitem);
126}
127
128/* 112/*
129 * Given the logitem, this writes the corresponding dquot entry to disk 113 * Given the logitem, this writes the corresponding dquot entry to disk
130 * asynchronously. This is called with the dquot entry securely locked; 114 * asynchronously. This is called with the dquot entry securely locked;
@@ -133,12 +117,10 @@ xfs_qm_dquot_logitem_unpin_remove(
133 */ 117 */
134STATIC void 118STATIC void
135xfs_qm_dquot_logitem_push( 119xfs_qm_dquot_logitem_push(
136 xfs_dq_logitem_t *logitem) 120 struct xfs_log_item *lip)
137{ 121{
138 xfs_dquot_t *dqp; 122 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
139 int error; 123 int error;
140
141 dqp = logitem->qli_dquot;
142 124
143 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 125 ASSERT(XFS_DQ_IS_LOCKED(dqp));
144 ASSERT(!completion_done(&dqp->q_flush)); 126 ASSERT(!completion_done(&dqp->q_flush));
@@ -160,27 +142,25 @@ xfs_qm_dquot_logitem_push(
160 xfs_dqunlock(dqp); 142 xfs_dqunlock(dqp);
161} 143}
162 144
163/*ARGSUSED*/
164STATIC xfs_lsn_t 145STATIC xfs_lsn_t
165xfs_qm_dquot_logitem_committed( 146xfs_qm_dquot_logitem_committed(
166 xfs_dq_logitem_t *l, 147 struct xfs_log_item *lip,
167 xfs_lsn_t lsn) 148 xfs_lsn_t lsn)
168{ 149{
169 /* 150 /*
170 * We always re-log the entire dquot when it becomes dirty, 151 * We always re-log the entire dquot when it becomes dirty,
171 * so, the latest copy _is_ the only one that matters. 152 * so, the latest copy _is_ the only one that matters.
172 */ 153 */
173 return (lsn); 154 return lsn;
174} 155}
175 156
176
177/* 157/*
178 * This is called to wait for the given dquot to be unpinned. 158 * This is called to wait for the given dquot to be unpinned.
179 * Most of these pin/unpin routines are plagiarized from inode code. 159 * Most of these pin/unpin routines are plagiarized from inode code.
180 */ 160 */
181void 161void
182xfs_qm_dqunpin_wait( 162xfs_qm_dqunpin_wait(
183 xfs_dquot_t *dqp) 163 struct xfs_dquot *dqp)
184{ 164{
185 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 165 ASSERT(XFS_DQ_IS_LOCKED(dqp));
186 if (atomic_read(&dqp->q_pincount) == 0) 166 if (atomic_read(&dqp->q_pincount) == 0)
@@ -206,13 +186,12 @@ xfs_qm_dqunpin_wait(
206 */ 186 */
207STATIC void 187STATIC void
208xfs_qm_dquot_logitem_pushbuf( 188xfs_qm_dquot_logitem_pushbuf(
209 xfs_dq_logitem_t *qip) 189 struct xfs_log_item *lip)
210{ 190{
211 xfs_dquot_t *dqp; 191 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
212 xfs_mount_t *mp; 192 struct xfs_dquot *dqp = qlip->qli_dquot;
213 xfs_buf_t *bp; 193 struct xfs_buf *bp;
214 194
215 dqp = qip->qli_dquot;
216 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 195 ASSERT(XFS_DQ_IS_LOCKED(dqp));
217 196
218 /* 197 /*
@@ -220,22 +199,20 @@ xfs_qm_dquot_logitem_pushbuf(
220 * inode flush completed and the inode was taken off the AIL. 199 * inode flush completed and the inode was taken off the AIL.
221 * So, just get out. 200 * So, just get out.
222 */ 201 */
223 if (completion_done(&dqp->q_flush) || 202 if (completion_done(&dqp->q_flush) ||
224 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 203 !(lip->li_flags & XFS_LI_IN_AIL)) {
225 xfs_dqunlock(dqp); 204 xfs_dqunlock(dqp);
226 return; 205 return;
227 } 206 }
228 mp = dqp->q_mount; 207
229 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 208 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
230 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); 209 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
231 xfs_dqunlock(dqp); 210 xfs_dqunlock(dqp);
232 if (!bp) 211 if (!bp)
233 return; 212 return;
234 if (XFS_BUF_ISDELAYWRITE(bp)) 213 if (XFS_BUF_ISDELAYWRITE(bp))
235 xfs_buf_delwri_promote(bp); 214 xfs_buf_delwri_promote(bp);
236 xfs_buf_relse(bp); 215 xfs_buf_relse(bp);
237 return;
238
239} 216}
240 217
241/* 218/*
@@ -250,15 +227,14 @@ xfs_qm_dquot_logitem_pushbuf(
250 */ 227 */
251STATIC uint 228STATIC uint
252xfs_qm_dquot_logitem_trylock( 229xfs_qm_dquot_logitem_trylock(
253 xfs_dq_logitem_t *qip) 230 struct xfs_log_item *lip)
254{ 231{
255 xfs_dquot_t *dqp; 232 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
256 233
257 dqp = qip->qli_dquot;
258 if (atomic_read(&dqp->q_pincount) > 0) 234 if (atomic_read(&dqp->q_pincount) > 0)
259 return XFS_ITEM_PINNED; 235 return XFS_ITEM_PINNED;
260 236
261 if (! xfs_qm_dqlock_nowait(dqp)) 237 if (!xfs_qm_dqlock_nowait(dqp))
262 return XFS_ITEM_LOCKED; 238 return XFS_ITEM_LOCKED;
263 239
264 if (!xfs_dqflock_nowait(dqp)) { 240 if (!xfs_dqflock_nowait(dqp)) {
@@ -269,11 +245,10 @@ xfs_qm_dquot_logitem_trylock(
269 return XFS_ITEM_PUSHBUF; 245 return XFS_ITEM_PUSHBUF;
270 } 246 }
271 247
272 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); 248 ASSERT(lip->li_flags & XFS_LI_IN_AIL);
273 return XFS_ITEM_SUCCESS; 249 return XFS_ITEM_SUCCESS;
274} 250}
275 251
276
277/* 252/*
278 * Unlock the dquot associated with the log item. 253 * Unlock the dquot associated with the log item.
279 * Clear the fields of the dquot and dquot log item that 254 * Clear the fields of the dquot and dquot log item that
@@ -282,12 +257,10 @@ xfs_qm_dquot_logitem_trylock(
282 */ 257 */
283STATIC void 258STATIC void
284xfs_qm_dquot_logitem_unlock( 259xfs_qm_dquot_logitem_unlock(
285 xfs_dq_logitem_t *ql) 260 struct xfs_log_item *lip)
286{ 261{
287 xfs_dquot_t *dqp; 262 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
288 263
289 ASSERT(ql != NULL);
290 dqp = ql->qli_dquot;
291 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 264 ASSERT(XFS_DQ_IS_LOCKED(dqp));
292 265
293 /* 266 /*
@@ -304,43 +277,32 @@ xfs_qm_dquot_logitem_unlock(
304 xfs_dqunlock(dqp); 277 xfs_dqunlock(dqp);
305} 278}
306 279
307
308/* 280/*
309 * this needs to stamp an lsn into the dquot, I think. 281 * this needs to stamp an lsn into the dquot, I think.
310 * rpc's that look at user dquot's would then have to 282 * rpc's that look at user dquot's would then have to
311 * push on the dependency recorded in the dquot 283 * push on the dependency recorded in the dquot
312 */ 284 */
313/* ARGSUSED */
314STATIC void 285STATIC void
315xfs_qm_dquot_logitem_committing( 286xfs_qm_dquot_logitem_committing(
316 xfs_dq_logitem_t *l, 287 struct xfs_log_item *lip,
317 xfs_lsn_t lsn) 288 xfs_lsn_t lsn)
318{ 289{
319 return;
320} 290}
321 291
322
323/* 292/*
324 * This is the ops vector for dquots 293 * This is the ops vector for dquots
325 */ 294 */
326static struct xfs_item_ops xfs_dquot_item_ops = { 295static struct xfs_item_ops xfs_dquot_item_ops = {
327 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, 296 .iop_size = xfs_qm_dquot_logitem_size,
328 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 297 .iop_format = xfs_qm_dquot_logitem_format,
329 xfs_qm_dquot_logitem_format, 298 .iop_pin = xfs_qm_dquot_logitem_pin,
330 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, 299 .iop_unpin = xfs_qm_dquot_logitem_unpin,
331 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin, 300 .iop_trylock = xfs_qm_dquot_logitem_trylock,
332 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 301 .iop_unlock = xfs_qm_dquot_logitem_unlock,
333 xfs_qm_dquot_logitem_unpin_remove, 302 .iop_committed = xfs_qm_dquot_logitem_committed,
334 .iop_trylock = (uint(*)(xfs_log_item_t*)) 303 .iop_push = xfs_qm_dquot_logitem_push,
335 xfs_qm_dquot_logitem_trylock, 304 .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf,
336 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock, 305 .iop_committing = xfs_qm_dquot_logitem_committing
337 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
338 xfs_qm_dquot_logitem_committed,
339 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
340 .iop_pushbuf = (void(*)(xfs_log_item_t*))
341 xfs_qm_dquot_logitem_pushbuf,
342 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
343 xfs_qm_dquot_logitem_committing
344}; 306};
345 307
346/* 308/*
@@ -350,10 +312,9 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
350 */ 312 */
351void 313void
352xfs_qm_dquot_logitem_init( 314xfs_qm_dquot_logitem_init(
353 struct xfs_dquot *dqp) 315 struct xfs_dquot *dqp)
354{ 316{
355 xfs_dq_logitem_t *lp; 317 struct xfs_dq_logitem *lp = &dqp->q_logitem;
356 lp = &dqp->q_logitem;
357 318
358 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, 319 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
359 &xfs_dquot_item_ops); 320 &xfs_dquot_item_ops);
@@ -374,16 +335,22 @@ xfs_qm_dquot_logitem_init(
374 335
375/*------------------ QUOTAOFF LOG ITEMS -------------------*/ 336/*------------------ QUOTAOFF LOG ITEMS -------------------*/
376 337
338static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
339{
340 return container_of(lip, struct xfs_qoff_logitem, qql_item);
341}
342
343
377/* 344/*
378 * This returns the number of iovecs needed to log the given quotaoff item. 345 * This returns the number of iovecs needed to log the given quotaoff item.
379 * We only need 1 iovec for an quotaoff item. It just logs the 346 * We only need 1 iovec for an quotaoff item. It just logs the
380 * quotaoff_log_format structure. 347 * quotaoff_log_format structure.
381 */ 348 */
382/*ARGSUSED*/
383STATIC uint 349STATIC uint
384xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf) 350xfs_qm_qoff_logitem_size(
351 struct xfs_log_item *lip)
385{ 352{
386 return (1); 353 return 1;
387} 354}
388 355
389/* 356/*
@@ -394,53 +361,46 @@ xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
394 * slots in the quotaoff item have been filled. 361 * slots in the quotaoff item have been filled.
395 */ 362 */
396STATIC void 363STATIC void
397xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf, 364xfs_qm_qoff_logitem_format(
398 xfs_log_iovec_t *log_vector) 365 struct xfs_log_item *lip,
366 struct xfs_log_iovec *log_vector)
399{ 367{
400 ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF); 368 struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
369
370 ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
401 371
402 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); 372 log_vector->i_addr = &qflip->qql_format;
403 log_vector->i_len = sizeof(xfs_qoff_logitem_t); 373 log_vector->i_len = sizeof(xfs_qoff_logitem_t);
404 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; 374 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
405 qf->qql_format.qf_size = 1; 375 qflip->qql_format.qf_size = 1;
406} 376}
407 377
408
409/* 378/*
410 * Pinning has no meaning for an quotaoff item, so just return. 379 * Pinning has no meaning for an quotaoff item, so just return.
411 */ 380 */
412/*ARGSUSED*/
413STATIC void 381STATIC void
414xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) 382xfs_qm_qoff_logitem_pin(
383 struct xfs_log_item *lip)
415{ 384{
416 return;
417} 385}
418 386
419
420/* 387/*
421 * Since pinning has no meaning for an quotaoff item, unpinning does 388 * Since pinning has no meaning for an quotaoff item, unpinning does
422 * not either. 389 * not either.
423 */ 390 */
424/*ARGSUSED*/
425STATIC void 391STATIC void
426xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf) 392xfs_qm_qoff_logitem_unpin(
393 struct xfs_log_item *lip,
394 int remove)
427{ 395{
428 return;
429}
430
431/*ARGSUSED*/
432STATIC void
433xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
434{
435 return;
436} 396}
437 397
438/* 398/*
439 * Quotaoff items have no locking, so just return success. 399 * Quotaoff items have no locking, so just return success.
440 */ 400 */
441/*ARGSUSED*/
442STATIC uint 401STATIC uint
443xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf) 402xfs_qm_qoff_logitem_trylock(
403 struct xfs_log_item *lip)
444{ 404{
445 return XFS_ITEM_LOCKED; 405 return XFS_ITEM_LOCKED;
446} 406}
@@ -449,53 +409,51 @@ xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
449 * Quotaoff items have no locking or pushing, so return failure 409 * Quotaoff items have no locking or pushing, so return failure
450 * so that the caller doesn't bother with us. 410 * so that the caller doesn't bother with us.
451 */ 411 */
452/*ARGSUSED*/
453STATIC void 412STATIC void
454xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf) 413xfs_qm_qoff_logitem_unlock(
414 struct xfs_log_item *lip)
455{ 415{
456 return;
457} 416}
458 417
459/* 418/*
460 * The quotaoff-start-item is logged only once and cannot be moved in the log, 419 * The quotaoff-start-item is logged only once and cannot be moved in the log,
461 * so simply return the lsn at which it's been logged. 420 * so simply return the lsn at which it's been logged.
462 */ 421 */
463/*ARGSUSED*/
464STATIC xfs_lsn_t 422STATIC xfs_lsn_t
465xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn) 423xfs_qm_qoff_logitem_committed(
424 struct xfs_log_item *lip,
425 xfs_lsn_t lsn)
466{ 426{
467 return (lsn); 427 return lsn;
468} 428}
469 429
470/* 430/*
471 * There isn't much you can do to push on an quotaoff item. It is simply 431 * There isn't much you can do to push on an quotaoff item. It is simply
472 * stuck waiting for the log to be flushed to disk. 432 * stuck waiting for the log to be flushed to disk.
473 */ 433 */
474/*ARGSUSED*/
475STATIC void 434STATIC void
476xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf) 435xfs_qm_qoff_logitem_push(
436 struct xfs_log_item *lip)
477{ 437{
478 return;
479} 438}
480 439
481 440
482/*ARGSUSED*/
483STATIC xfs_lsn_t 441STATIC xfs_lsn_t
484xfs_qm_qoffend_logitem_committed( 442xfs_qm_qoffend_logitem_committed(
485 xfs_qoff_logitem_t *qfe, 443 struct xfs_log_item *lip,
486 xfs_lsn_t lsn) 444 xfs_lsn_t lsn)
487{ 445{
488 xfs_qoff_logitem_t *qfs; 446 struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
489 struct xfs_ail *ailp; 447 struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
448 struct xfs_ail *ailp = qfs->qql_item.li_ailp;
490 449
491 qfs = qfe->qql_start_lip;
492 ailp = qfs->qql_item.li_ailp;
493 spin_lock(&ailp->xa_lock);
494 /* 450 /*
495 * Delete the qoff-start logitem from the AIL. 451 * Delete the qoff-start logitem from the AIL.
496 * xfs_trans_ail_delete() drops the AIL lock. 452 * xfs_trans_ail_delete() drops the AIL lock.
497 */ 453 */
454 spin_lock(&ailp->xa_lock);
498 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); 455 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
456
499 kmem_free(qfs); 457 kmem_free(qfs);
500 kmem_free(qfe); 458 kmem_free(qfe);
501 return (xfs_lsn_t)-1; 459 return (xfs_lsn_t)-1;
@@ -515,71 +473,52 @@ xfs_qm_qoffend_logitem_committed(
515 * (truly makes the quotaoff irrevocable). If we do something else, 473 * (truly makes the quotaoff irrevocable). If we do something else,
516 * then maybe we don't need two. 474 * then maybe we don't need two.
517 */ 475 */
518/* ARGSUSED */
519STATIC void
520xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
521{
522 return;
523}
524
525/* ARGSUSED */
526STATIC void 476STATIC void
527xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) 477xfs_qm_qoff_logitem_committing(
478 struct xfs_log_item *lip,
479 xfs_lsn_t commit_lsn)
528{ 480{
529 return;
530} 481}
531 482
532static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { 483static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
533 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, 484 .iop_size = xfs_qm_qoff_logitem_size,
534 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 485 .iop_format = xfs_qm_qoff_logitem_format,
535 xfs_qm_qoff_logitem_format, 486 .iop_pin = xfs_qm_qoff_logitem_pin,
536 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 487 .iop_unpin = xfs_qm_qoff_logitem_unpin,
537 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, 488 .iop_trylock = xfs_qm_qoff_logitem_trylock,
538 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 489 .iop_unlock = xfs_qm_qoff_logitem_unlock,
539 xfs_qm_qoff_logitem_unpin_remove, 490 .iop_committed = xfs_qm_qoffend_logitem_committed,
540 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 491 .iop_push = xfs_qm_qoff_logitem_push,
541 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, 492 .iop_committing = xfs_qm_qoff_logitem_committing
542 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
543 xfs_qm_qoffend_logitem_committed,
544 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
545 .iop_pushbuf = NULL,
546 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
547 xfs_qm_qoffend_logitem_committing
548}; 493};
549 494
550/* 495/*
551 * This is the ops vector shared by all quotaoff-start log items. 496 * This is the ops vector shared by all quotaoff-start log items.
552 */ 497 */
553static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { 498static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
554 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, 499 .iop_size = xfs_qm_qoff_logitem_size,
555 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 500 .iop_format = xfs_qm_qoff_logitem_format,
556 xfs_qm_qoff_logitem_format, 501 .iop_pin = xfs_qm_qoff_logitem_pin,
557 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 502 .iop_unpin = xfs_qm_qoff_logitem_unpin,
558 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, 503 .iop_trylock = xfs_qm_qoff_logitem_trylock,
559 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 504 .iop_unlock = xfs_qm_qoff_logitem_unlock,
560 xfs_qm_qoff_logitem_unpin_remove, 505 .iop_committed = xfs_qm_qoff_logitem_committed,
561 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 506 .iop_push = xfs_qm_qoff_logitem_push,
562 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, 507 .iop_committing = xfs_qm_qoff_logitem_committing
563 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
564 xfs_qm_qoff_logitem_committed,
565 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
566 .iop_pushbuf = NULL,
567 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
568 xfs_qm_qoff_logitem_committing
569}; 508};
570 509
571/* 510/*
572 * Allocate and initialize an quotaoff item of the correct quota type(s). 511 * Allocate and initialize an quotaoff item of the correct quota type(s).
573 */ 512 */
574xfs_qoff_logitem_t * 513struct xfs_qoff_logitem *
575xfs_qm_qoff_logitem_init( 514xfs_qm_qoff_logitem_init(
576 struct xfs_mount *mp, 515 struct xfs_mount *mp,
577 xfs_qoff_logitem_t *start, 516 struct xfs_qoff_logitem *start,
578 uint flags) 517 uint flags)
579{ 518{
580 xfs_qoff_logitem_t *qf; 519 struct xfs_qoff_logitem *qf;
581 520
582 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); 521 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
583 522
584 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 523 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
585 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); 524 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
@@ -587,5 +526,5 @@ xfs_qm_qoff_logitem_init(
587 qf->qql_format.qf_type = XFS_LI_QUOTAOFF; 526 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
588 qf->qql_format.qf_flags = flags; 527 qf->qql_format.qf_flags = flags;
589 qf->qql_start_lip = start; 528 qf->qql_start_lip = start;
590 return (qf); 529 return qf;
591} 530}
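Every conversion in this file follows the same pattern: the ops-vector callbacks now take the embedded struct xfs_log_item and recover their containing item with container_of(), instead of casting mismatched function-pointer types into the vector (DQUOT_ITEM() and QOFF_ITEM() above). A self-contained sketch of that pattern, using generic names rather than the XFS definitions:

	/* Sketch of the container_of pattern; invented names, not XFS code. */
	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct log_item { int li_flags; };

	struct dq_logitem {
		int		qli_id;
		struct log_item	qli_item;	/* embedded generic item */
	};

	/* The callback sees only the generic item... */
	static void item_print(struct log_item *lip)
	{
		/* ...and recovers its container to reach type-specific fields. */
		struct dq_logitem *qlip =
			container_of(lip, struct dq_logitem, qli_item);

		printf("dq logitem %d, flags %d\n", qlip->qli_id, lip->li_flags);
	}

	int main(void)
	{
		struct dq_logitem qli = {
			.qli_id = 7, .qli_item = { .li_flags = 1 } };

		item_print(&qli.qli_item);
		return 0;
	}

The payoff is type safety at the call site: the compiler checks every iop_* assignment against one prototype, where the old casts silently hid signature mismatches.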
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 67c018392d6..f8e854b4fde 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -23,25 +23,18 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 31#include "xfs_dinode.h"
37#include "xfs_inode.h" 32#include "xfs_inode.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h" 33#include "xfs_ialloc.h"
40#include "xfs_itable.h" 34#include "xfs_itable.h"
41#include "xfs_rtalloc.h" 35#include "xfs_rtalloc.h"
42#include "xfs_error.h" 36#include "xfs_error.h"
43#include "xfs_bmap.h" 37#include "xfs_bmap.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 38#include "xfs_attr.h"
46#include "xfs_buf_item.h" 39#include "xfs_buf_item.h"
47#include "xfs_trans_space.h" 40#include "xfs_trans_space.h"
@@ -62,8 +55,6 @@ uint ndquot;
62kmem_zone_t *qm_dqzone; 55kmem_zone_t *qm_dqzone;
63kmem_zone_t *qm_dqtrxzone; 56kmem_zone_t *qm_dqtrxzone;
64 57
65static cred_t xfs_zerocr;
66
67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 58STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 59STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 60
@@ -844,7 +835,7 @@ xfs_qm_dqattach_locked(
844 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, 835 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
845 flags & XFS_QMOPT_DQALLOC, 836 flags & XFS_QMOPT_DQALLOC,
846 ip->i_udquot, &ip->i_gdquot) : 837 ip->i_udquot, &ip->i_gdquot) :
847 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, 838 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
848 flags & XFS_QMOPT_DQALLOC, 839 flags & XFS_QMOPT_DQALLOC,
849 ip->i_udquot, &ip->i_gdquot); 840 ip->i_udquot, &ip->i_gdquot);
850 /* 841 /*
@@ -1206,87 +1197,6 @@ xfs_qm_list_destroy(
1206 mutex_destroy(&(list->qh_lock)); 1197 mutex_destroy(&(list->qh_lock));
1207} 1198}
1208 1199
1209
1210/*
1211 * Stripped down version of dqattach. This doesn't attach, or even look at the
1212 * dquots attached to the inode. The rationale is that there won't be any
1213 * attached at the time this is called from quotacheck.
1214 */
1215STATIC int
1216xfs_qm_dqget_noattach(
1217 xfs_inode_t *ip,
1218 xfs_dquot_t **O_udqpp,
1219 xfs_dquot_t **O_gdqpp)
1220{
1221 int error;
1222 xfs_mount_t *mp;
1223 xfs_dquot_t *udqp, *gdqp;
1224
1225 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1226 mp = ip->i_mount;
1227 udqp = NULL;
1228 gdqp = NULL;
1229
1230 if (XFS_IS_UQUOTA_ON(mp)) {
1231 ASSERT(ip->i_udquot == NULL);
1232 /*
1233 * We want the dquot allocated if it doesn't exist.
1234 */
1235 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
1236 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
1237 &udqp))) {
1238 /*
1239 * Shouldn't be able to turn off quotas here.
1240 */
1241 ASSERT(error != ESRCH);
1242 ASSERT(error != ENOENT);
1243 return error;
1244 }
1245 ASSERT(udqp);
1246 }
1247
1248 if (XFS_IS_OQUOTA_ON(mp)) {
1249 ASSERT(ip->i_gdquot == NULL);
1250 if (udqp)
1251 xfs_dqunlock(udqp);
1252 error = XFS_IS_GQUOTA_ON(mp) ?
1253 xfs_qm_dqget(mp, ip,
1254 ip->i_d.di_gid, XFS_DQ_GROUP,
1255 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1256 &gdqp) :
1257 xfs_qm_dqget(mp, ip,
1258 ip->i_d.di_projid, XFS_DQ_PROJ,
1259 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1260 &gdqp);
1261 if (error) {
1262 if (udqp)
1263 xfs_qm_dqrele(udqp);
1264 ASSERT(error != ESRCH);
1265 ASSERT(error != ENOENT);
1266 return error;
1267 }
1268 ASSERT(gdqp);
1269
1270 /* Reacquire the locks in the right order */
1271 if (udqp) {
1272 if (! xfs_qm_dqlock_nowait(udqp)) {
1273 xfs_dqunlock(gdqp);
1274 xfs_dqlock(udqp);
1275 xfs_dqlock(gdqp);
1276 }
1277 }
1278 }
1279
1280 *O_udqpp = udqp;
1281 *O_gdqpp = gdqp;
1282
1283#ifdef QUOTADEBUG
1284 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
1285 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
1286#endif
1287 return 0;
1288}
1289
1290/* 1200/*
1291 * Create an inode and return with a reference already taken, but unlocked 1201 * Create an inode and return with a reference already taken, but unlocked
1292 * This is how we create quota inodes 1202 * This is how we create quota inodes
@@ -1312,8 +1222,8 @@ xfs_qm_qino_alloc(
1312 return error; 1222 return error;
1313 } 1223 }
1314 1224
1315 if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 1225 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
1316 &xfs_zerocr, 0, 1, ip, &committed))) { 1226 if (error) {
1317 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1227 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1318 XFS_TRANS_ABORT); 1228 XFS_TRANS_ABORT);
1319 return error; 1229 return error;
@@ -1497,7 +1407,7 @@ xfs_qm_dqiterate(
1497 maxlblkcnt - lblkno, 1407 maxlblkcnt - lblkno,
1498 XFS_BMAPI_METADATA, 1408 XFS_BMAPI_METADATA,
1499 NULL, 1409 NULL,
1500 0, map, &nmaps, NULL, NULL); 1410 0, map, &nmaps, NULL);
1501 xfs_iunlock(qip, XFS_ILOCK_SHARED); 1411 xfs_iunlock(qip, XFS_ILOCK_SHARED);
1502 if (error) 1412 if (error)
1503 break; 1413 break;
@@ -1523,7 +1433,7 @@ xfs_qm_dqiterate(
1523 rablkcnt = map[i+1].br_blockcount; 1433 rablkcnt = map[i+1].br_blockcount;
1524 rablkno = map[i+1].br_startblock; 1434 rablkno = map[i+1].br_startblock;
1525 while (rablkcnt--) { 1435 while (rablkcnt--) {
1526 xfs_baread(mp->m_ddev_targp, 1436 xfs_buf_readahead(mp->m_ddev_targp,
1527 XFS_FSB_TO_DADDR(mp, rablkno), 1437 XFS_FSB_TO_DADDR(mp, rablkno),
1528 mp->m_quotainfo->qi_dqchunklen); 1438 mp->m_quotainfo->qi_dqchunklen);
1529 rablkno++; 1439 rablkno++;
@@ -1553,18 +1463,34 @@ xfs_qm_dqiterate(
1553 1463
1554/* 1464/*
1555 * Called by dqusage_adjust in doing a quotacheck. 1465 * Called by dqusage_adjust in doing a quotacheck.
1556 * Given the inode, and a dquot (either USR or GRP, doesn't matter), 1466 *
1557 * this updates its incore copy as well as the buffer copy. This is 1467 * Given the inode, and a dquot id this updates both the incore dqout as well
1558 * so that once the quotacheck is done, we can just log all the buffers, 1468 * as the buffer copy. This is so that once the quotacheck is done, we can
1559 * as opposed to logging numerous updates to individual dquots. 1469 * just log all the buffers, as opposed to logging numerous updates to
1470 * individual dquots.
1560 */ 1471 */
1561STATIC void 1472STATIC int
1562xfs_qm_quotacheck_dqadjust( 1473xfs_qm_quotacheck_dqadjust(
1563 xfs_dquot_t *dqp, 1474 struct xfs_inode *ip,
1475 xfs_dqid_t id,
1476 uint type,
1564 xfs_qcnt_t nblks, 1477 xfs_qcnt_t nblks,
1565 xfs_qcnt_t rtblks) 1478 xfs_qcnt_t rtblks)
1566{ 1479{
1567 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1480 struct xfs_mount *mp = ip->i_mount;
1481 struct xfs_dquot *dqp;
1482 int error;
1483
1484 error = xfs_qm_dqget(mp, ip, id, type,
1485 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
1486 if (error) {
1487 /*
1488 * Shouldn't be able to turn off quotas here.
1489 */
1490 ASSERT(error != ESRCH);
1491 ASSERT(error != ENOENT);
1492 return error;
1493 }
1568 1494
1569 trace_xfs_dqadjust(dqp); 1495 trace_xfs_dqadjust(dqp);
1570 1496
@@ -1589,11 +1515,13 @@ xfs_qm_quotacheck_dqadjust(
1589 * There are no timers for the default values set in the root dquot. 1515 * There are no timers for the default values set in the root dquot.
1590 */ 1516 */
1591 if (dqp->q_core.d_id) { 1517 if (dqp->q_core.d_id) {
1592 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1518 xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
1593 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1519 xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
1594 } 1520 }
1595 1521
1596 dqp->dq_flags |= XFS_DQ_DIRTY; 1522 dqp->dq_flags |= XFS_DQ_DIRTY;
1523 xfs_qm_dqput(dqp);
1524 return 0;
1597} 1525}
1598 1526
1599STATIC int 1527STATIC int
@@ -1636,8 +1564,7 @@ xfs_qm_dqusage_adjust(
1636 int *res) /* result code value */ 1564 int *res) /* result code value */
1637{ 1565{
1638 xfs_inode_t *ip; 1566 xfs_inode_t *ip;
1639 xfs_dquot_t *udqp, *gdqp; 1567 xfs_qcnt_t nblks, rtblks = 0;
1640 xfs_qcnt_t nblks, rtblks;
1641 int error; 1568 int error;
1642 1569
1643 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1570 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1657,49 +1584,24 @@ xfs_qm_dqusage_adjust(
1657 * the case in all other instances. It's OK that we do this because 1584 * the case in all other instances. It's OK that we do this because
1658 * quotacheck is done only at mount time. 1585 * quotacheck is done only at mount time.
1659 */ 1586 */
1660 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { 1587 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
1588 if (error) {
1661 *res = BULKSTAT_RV_NOTHING; 1589 *res = BULKSTAT_RV_NOTHING;
1662 return error; 1590 return error;
1663 } 1591 }
1664 1592
1665 /* 1593 ASSERT(ip->i_delayed_blks == 0);
1666 * Obtain the locked dquots. In case of an error (eg. allocation
1667 * fails for ENOSPC), we return the negative of the error number
1668 * to bulkstat, so that it can get propagated to quotacheck() and
1669 * making us disable quotas for the file system.
1670 */
1671 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1672 xfs_iput(ip, XFS_ILOCK_EXCL);
1673 *res = BULKSTAT_RV_GIVEUP;
1674 return error;
1675 }
1676 1594
1677 rtblks = 0; 1595 if (XFS_IS_REALTIME_INODE(ip)) {
1678 if (! XFS_IS_REALTIME_INODE(ip)) {
1679 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
1680 } else {
1681 /* 1596 /*
1682 * Walk thru the extent list and count the realtime blocks. 1597 * Walk thru the extent list and count the realtime blocks.
1683 */ 1598 */
1684 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { 1599 error = xfs_qm_get_rtblks(ip, &rtblks);
1685 xfs_iput(ip, XFS_ILOCK_EXCL); 1600 if (error)
1686 if (udqp) 1601 goto error0;
1687 xfs_qm_dqput(udqp);
1688 if (gdqp)
1689 xfs_qm_dqput(gdqp);
1690 *res = BULKSTAT_RV_GIVEUP;
1691 return error;
1692 }
1693 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1694 } 1602 }
1695 ASSERT(ip->i_delayed_blks == 0);
1696 1603
1697 /* 1604 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1698 * We can't release the inode while holding its dquot locks.
1699 * The inode can go into inactive and might try to acquire the dquotlocks.
1700 * So, just unlock here and do a vn_rele at the end.
1701 */
1702 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1703 1605
1704 /* 1606 /*
1705 * Add the (disk blocks and inode) resources occupied by this 1607 * Add the (disk blocks and inode) resources occupied by this
@@ -1714,26 +1616,36 @@ xfs_qm_dqusage_adjust(
1714 * and quotaoffs don't race. (Quotachecks happen at mount time only). 1616 * and quotaoffs don't race. (Quotachecks happen at mount time only).
1715 */ 1617 */
1716 if (XFS_IS_UQUOTA_ON(mp)) { 1618 if (XFS_IS_UQUOTA_ON(mp)) {
1717 ASSERT(udqp); 1619 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
1718 xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); 1620 XFS_DQ_USER, nblks, rtblks);
1719 xfs_qm_dqput(udqp); 1621 if (error)
1622 goto error0;
1720 } 1623 }
1721 if (XFS_IS_OQUOTA_ON(mp)) { 1624
1722 ASSERT(gdqp); 1625 if (XFS_IS_GQUOTA_ON(mp)) {
1723 xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); 1626 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
1724 xfs_qm_dqput(gdqp); 1627 XFS_DQ_GROUP, nblks, rtblks);
1628 if (error)
1629 goto error0;
1725 } 1630 }
1726 /*
1727 * Now release the inode. This will send it to 'inactive', and
1728 * possibly even free blocks.
1729 */
1730 IRELE(ip);
1731 1631
1732 /* 1632 if (XFS_IS_PQUOTA_ON(mp)) {
1733 * Goto next inode. 1633 error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
1734 */ 1634 XFS_DQ_PROJ, nblks, rtblks);
1635 if (error)
1636 goto error0;
1637 }
1638
1639 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1640 IRELE(ip);
1735 *res = BULKSTAT_RV_DIDONE; 1641 *res = BULKSTAT_RV_DIDONE;
1736 return 0; 1642 return 0;
1643
1644error0:
1645 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1646 IRELE(ip);
1647 *res = BULKSTAT_RV_GIVEUP;
1648 return error;
1737} 1649}
1738 1650
1739/* 1651/*
@@ -2229,7 +2141,7 @@ xfs_qm_write_sb_changes(
2229 2141
2230 2142
2231/* 2143/*
2232 * Given an inode, a uid and gid (from cred_t) make sure that we have 2144 * Given an inode, a uid, gid and prid make sure that we have
2233 * allocated relevant dquot(s) on disk, and that we won't exceed inode 2145 * allocated relevant dquot(s) on disk, and that we won't exceed inode
2234 * quotas by creating this file. 2146 * quotas by creating this file.
2235 * This also attaches dquot(s) to the given inode after locking it, 2147 * This also attaches dquot(s) to the given inode after locking it,
@@ -2337,7 +2249,7 @@ xfs_qm_vop_dqalloc(
2337 xfs_dqunlock(gq); 2249 xfs_dqunlock(gq);
2338 } 2250 }
2339 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 2251 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
2340 if (ip->i_d.di_projid != prid) { 2252 if (xfs_get_projid(ip) != prid) {
2341 xfs_iunlock(ip, lockflags); 2253 xfs_iunlock(ip, lockflags);
2342 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 2254 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
2343 XFS_DQ_PROJ, 2255 XFS_DQ_PROJ,
@@ -2459,7 +2371,7 @@ xfs_qm_vop_chown_reserve(
2459 } 2371 }
2460 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 2372 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
2461 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 2373 if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
2462 ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) 2374 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
2463 prjflags = XFS_QMOPT_ENOSPC; 2375 prjflags = XFS_QMOPT_ENOSPC;
2464 2376
2465 if (prjflags || 2377 if (prjflags ||
@@ -2563,7 +2475,7 @@ xfs_qm_vop_create_dqattach(
2563 ip->i_gdquot = gdqp; 2475 ip->i_gdquot = gdqp;
2564 ASSERT(XFS_IS_OQUOTA_ON(mp)); 2476 ASSERT(XFS_IS_OQUOTA_ON(mp));
2565 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 2477 ASSERT((XFS_IS_GQUOTA_ON(mp) ?
2566 ip->i_d.di_gid : ip->i_d.di_projid) == 2478 ip->i_d.di_gid : xfs_get_projid(ip)) ==
2567 be32_to_cpu(gdqp->q_core.d_id)); 2479 be32_to_cpu(gdqp->q_core.d_id));
2568 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2480 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
2569 } 2481 }
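The quotacheck change above folds the dquot lookup into the adjustment helper: for each quota type that is enabled, xfs_qm_quotacheck_dqadjust() gets (allocating if necessary) the dquot for the inode's id, bumps the counters, and drops it again, with one error0 unwind path in the caller. A rough self-contained sketch of that get/adjust/put shape, under invented names and with a toy table standing in for the real dquot cache:

	#include <stdio.h>

	struct dquot { unsigned long icount, bcount; };

	/* Toy table standing in for the dquot cache; lookups never block here. */
	static struct dquot table[3][16];	/* [type][id] */

	static int dq_get(int type, unsigned id, struct dquot **dqpp)
	{
		if (type < 0 || type > 2 || id >= 16)
			return -1;		/* stand-in for a real error code */
		*dqpp = &table[type][id];
		return 0;
	}

	static int adjust_one(int type, unsigned id, unsigned long nblks)
	{
		struct dquot *dqp;
		int error = dq_get(type, id, &dqp);

		if (error)
			return error;
		dqp->icount++;			/* one more inode owned by this id */
		dqp->bcount += nblks;		/* plus its block usage */
		return 0;			/* reference dropped immediately */
	}

	int main(void)
	{
		int uquota_on = 1, gquota_on = 1, pquota_on = 0;
		int error = 0;

		/* user, group and project adjustments for one inode */
		if (uquota_on && (error = adjust_one(0, 4, 100)))
			goto error0;
		if (gquota_on && (error = adjust_one(1, 9, 100)))
			goto error0;
		if (pquota_on && (error = adjust_one(2, 2, 100)))
			goto error0;
		printf("uid 4 now charged %lu blocks\n", table[0][4].bcount);
		return 0;
	error0:
		fprintf(stderr, "quotacheck adjust failed: %d\n", error);
		return 1;
	}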
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 97b410c1279..45b5cb1788a 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_itable.h" 31#include "xfs_itable.h"
40#include "xfs_btree.h"
41#include "xfs_bmap.h" 32#include "xfs_bmap.h"
42#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
43#include "xfs_error.h" 34#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_qm.h" 37#include "xfs_qm.h"
@@ -91,7 +81,7 @@ xfs_qm_statvfs(
91 xfs_mount_t *mp = ip->i_mount; 81 xfs_mount_t *mp = ip->i_mount;
92 xfs_dquot_t *dqp; 82 xfs_dquot_t *dqp;
93 83
94 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { 84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
95 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); 85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
96 xfs_qm_dqput(dqp); 86 xfs_qm_dqput(dqp);
97 } 87 }
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 3d1fc79532e..8671a0b3264 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_itable.h" 31#include "xfs_itable.h"
40#include "xfs_bmap.h" 32#include "xfs_bmap.h"
41#include "xfs_btree.h"
42#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
43#include "xfs_error.h" 34#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_qm.h" 37#include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index b4487764e92..bdebc183223 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -26,25 +26,15 @@
26#include "xfs_trans.h" 26#include "xfs_trans.h"
27#include "xfs_sb.h" 27#include "xfs_sb.h"
28#include "xfs_ag.h" 28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_alloc.h" 29#include "xfs_alloc.h"
31#include "xfs_dmapi.h"
32#include "xfs_quota.h" 30#include "xfs_quota.h"
33#include "xfs_mount.h" 31#include "xfs_mount.h"
34#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
35#include "xfs_alloc_btree.h"
36#include "xfs_ialloc_btree.h"
37#include "xfs_dir2_sf.h"
38#include "xfs_attr_sf.h"
39#include "xfs_dinode.h"
40#include "xfs_inode.h" 33#include "xfs_inode.h"
41#include "xfs_ialloc.h"
42#include "xfs_itable.h" 34#include "xfs_itable.h"
43#include "xfs_bmap.h" 35#include "xfs_bmap.h"
44#include "xfs_btree.h"
45#include "xfs_rtalloc.h" 36#include "xfs_rtalloc.h"
46#include "xfs_error.h" 37#include "xfs_error.h"
47#include "xfs_rw.h"
48#include "xfs_attr.h" 38#include "xfs_attr.h"
49#include "xfs_buf_item.h" 39#include "xfs_buf_item.h"
50#include "xfs_utils.h" 40#include "xfs_utils.h"
@@ -248,40 +238,74 @@ out_unlock:
248 return error; 238 return error;
249} 239}
250 240
241STATIC int
242xfs_qm_scall_trunc_qfile(
243 struct xfs_mount *mp,
244 xfs_ino_t ino)
245{
246 struct xfs_inode *ip;
247 struct xfs_trans *tp;
248 int error;
249
250 if (ino == NULLFSINO)
251 return 0;
252
253 error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
254 if (error)
255 return error;
256
257 xfs_ilock(ip, XFS_IOLOCK_EXCL);
258
259 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
260 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
261 XFS_TRANS_PERM_LOG_RES,
262 XFS_ITRUNCATE_LOG_COUNT);
263 if (error) {
264 xfs_trans_cancel(tp, 0);
265 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
266 goto out_put;
267 }
268
269 xfs_ilock(ip, XFS_ILOCK_EXCL);
270 xfs_trans_ijoin(tp, ip);
271
272 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1);
273 if (error) {
274 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
275 XFS_TRANS_ABORT);
276 goto out_unlock;
277 }
278
279 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
281
282out_unlock:
283 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
284out_put:
285 IRELE(ip);
286 return error;
287}
288
251int 289int
252xfs_qm_scall_trunc_qfiles( 290xfs_qm_scall_trunc_qfiles(
253 xfs_mount_t *mp, 291 xfs_mount_t *mp,
254 uint flags) 292 uint flags)
255{ 293{
256 int error = 0, error2 = 0; 294 int error = 0, error2 = 0;
257 xfs_inode_t *qip;
258 295
259 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
260 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
261 return XFS_ERROR(EINVAL); 298 return XFS_ERROR(EINVAL);
262 } 299 }
263 300
264 if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { 301 if (flags & XFS_DQ_USER)
265 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip); 302 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
266 if (!error) { 303 if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
267 error = xfs_truncate_file(mp, qip); 304 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
268 IRELE(qip);
269 }
270 }
271
272 if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
273 mp->m_sb.sb_gquotino != NULLFSINO) {
274 error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip);
275 if (!error2) {
276 error2 = xfs_truncate_file(mp, qip);
277 IRELE(qip);
278 }
279 }
280 305
281 return error ? error : error2; 306 return error ? error : error2;
282} 307}
283 308
284
285/* 309/*
286 * Switch on (a given) quota enforcement for a filesystem. This takes 310 * Switch on (a given) quota enforcement for a filesystem. This takes
287 * effect immediately. 311 * effect immediately.
@@ -786,9 +810,9 @@ xfs_qm_export_dquot(
786 } 810 }
787 811
788#ifdef DEBUG 812#ifdef DEBUG
789 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) || 813 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
790 (XFS_IS_OQUOTA_ENFORCED(mp) && 814 (XFS_IS_OQUOTA_ENFORCED(mp) &&
791 (dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) && 815 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
792 dst->d_id != 0) { 816 dst->d_id != 0) {
793 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && 817 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
794 (dst->d_blk_softlimit > 0)) { 818 (dst->d_blk_softlimit > 0)) {
@@ -809,17 +833,17 @@ xfs_qm_export_qtype_flags(
809 /* 833 /*
810 * Can't be more than one, or none. 834 * Can't be more than one, or none.
811 */ 835 */
812 ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) != 836 ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
813 (XFS_PROJ_QUOTA | XFS_USER_QUOTA)); 837 (FS_PROJ_QUOTA | FS_USER_QUOTA));
814 ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) != 838 ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
815 (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)); 839 (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
816 ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) != 840 ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
817 (XFS_USER_QUOTA | XFS_GROUP_QUOTA)); 841 (FS_USER_QUOTA | FS_GROUP_QUOTA));
818 ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0); 842 ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
819 843
820 return (flags & XFS_DQ_USER) ? 844 return (flags & XFS_DQ_USER) ?
821 XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? 845 FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
822 XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; 846 FS_PROJ_QUOTA : FS_GROUP_QUOTA;
823} 847}
824 848
825STATIC uint 849STATIC uint
@@ -830,16 +854,16 @@ xfs_qm_export_flags(
830 854
831 uflags = 0; 855 uflags = 0;
832 if (flags & XFS_UQUOTA_ACCT) 856 if (flags & XFS_UQUOTA_ACCT)
833 uflags |= XFS_QUOTA_UDQ_ACCT; 857 uflags |= FS_QUOTA_UDQ_ACCT;
834 if (flags & XFS_PQUOTA_ACCT) 858 if (flags & XFS_PQUOTA_ACCT)
835 uflags |= XFS_QUOTA_PDQ_ACCT; 859 uflags |= FS_QUOTA_PDQ_ACCT;
836 if (flags & XFS_GQUOTA_ACCT) 860 if (flags & XFS_GQUOTA_ACCT)
837 uflags |= XFS_QUOTA_GDQ_ACCT; 861 uflags |= FS_QUOTA_GDQ_ACCT;
838 if (flags & XFS_UQUOTA_ENFD) 862 if (flags & XFS_UQUOTA_ENFD)
839 uflags |= XFS_QUOTA_UDQ_ENFD; 863 uflags |= FS_QUOTA_UDQ_ENFD;
840 if (flags & (XFS_OQUOTA_ENFD)) { 864 if (flags & (XFS_OQUOTA_ENFD)) {
841 uflags |= (flags & XFS_GQUOTA_ACCT) ? 865 uflags |= (flags & XFS_GQUOTA_ACCT) ?
842 XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD; 866 FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
843 } 867 }
844 return (uflags); 868 return (uflags);
845} 869}
@@ -851,21 +875,14 @@ xfs_dqrele_inode(
851 struct xfs_perag *pag, 875 struct xfs_perag *pag,
852 int flags) 876 int flags)
853{ 877{
854 int error;
855
856 /* skip quota inodes */ 878 /* skip quota inodes */
857 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 879 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
858 ip == ip->i_mount->m_quotainfo->qi_gquotaip) { 880 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
859 ASSERT(ip->i_udquot == NULL); 881 ASSERT(ip->i_udquot == NULL);
860 ASSERT(ip->i_gdquot == NULL); 882 ASSERT(ip->i_gdquot == NULL);
861 read_unlock(&pag->pag_ici_lock);
862 return 0; 883 return 0;
863 } 884 }
864 885
865 error = xfs_sync_inode_valid(ip, pag);
866 if (error)
867 return error;
868
869 xfs_ilock(ip, XFS_ILOCK_EXCL); 886 xfs_ilock(ip, XFS_ILOCK_EXCL);
870 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 887 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
871 xfs_qm_dqrele(ip->i_udquot); 888 xfs_qm_dqrele(ip->i_udquot);
@@ -875,8 +892,7 @@ xfs_dqrele_inode(
875 xfs_qm_dqrele(ip->i_gdquot); 892 xfs_qm_dqrele(ip->i_gdquot);
876 ip->i_gdquot = NULL; 893 ip->i_gdquot = NULL;
877 } 894 }
878 xfs_iput(ip, XFS_ILOCK_EXCL); 895 xfs_iunlock(ip, XFS_ILOCK_EXCL);
879
880 return 0; 896 return 0;
881} 897}
882 898
@@ -893,8 +909,7 @@ xfs_qm_dqrele_all_inodes(
893 uint flags) 909 uint flags)
894{ 910{
895 ASSERT(mp->m_quotainfo); 911 ASSERT(mp->m_quotainfo);
896 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, 912 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
897 XFS_ICI_NO_TAG, 0, NULL);
898} 913}
899 914
900/*------------------------------------------------------------------------*/ 915/*------------------------------------------------------------------------*/
@@ -1143,13 +1158,14 @@ xfs_qm_internalqcheck_adjust(
1143 * of those now. 1158 * of those now.
1144 */ 1159 */
1145 if (! ipreleased) { 1160 if (! ipreleased) {
1146 xfs_iput(ip, lock_flags); 1161 xfs_iunlock(ip, lock_flags);
1162 IRELE(ip);
1147 ipreleased = B_TRUE; 1163 ipreleased = B_TRUE;
1148 goto again; 1164 goto again;
1149 } 1165 }
1150 xfs_qm_internalqcheck_get_dquots(mp, 1166 xfs_qm_internalqcheck_get_dquots(mp,
1151 (xfs_dqid_t) ip->i_d.di_uid, 1167 (xfs_dqid_t) ip->i_d.di_uid,
1152 (xfs_dqid_t) ip->i_d.di_projid, 1168 (xfs_dqid_t) xfs_get_projid(ip),
1153 (xfs_dqid_t) ip->i_d.di_gid, 1169 (xfs_dqid_t) ip->i_d.di_gid,
1154 &ud, &gd); 1170 &ud, &gd);
1155 if (XFS_IS_UQUOTA_ON(mp)) { 1171 if (XFS_IS_UQUOTA_ON(mp)) {
@@ -1160,7 +1176,8 @@ xfs_qm_internalqcheck_adjust(
1160 ASSERT(gd); 1176 ASSERT(gd);
1161 xfs_qm_internalqcheck_dqadjust(ip, gd); 1177 xfs_qm_internalqcheck_dqadjust(ip, gd);
1162 } 1178 }
1163 xfs_iput(ip, lock_flags); 1179 xfs_iunlock(ip, lock_flags);
1180 IRELE(ip);
1164 *res = BULKSTAT_RV_DIDONE; 1181 *res = BULKSTAT_RV_DIDONE;
1165 return (0); 1182 return (0);
1166} 1183}
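The xfs_qm_export_flags() hunk above is a pure rename from internal XFS_QUOTA_* names to the generic FS_QUOTA_* ones, but the translation itself is a classic bit-by-bit flag remap. A self-contained sketch of the shape of that function; the values below are illustrative, not the kernel's:

	#include <stdint.h>
	#include <stdio.h>

	/* internal accounting/enforcement flags (invented values) */
	#define UQUOTA_ACCT  0x0001
	#define GQUOTA_ACCT  0x0002
	#define UQUOTA_ENFD  0x0004

	/* generic interface equivalents (invented values) */
	#define FS_Q_UDQ_ACCT 0x0100
	#define FS_Q_GDQ_ACCT 0x0200
	#define FS_Q_UDQ_ENFD 0x0400

	/* Translate internal flag bits to the generic namespace, one by one. */
	static uint32_t export_flags(uint32_t flags)
	{
		uint32_t uflags = 0;

		if (flags & UQUOTA_ACCT)
			uflags |= FS_Q_UDQ_ACCT;
		if (flags & GQUOTA_ACCT)
			uflags |= FS_Q_GDQ_ACCT;
		if (flags & UQUOTA_ENFD)
			uflags |= FS_Q_UDQ_ENFD;
		return uflags;
	}

	int main(void)
	{
		printf("0x%x\n", export_flags(UQUOTA_ACCT | UQUOTA_ENFD));
		return 0;
	}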
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 061d827da33..7de91d1b75c 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_itable.h" 31#include "xfs_itable.h"
40#include "xfs_btree.h"
41#include "xfs_bmap.h" 32#include "xfs_bmap.h"
42#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
43#include "xfs_error.h" 34#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_trans_priv.h" 37#include "xfs_trans_priv.h"
@@ -59,16 +49,14 @@ xfs_trans_dqjoin(
59 xfs_trans_t *tp, 49 xfs_trans_t *tp,
60 xfs_dquot_t *dqp) 50 xfs_dquot_t *dqp)
61{ 51{
62 xfs_dq_logitem_t *lp = &dqp->q_logitem;
63
64 ASSERT(dqp->q_transp != tp); 52 ASSERT(dqp->q_transp != tp);
65 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 53 ASSERT(XFS_DQ_IS_LOCKED(dqp));
66 ASSERT(lp->qli_dquot == dqp); 54 ASSERT(dqp->q_logitem.qli_dquot == dqp);
67 55
68 /* 56 /*
69 * Get a log_item_desc to point at the new item. 57 * Get a log_item_desc to point at the new item.
70 */ 58 */
71 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp)); 59 xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
72 60
73 /* 61 /*
74 * Initialize i_transp so we can later determine if this dquot is 62 * Initialize i_transp so we can later determine if this dquot is
@@ -93,16 +81,11 @@ xfs_trans_log_dquot(
93 xfs_trans_t *tp, 81 xfs_trans_t *tp,
94 xfs_dquot_t *dqp) 82 xfs_dquot_t *dqp)
95{ 83{
96 xfs_log_item_desc_t *lidp;
97
98 ASSERT(dqp->q_transp == tp); 84 ASSERT(dqp->q_transp == tp);
99 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 85 ASSERT(XFS_DQ_IS_LOCKED(dqp));
100 86
101 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
102 ASSERT(lidp != NULL);
103
104 tp->t_flags |= XFS_TRANS_DIRTY; 87 tp->t_flags |= XFS_TRANS_DIRTY;
105 lidp->lid_flags |= XFS_LID_DIRTY; 88 dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
106} 89}
107 90
108/* 91/*
@@ -874,9 +857,8 @@ xfs_trans_get_qoff_item(
874 /* 857 /*
875 * Get a log_item_desc to point at the new item. 858 * Get a log_item_desc to point at the new item.
876 */ 859 */
877 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q); 860 xfs_trans_add_item(tp, &q->qql_item);
878 861 return q;
879 return (q);
880} 862}
881 863
882 864
@@ -890,13 +872,8 @@ xfs_trans_log_quotaoff_item(
890 xfs_trans_t *tp, 872 xfs_trans_t *tp,
891 xfs_qoff_logitem_t *qlp) 873 xfs_qoff_logitem_t *qlp)
892{ 874{
893 xfs_log_item_desc_t *lidp;
894
895 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
896 ASSERT(lidp != NULL);
897
898 tp->t_flags |= XFS_TRANS_DIRTY; 875 tp->t_flags |= XFS_TRANS_DIRTY;
899 lidp->lid_flags |= XFS_LID_DIRTY; 876 qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
900} 877}
901 878
902STATIC void 879STATIC void
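The xfs_trans_dquot.c hunks above replace xfs_trans_find_item() searches with a direct li_desc back-pointer that is stored in the log item when it joins the transaction, so dirtying an item becomes one pointer chase instead of a list walk. A toy illustration of that design choice, with invented names:

	#include <assert.h>
	#include <stdio.h>

	struct item_desc { int lid_flags; };

	struct log_item {
		struct item_desc *li_desc;	/* set when joined to a transaction */
	};

	#define LID_DIRTY 0x1

	/* Joining an item records its descriptor in the item itself. */
	static void trans_add_item(struct log_item *lip, struct item_desc *desc)
	{
		lip->li_desc = desc;
	}

	/* Logging no longer searches the transaction's item list. */
	static void trans_log_item(struct log_item *lip)
	{
		assert(lip->li_desc != NULL);
		lip->li_desc->lid_flags |= LID_DIRTY;
	}

	int main(void)
	{
		struct item_desc desc = { 0 };
		struct log_item item = { 0 };

		trans_add_item(&item, &desc);
		trans_log_item(&item);
		printf("dirty=%d\n", desc.lid_flags & LID_DIRTY);
		return 0;
	}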
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 3f3610a7ee0..975aa10e1a4 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -22,7 +22,6 @@
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dmapi.h"
26#include "xfs_mount.h" 25#include "xfs_mount.h"
27#include "xfs_error.h" 26#include "xfs_error.h"
28 27
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4eed4e..63c7a1a6c02 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,15 @@ typedef struct xfs_perag {
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
+	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
+	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
+
+	/* buffer cache index */
+	spinlock_t	pag_buf_lock;	/* lock for pag_buf_tree */
+	struct rb_root	pag_buf_tree;	/* ordered tree of active buffers */
+
+	/* for rcu-safe freeing */
+	struct rcu_head	rcu_head;
 #endif
 	int		pagb_count;	/* pagb slots in use */
 } xfs_perag_t;
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index a7fbe8a99b1..112abc439ca 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -24,18 +24,13 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_btree.h" 33#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 34#include "xfs_alloc.h"
40#include "xfs_error.h" 35#include "xfs_error.h"
41#include "xfs_trace.h" 36#include "xfs_trace.h"
@@ -680,7 +675,7 @@ xfs_alloc_ag_vextent_near(
680 xfs_agblock_t gtbnoa; /* aligned ... */ 675 xfs_agblock_t gtbnoa; /* aligned ... */
681 xfs_extlen_t gtdiff; /* difference to right side entry */ 676 xfs_extlen_t gtdiff; /* difference to right side entry */
682 xfs_extlen_t gtlen; /* length of right side entry */ 677 xfs_extlen_t gtlen; /* length of right side entry */
683 xfs_extlen_t gtlena; /* aligned ... */ 678 xfs_extlen_t gtlena = 0; /* aligned ... */
684 xfs_agblock_t gtnew; /* useful start bno of right side */ 679 xfs_agblock_t gtnew; /* useful start bno of right side */
685 int error; /* error code */ 680 int error; /* error code */
686 int i; /* result code, temporary */ 681 int i; /* result code, temporary */
@@ -688,10 +683,8 @@ xfs_alloc_ag_vextent_near(
688 xfs_agblock_t ltbno; /* start bno of left side entry */ 683 xfs_agblock_t ltbno; /* start bno of left side entry */
689 xfs_agblock_t ltbnoa; /* aligned ... */ 684 xfs_agblock_t ltbnoa; /* aligned ... */
690 xfs_extlen_t ltdiff; /* difference to left side entry */ 685 xfs_extlen_t ltdiff; /* difference to left side entry */
691 /*REFERENCED*/
692 xfs_agblock_t ltend; /* end bno of left side entry */
693 xfs_extlen_t ltlen; /* length of left side entry */ 686 xfs_extlen_t ltlen; /* length of left side entry */
694 xfs_extlen_t ltlena; /* aligned ... */ 687 xfs_extlen_t ltlena = 0; /* aligned ... */
695 xfs_agblock_t ltnew; /* useful start bno of left side */ 688 xfs_agblock_t ltnew; /* useful start bno of left side */
696 xfs_extlen_t rlen; /* length of returned extent */ 689 xfs_extlen_t rlen; /* length of returned extent */
697#if defined(DEBUG) && defined(__KERNEL__) 690#if defined(DEBUG) && defined(__KERNEL__)
@@ -814,8 +807,7 @@ xfs_alloc_ag_vextent_near(
814 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 807 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
815 goto error0; 808 goto error0;
816 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 809 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
817 ltend = ltbno + ltlen; 810 ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
818 ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
819 args->len = blen; 811 args->len = blen;
820 if (!xfs_alloc_fix_minleft(args)) { 812 if (!xfs_alloc_fix_minleft(args)) {
821 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 813 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -828,7 +820,7 @@ xfs_alloc_ag_vextent_near(
828 */ 820 */
829 args->agbno = bnew; 821 args->agbno = bnew;
830 ASSERT(bnew >= ltbno); 822 ASSERT(bnew >= ltbno);
831 ASSERT(bnew + blen <= ltend); 823 ASSERT(bnew + blen <= ltbno + ltlen);
832 /* 824 /*
833 * Set up a cursor for the by-bno tree. 825 * Set up a cursor for the by-bno tree.
834 */ 826 */
@@ -1157,7 +1149,6 @@ xfs_alloc_ag_vextent_near(
1157 /* 1149 /*
1158 * Fix up the length and compute the useful address. 1150 * Fix up the length and compute the useful address.
1159 */ 1151 */
1160 ltend = ltbno + ltlen;
1161 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1152 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1162 xfs_alloc_fix_len(args); 1153 xfs_alloc_fix_len(args);
1163 if (!xfs_alloc_fix_minleft(args)) { 1154 if (!xfs_alloc_fix_minleft(args)) {
@@ -1170,7 +1161,7 @@ xfs_alloc_ag_vextent_near(
1170 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, 1161 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
1171 ltlen, &ltnew); 1162 ltlen, &ltnew);
1172 ASSERT(ltnew >= ltbno); 1163 ASSERT(ltnew >= ltbno);
1173 ASSERT(ltnew + rlen <= ltend); 1164 ASSERT(ltnew + rlen <= ltbno + ltlen);
1174 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1165 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1175 args->agbno = ltnew; 1166 args->agbno = ltnew;
1176 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, 1167 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
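
[Note: two small cleanups in xfs_alloc_ag_vextent_near(). ltend was only ever
the sum ltbno + ltlen, so the assertions now spell the sum out, and
gtlena/ltlena gain explicit zero initializers because only one of the
left/right search legs may assign them before they are tested. Roughly:

	xfs_extlen_t	gtlena = 0;	/* stays 0 if no right-side extent */
	xfs_extlen_t	ltlena = 0;	/* stays 0 if no left-side extent */

	/*
	 * Later code compares the two sides; without the initializers the
	 * untaken leg would feed an indeterminate stack value into that
	 * comparison (simplified illustration of the hazard).
	 */
]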
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 6d05199b667..895009a9727 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -27,16 +27,16 @@ struct xfs_busy_extent;
27/* 27/*
28 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 28 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
29 */ 29 */
30typedef enum xfs_alloctype 30#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */
31{ 31#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */
32 XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */ 32#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */
33 XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */ 33#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */
34 XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */ 34#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */
35 XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */ 35#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */
36 XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */ 36#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */
37 XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */ 37
38 XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */ 38/* this should become an enum again when the tracing code is fixed */
39} xfs_alloctype_t; 39typedef unsigned int xfs_alloctype_t;
40 40
41#define XFS_ALLOC_TYPES \ 41#define XFS_ALLOC_TYPES \
42 { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ 42 { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \
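
[Note: the alloctype enum becomes a set of #defines because, as the in-line
comment says, the tracing code currently needs plain constants. The
XFS_ALLOC_TYPES table pairs each value with a printable name in exactly the
form __print_symbolic() consumes when the trace buffer is read. Illustrative
use, not part of this hunk:

	TP_printk("... type %s ...",
		  __print_symbolic(__entry->type, XFS_ALLOC_TYPES))
]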
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 83f49421875..3916925e258 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -24,19 +24,14 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_btree.h" 33#include "xfs_btree.h"
38#include "xfs_btree_trace.h" 34#include "xfs_btree_trace.h"
39#include "xfs_ialloc.h"
40#include "xfs_alloc.h" 35#include "xfs_alloc.h"
41#include "xfs_error.h" 36#include "xfs_error.h"
42#include "xfs_trace.h" 37#include "xfs_trace.h"
@@ -285,38 +280,6 @@ xfs_allocbt_key_diff(
285 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
286} 281}
287 282
288STATIC int
289xfs_allocbt_kill_root(
290 struct xfs_btree_cur *cur,
291 struct xfs_buf *bp,
292 int level,
293 union xfs_btree_ptr *newroot)
294{
295 int error;
296
297 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
298 XFS_BTREE_STATS_INC(cur, killroot);
299
300 /*
301 * Update the root pointer, decreasing the level by 1 and then
302 * free the old root.
303 */
304 xfs_allocbt_set_root(cur, newroot, -1);
305 error = xfs_allocbt_free_block(cur, bp);
306 if (error) {
307 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
308 return error;
309 }
310
311 XFS_BTREE_STATS_INC(cur, free);
312
313 xfs_btree_setbuf(cur, level, NULL);
314 cur->bc_nlevels--;
315
316 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
317 return 0;
318}
319
320#ifdef DEBUG 283#ifdef DEBUG
321STATIC int 284STATIC int
322xfs_allocbt_keys_inorder( 285xfs_allocbt_keys_inorder(
@@ -428,7 +391,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
428 391
429 .dup_cursor = xfs_allocbt_dup_cursor, 392 .dup_cursor = xfs_allocbt_dup_cursor,
430 .set_root = xfs_allocbt_set_root, 393 .set_root = xfs_allocbt_set_root,
431 .kill_root = xfs_allocbt_kill_root,
432 .alloc_block = xfs_allocbt_alloc_block, 394 .alloc_block = xfs_allocbt_alloc_block,
433 .free_block = xfs_allocbt_free_block, 395 .free_block = xfs_allocbt_free_block,
434 .update_lastrec = xfs_allocbt_update_lastrec, 396 .update_lastrec = xfs_allocbt_update_lastrec,
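
[Note: xfs_allocbt_kill_root() disappears from the ops table, presumably because
the generic btree code now performs the root-kill sequence itself through the
remaining set_root/free_block callbacks. For reference, the removed callback
boiled down to (variable names as in the deleted function):

	xfs_allocbt_set_root(cur, newroot, -1);	  /* drop root one level */
	error = xfs_allocbt_free_block(cur, bp);  /* free the old root block */
	if (!error) {
		xfs_btree_setbuf(cur, level, NULL);
		cur->bc_nlevels--;
	}
]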
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b9c196a53c4..c8637537881 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -25,19 +25,13 @@
25#include "xfs_trans.h" 25#include "xfs_trans.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h" 27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_dmapi.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
32#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h" 31#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 32#include "xfs_dinode.h"
38#include "xfs_inode.h" 33#include "xfs_inode.h"
39#include "xfs_alloc.h" 34#include "xfs_alloc.h"
40#include "xfs_btree.h"
41#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
42#include "xfs_bmap.h" 36#include "xfs_bmap.h"
43#include "xfs_attr.h" 37#include "xfs_attr.h"
@@ -325,8 +319,7 @@ xfs_attr_set_int(
325 return (error); 319 return (error);
326 } 320 }
327 321
328 xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); 322 xfs_trans_ijoin(args.trans, dp);
329 xfs_trans_ihold(args.trans, dp);
330 323
331 /* 324 /*
332 * If the attribute list is non-existent or a shortform list, 325 * If the attribute list is non-existent or a shortform list,
@@ -362,16 +355,15 @@ xfs_attr_set_int(
362 if (mp->m_flags & XFS_MOUNT_WSYNC) { 355 if (mp->m_flags & XFS_MOUNT_WSYNC) {
363 xfs_trans_set_sync(args.trans); 356 xfs_trans_set_sync(args.trans);
364 } 357 }
358
359 if (!error && (flags & ATTR_KERNOTIME) == 0) {
360 xfs_trans_ichgtime(args.trans, dp,
361 XFS_ICHGTIME_CHG);
362 }
365 err2 = xfs_trans_commit(args.trans, 363 err2 = xfs_trans_commit(args.trans,
366 XFS_TRANS_RELEASE_LOG_RES); 364 XFS_TRANS_RELEASE_LOG_RES);
367 xfs_iunlock(dp, XFS_ILOCK_EXCL); 365 xfs_iunlock(dp, XFS_ILOCK_EXCL);
368 366
369 /*
370 * Hit the inode change time.
371 */
372 if (!error && (flags & ATTR_KERNOTIME) == 0) {
373 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
374 }
375 return(error == 0 ? err2 : error); 367 return(error == 0 ? err2 : error);
376 } 368 }
377 369
@@ -396,10 +388,8 @@ xfs_attr_set_int(
396 * bmap_finish() may have committed the last trans and started 388 * bmap_finish() may have committed the last trans and started
397 * a new one. We need the inode to be in all transactions. 389 * a new one. We need the inode to be in all transactions.
398 */ 390 */
399 if (committed) { 391 if (committed)
400 xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); 392 xfs_trans_ijoin(args.trans, dp);
401 xfs_trans_ihold(args.trans, dp);
402 }
403 393
404 /* 394 /*
405 * Commit the leaf transformation. We'll need another (linked) 395 * Commit the leaf transformation. We'll need another (linked)
@@ -429,6 +419,9 @@ xfs_attr_set_int(
429 xfs_trans_set_sync(args.trans); 419 xfs_trans_set_sync(args.trans);
430 } 420 }
431 421
422 if ((flags & ATTR_KERNOTIME) == 0)
423 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
424
432 /* 425 /*
433 * Commit the last in the sequence of transactions. 426 * Commit the last in the sequence of transactions.
434 */ 427 */
@@ -436,13 +429,6 @@ xfs_attr_set_int(
436 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 429 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
437 xfs_iunlock(dp, XFS_ILOCK_EXCL); 430 xfs_iunlock(dp, XFS_ILOCK_EXCL);
438 431
439 /*
440 * Hit the inode change time.
441 */
442 if (!error && (flags & ATTR_KERNOTIME) == 0) {
443 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
444 }
445
446 return(error); 432 return(error);
447 433
448out: 434out:
@@ -544,8 +530,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
544 * No need to make quota reservations here. We expect to release some 530 * No need to make quota reservations here. We expect to release some
545 * blocks not allocate in the common case. 531 * blocks not allocate in the common case.
546 */ 532 */
547 xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); 533 xfs_trans_ijoin(args.trans, dp);
548 xfs_trans_ihold(args.trans, dp);
549 534
550 /* 535 /*
551 * Decide on what work routines to call based on the inode size. 536 * Decide on what work routines to call based on the inode size.
@@ -577,6 +562,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
577 xfs_trans_set_sync(args.trans); 562 xfs_trans_set_sync(args.trans);
578 } 563 }
579 564
565 if ((flags & ATTR_KERNOTIME) == 0)
566 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
567
580 /* 568 /*
581 * Commit the last in the sequence of transactions. 569 * Commit the last in the sequence of transactions.
582 */ 570 */
@@ -584,13 +572,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
584 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 572 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
585 xfs_iunlock(dp, XFS_ILOCK_EXCL); 573 xfs_iunlock(dp, XFS_ILOCK_EXCL);
586 574
587 /*
588 * Hit the inode change time.
589 */
590 if (!error && (flags & ATTR_KERNOTIME) == 0) {
591 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
592 }
593
594 return(error); 575 return(error);
595 576
596out: 577out:
@@ -821,8 +802,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
821 * No need to make quota reservations here. We expect to release some 802 * No need to make quota reservations here. We expect to release some
822 * blocks, not allocate, in the common case. 803 * blocks, not allocate, in the common case.
823 */ 804 */
824 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); 805 xfs_trans_ijoin(trans, dp);
825 xfs_trans_ihold(trans, dp);
826 806
827 /* 807 /*
828 * Decide on what work routines to call based on the inode size. 808 * Decide on what work routines to call based on the inode size.
@@ -981,10 +961,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
981 * bmap_finish() may have committed the last trans and started 961 * bmap_finish() may have committed the last trans and started
982 * a new one. We need the inode to be in all transactions. 962 * a new one. We need the inode to be in all transactions.
983 */ 963 */
984 if (committed) { 964 if (committed)
985 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); 965 xfs_trans_ijoin(args->trans, dp);
986 xfs_trans_ihold(args->trans, dp);
987 }
988 966
989 /* 967 /*
990 * Commit the current trans (including the inode) and start 968 * Commit the current trans (including the inode) and start
@@ -1085,10 +1063,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1085 * and started a new one. We need the inode to be 1063 * and started a new one. We need the inode to be
1086 * in all transactions. 1064 * in all transactions.
1087 */ 1065 */
1088 if (committed) { 1066 if (committed)
1089 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); 1067 xfs_trans_ijoin(args->trans, dp);
1090 xfs_trans_ihold(args->trans, dp);
1091 }
1092 } else 1068 } else
1093 xfs_da_buf_done(bp); 1069 xfs_da_buf_done(bp);
1094 1070
@@ -1161,10 +1137,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1161 * bmap_finish() may have committed the last trans and started 1137 * bmap_finish() may have committed the last trans and started
1162 * a new one. We need the inode to be in all transactions. 1138 * a new one. We need the inode to be in all transactions.
1163 */ 1139 */
1164 if (committed) { 1140 if (committed)
1165 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); 1141 xfs_trans_ijoin(args->trans, dp);
1166 xfs_trans_ihold(args->trans, dp);
1167 }
1168 } else 1142 } else
1169 xfs_da_buf_done(bp); 1143 xfs_da_buf_done(bp);
1170 return(0); 1144 return(0);
@@ -1317,10 +1291,8 @@ restart:
1317 * and started a new one. We need the inode to be 1291 * and started a new one. We need the inode to be
1318 * in all transactions. 1292 * in all transactions.
1319 */ 1293 */
1320 if (committed) { 1294 if (committed)
1321 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); 1295 xfs_trans_ijoin(args->trans, dp);
1322 xfs_trans_ihold(args->trans, dp);
1323 }
1324 1296
1325 /* 1297 /*
1326 * Commit the node conversion and start the next 1298 * Commit the node conversion and start the next
@@ -1356,10 +1328,8 @@ restart:
1356 * bmap_finish() may have committed the last trans and started 1328 * bmap_finish() may have committed the last trans and started
1357 * a new one. We need the inode to be in all transactions. 1329 * a new one. We need the inode to be in all transactions.
1358 */ 1330 */
1359 if (committed) { 1331 if (committed)
1360 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); 1332 xfs_trans_ijoin(args->trans, dp);
1361 xfs_trans_ihold(args->trans, dp);
1362 }
1363 } else { 1333 } else {
1364 /* 1334 /*
1365 * Addition succeeded, update Btree hashvals. 1335 * Addition succeeded, update Btree hashvals.
@@ -1470,10 +1440,8 @@ restart:
1470 * and started a new one. We need the inode to be 1440 * and started a new one. We need the inode to be
1471 * in all transactions. 1441 * in all transactions.
1472 */ 1442 */
1473 if (committed) { 1443 if (committed)
1474 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); 1444 xfs_trans_ijoin(args->trans, dp);
1475 xfs_trans_ihold(args->trans, dp);
1476 }
1477 } 1445 }
1478 1446
1479 /* 1447 /*
@@ -1604,10 +1572,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1604 * bmap_finish() may have committed the last trans and started 1572 * bmap_finish() may have committed the last trans and started
1605 * a new one. We need the inode to be in all transactions. 1573 * a new one. We need the inode to be in all transactions.
1606 */ 1574 */
1607 if (committed) { 1575 if (committed)
1608 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); 1576 xfs_trans_ijoin(args->trans, dp);
1609 xfs_trans_ihold(args->trans, dp);
1610 }
1611 1577
1612 /* 1578 /*
1613 * Commit the Btree join operation and start a new trans. 1579 * Commit the Btree join operation and start a new trans.
@@ -1658,10 +1624,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1658 * and started a new one. We need the inode to be 1624 * and started a new one. We need the inode to be
1659 * in all transactions. 1625 * in all transactions.
1660 */ 1626 */
1661 if (committed) { 1627 if (committed)
1662 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); 1628 xfs_trans_ijoin(args->trans, dp);
1663 xfs_trans_ihold(args->trans, dp);
1664 }
1665 } else 1629 } else
1666 xfs_da_brelse(args->trans, bp); 1630 xfs_da_brelse(args->trans, bp);
1667 } 1631 }
@@ -2004,7 +1968,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
2004 error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, 1968 error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
2005 args->rmtblkcnt, 1969 args->rmtblkcnt,
2006 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 1970 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2007 NULL, 0, map, &nmap, NULL, NULL); 1971 NULL, 0, map, &nmap, NULL);
2008 if (error) 1972 if (error)
2009 return(error); 1973 return(error);
2010 ASSERT(nmap >= 1); 1974 ASSERT(nmap >= 1);
@@ -2022,7 +1986,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
2022 1986
2023 tmp = (valuelen < XFS_BUF_SIZE(bp)) 1987 tmp = (valuelen < XFS_BUF_SIZE(bp))
2024 ? valuelen : XFS_BUF_SIZE(bp); 1988 ? valuelen : XFS_BUF_SIZE(bp);
2025 xfs_biomove(bp, 0, tmp, dst, XBF_READ); 1989 xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
2026 xfs_buf_relse(bp); 1990 xfs_buf_relse(bp);
2027 dst += tmp; 1991 dst += tmp;
2028 valuelen -= tmp; 1992 valuelen -= tmp;
@@ -2083,7 +2047,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2083 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | 2047 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
2084 XFS_BMAPI_WRITE, 2048 XFS_BMAPI_WRITE,
2085 args->firstblock, args->total, &map, &nmap, 2049 args->firstblock, args->total, &map, &nmap,
2086 args->flist, NULL); 2050 args->flist);
2087 if (!error) { 2051 if (!error) {
2088 error = xfs_bmap_finish(&args->trans, args->flist, 2052 error = xfs_bmap_finish(&args->trans, args->flist,
2089 &committed); 2053 &committed);
@@ -2099,10 +2063,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2099 * bmap_finish() may have committed the last trans and started 2063 * bmap_finish() may have committed the last trans and started
2100 * a new one. We need the inode to be in all transactions. 2064 * a new one. We need the inode to be in all transactions.
2101 */ 2065 */
2102 if (committed) { 2066 if (committed)
2103 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); 2067 xfs_trans_ijoin(args->trans, dp);
2104 xfs_trans_ihold(args->trans, dp);
2105 }
2106 2068
2107 ASSERT(nmap == 1); 2069 ASSERT(nmap == 1);
2108 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 2070 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2136,7 +2098,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2136 args->rmtblkcnt, 2098 args->rmtblkcnt,
2137 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2099 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2138 args->firstblock, 0, &map, &nmap, 2100 args->firstblock, 0, &map, &nmap,
2139 NULL, NULL); 2101 NULL);
2140 if (error) { 2102 if (error) {
2141 return(error); 2103 return(error);
2142 } 2104 }
@@ -2154,9 +2116,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2154 2116
2155 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2117 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2156 XFS_BUF_SIZE(bp); 2118 XFS_BUF_SIZE(bp);
2157 xfs_biomove(bp, 0, tmp, src, XBF_WRITE); 2119 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
2158 if (tmp < XFS_BUF_SIZE(bp)) 2120 if (tmp < XFS_BUF_SIZE(bp))
2159 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2121 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2160 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2122 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
2161 return (error); 2123 return (error);
2162 } 2124 }
@@ -2201,7 +2163,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2201 args->rmtblkcnt, 2163 args->rmtblkcnt,
2202 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2164 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2203 args->firstblock, 0, &map, &nmap, 2165 args->firstblock, 0, &map, &nmap,
2204 args->flist, NULL); 2166 args->flist);
2205 if (error) { 2167 if (error) {
2206 return(error); 2168 return(error);
2207 } 2169 }
@@ -2239,7 +2201,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2239 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, 2201 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
2240 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2202 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2241 1, args->firstblock, args->flist, 2203 1, args->firstblock, args->flist,
2242 NULL, &done); 2204 &done);
2243 if (!error) { 2205 if (!error) {
2244 error = xfs_bmap_finish(&args->trans, args->flist, 2206 error = xfs_bmap_finish(&args->trans, args->flist,
2245 &committed); 2207 &committed);
@@ -2255,10 +2217,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2255 * bmap_finish() may have committed the last trans and started 2217 * bmap_finish() may have committed the last trans and started
2256 * a new one. We need the inode to be in all transactions. 2218 * a new one. We need the inode to be in all transactions.
2257 */ 2219 */
2258 if (committed) { 2220 if (committed)
2259 xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL); 2221 xfs_trans_ijoin(args->trans, args->dp);
2260 xfs_trans_ihold(args->trans, args->dp);
2261 }
2262 2222
2263 /* 2223 /*
2264 * Close out trans and start the next one in the chain. 2224 * Close out trans and start the next one in the chain.
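
[Note: four API changes run through the xfs_attr.c hunks. xfs_trans_ijoin()
loses its lock-flags argument and absorbs the old xfs_trans_ihold() call; the
timestamp bump moves from an unlogged xfs_ichgtime() after the commit to
xfs_trans_ichgtime() inside the transaction, so the update is logged with it;
xfs_bmapi()/xfs_bunmapi() drop their unused delta argument; and the buffer
helpers are renamed xfs_biomove/xfs_biozero -> xfs_buf_iomove/xfs_buf_zero
with XBRW_READ/XBRW_WRITE direction flags. The join idiom, side by side:

	/* before: join plus explicit hold, unlock flags from the caller */
	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
	xfs_trans_ihold(args.trans, dp);

	/* after: one call; the inode stays attached across the commit */
	xfs_trans_ijoin(args.trans, dp);
]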
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a90ce74fc25..a6cff8edcdb 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -24,8 +24,6 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
@@ -33,7 +31,6 @@
33#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
34#include "xfs_alloc.h" 32#include "xfs_alloc.h"
35#include "xfs_btree.h" 33#include "xfs_btree.h"
36#include "xfs_dir2_sf.h"
37#include "xfs_attr_sf.h" 34#include "xfs_attr_sf.h"
38#include "xfs_dinode.h" 35#include "xfs_dinode.h"
39#include "xfs_inode.h" 36#include "xfs_inode.h"
@@ -2931,7 +2928,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2931 nmap = 1; 2928 nmap = 1;
2932 error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, 2929 error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
2933 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2930 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2934 NULL, 0, &map, &nmap, NULL, NULL); 2931 NULL, 0, &map, &nmap, NULL);
2935 if (error) { 2932 if (error) {
2936 return(error); 2933 return(error);
2937 } 2934 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 99587ded043..8abd12e32e1 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -30,13 +30,10 @@
30#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
32#include "xfs_dir2_sf.h" 32#include "xfs_dir2_sf.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dinode.h" 33#include "xfs_dinode.h"
35#include "xfs_inode.h" 34#include "xfs_inode.h"
36#include "xfs_btree.h" 35#include "xfs_btree.h"
37#include "xfs_dmapi.h"
38#include "xfs_mount.h" 36#include "xfs_mount.h"
39#include "xfs_ialloc.h"
40#include "xfs_itable.h" 37#include "xfs_itable.h"
41#include "xfs_dir2_data.h" 38#include "xfs_dir2_data.h"
42#include "xfs_dir2_leaf.h" 39#include "xfs_dir2_leaf.h"
@@ -104,7 +101,6 @@ xfs_bmap_add_extent(
104 xfs_fsblock_t *first, /* pointer to firstblock variable */ 101 xfs_fsblock_t *first, /* pointer to firstblock variable */
105 xfs_bmap_free_t *flist, /* list of extents to be freed */ 102 xfs_bmap_free_t *flist, /* list of extents to be freed */
106 int *logflagsp, /* inode logging flags */ 103 int *logflagsp, /* inode logging flags */
107 xfs_extdelta_t *delta, /* Change made to incore extents */
108 int whichfork, /* data or attr fork */ 104 int whichfork, /* data or attr fork */
109 int rsvd); /* OK to allocate reserved blocks */ 105 int rsvd); /* OK to allocate reserved blocks */
110 106
@@ -122,7 +118,6 @@ xfs_bmap_add_extent_delay_real(
122 xfs_fsblock_t *first, /* pointer to firstblock variable */ 118 xfs_fsblock_t *first, /* pointer to firstblock variable */
123 xfs_bmap_free_t *flist, /* list of extents to be freed */ 119 xfs_bmap_free_t *flist, /* list of extents to be freed */
124 int *logflagsp, /* inode logging flags */ 120 int *logflagsp, /* inode logging flags */
125 xfs_extdelta_t *delta, /* Change made to incore extents */
126 int rsvd); /* OK to allocate reserved blocks */ 121 int rsvd); /* OK to allocate reserved blocks */
127 122
128/* 123/*
@@ -135,7 +130,6 @@ xfs_bmap_add_extent_hole_delay(
135 xfs_extnum_t idx, /* extent number to update/insert */ 130 xfs_extnum_t idx, /* extent number to update/insert */
136 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 131 xfs_bmbt_irec_t *new, /* new data to add to file extents */
137 int *logflagsp,/* inode logging flags */ 132 int *logflagsp,/* inode logging flags */
138 xfs_extdelta_t *delta, /* Change made to incore extents */
139 int rsvd); /* OK to allocate reserved blocks */ 133 int rsvd); /* OK to allocate reserved blocks */
140 134
141/* 135/*
@@ -149,7 +143,6 @@ xfs_bmap_add_extent_hole_real(
149 xfs_btree_cur_t *cur, /* if null, not a btree */ 143 xfs_btree_cur_t *cur, /* if null, not a btree */
150 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 144 xfs_bmbt_irec_t *new, /* new data to add to file extents */
151 int *logflagsp, /* inode logging flags */ 145 int *logflagsp, /* inode logging flags */
152 xfs_extdelta_t *delta, /* Change made to incore extents */
153 int whichfork); /* data or attr fork */ 146 int whichfork); /* data or attr fork */
154 147
155/* 148/*
@@ -162,8 +155,7 @@ xfs_bmap_add_extent_unwritten_real(
162 xfs_extnum_t idx, /* extent number to update/insert */ 155 xfs_extnum_t idx, /* extent number to update/insert */
163 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 156 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
164 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 157 xfs_bmbt_irec_t *new, /* new data to add to file extents */
165 int *logflagsp, /* inode logging flags */ 158 int *logflagsp); /* inode logging flags */
166 xfs_extdelta_t *delta); /* Change made to incore extents */
167 159
168/* 160/*
169 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. 161 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
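
[Note: every static helper in xfs_bmap.c sheds its xfs_extdelta_t *delta
out-parameter in this series; nothing consumed the incore-extent-change report
any more, so the "DELTA:" bookkeeping in the hunks below goes with it. Callers
of the public entry point shrink the same way, e.g. (sketch):

	error = xfs_bmapi(tp, ip, bno, len, flags, &firstblock, total,
			  map, &nmap, flist);	/* trailing delta arg gone */
]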
@@ -200,7 +192,6 @@ xfs_bmap_del_extent(
200 xfs_btree_cur_t *cur, /* if null, not a btree */ 192 xfs_btree_cur_t *cur, /* if null, not a btree */
201 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 193 xfs_bmbt_irec_t *new, /* new data to add to file extents */
202 int *logflagsp,/* inode logging flags */ 194 int *logflagsp,/* inode logging flags */
203 xfs_extdelta_t *delta, /* Change made to incore extents */
204 int whichfork, /* data or attr fork */ 195 int whichfork, /* data or attr fork */
205 int rsvd); /* OK to allocate reserved blocks */ 196 int rsvd); /* OK to allocate reserved blocks */
206 197
@@ -489,7 +480,6 @@ xfs_bmap_add_extent(
489 xfs_fsblock_t *first, /* pointer to firstblock variable */ 480 xfs_fsblock_t *first, /* pointer to firstblock variable */
490 xfs_bmap_free_t *flist, /* list of extents to be freed */ 481 xfs_bmap_free_t *flist, /* list of extents to be freed */
491 int *logflagsp, /* inode logging flags */ 482 int *logflagsp, /* inode logging flags */
492 xfs_extdelta_t *delta, /* Change made to incore extents */
493 int whichfork, /* data or attr fork */ 483 int whichfork, /* data or attr fork */
494 int rsvd) /* OK to use reserved data blocks */ 484 int rsvd) /* OK to use reserved data blocks */
495{ 485{
@@ -524,15 +514,6 @@ xfs_bmap_add_extent(
524 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); 514 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
525 } else 515 } else
526 logflags = 0; 516 logflags = 0;
527 /* DELTA: single new extent */
528 if (delta) {
529 if (delta->xed_startoff > new->br_startoff)
530 delta->xed_startoff = new->br_startoff;
531 if (delta->xed_blockcount <
532 new->br_startoff + new->br_blockcount)
533 delta->xed_blockcount = new->br_startoff +
534 new->br_blockcount;
535 }
536 } 517 }
537 /* 518 /*
538 * Any kind of new delayed allocation goes here. 519 * Any kind of new delayed allocation goes here.
@@ -542,7 +523,7 @@ xfs_bmap_add_extent(
542 ASSERT((cur->bc_private.b.flags & 523 ASSERT((cur->bc_private.b.flags &
543 XFS_BTCUR_BPRV_WASDEL) == 0); 524 XFS_BTCUR_BPRV_WASDEL) == 0);
544 if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, 525 if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
545 &logflags, delta, rsvd))) 526 &logflags, rsvd)))
546 goto done; 527 goto done;
547 } 528 }
548 /* 529 /*
@@ -553,7 +534,7 @@ xfs_bmap_add_extent(
553 ASSERT((cur->bc_private.b.flags & 534 ASSERT((cur->bc_private.b.flags &
554 XFS_BTCUR_BPRV_WASDEL) == 0); 535 XFS_BTCUR_BPRV_WASDEL) == 0);
555 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, 536 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
556 &logflags, delta, whichfork))) 537 &logflags, whichfork)))
557 goto done; 538 goto done;
558 } else { 539 } else {
559 xfs_bmbt_irec_t prev; /* old extent at offset idx */ 540 xfs_bmbt_irec_t prev; /* old extent at offset idx */
@@ -578,17 +559,17 @@ xfs_bmap_add_extent(
578 XFS_BTCUR_BPRV_WASDEL); 559 XFS_BTCUR_BPRV_WASDEL);
579 if ((error = xfs_bmap_add_extent_delay_real(ip, 560 if ((error = xfs_bmap_add_extent_delay_real(ip,
580 idx, &cur, new, &da_new, first, flist, 561 idx, &cur, new, &da_new, first, flist,
581 &logflags, delta, rsvd))) 562 &logflags, rsvd)))
582 goto done; 563 goto done;
583 } else if (new->br_state == XFS_EXT_NORM) { 564 } else if (new->br_state == XFS_EXT_NORM) {
584 ASSERT(new->br_state == XFS_EXT_NORM); 565 ASSERT(new->br_state == XFS_EXT_NORM);
585 if ((error = xfs_bmap_add_extent_unwritten_real( 566 if ((error = xfs_bmap_add_extent_unwritten_real(
586 ip, idx, &cur, new, &logflags, delta))) 567 ip, idx, &cur, new, &logflags)))
587 goto done; 568 goto done;
588 } else { 569 } else {
589 ASSERT(new->br_state == XFS_EXT_UNWRITTEN); 570 ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
590 if ((error = xfs_bmap_add_extent_unwritten_real( 571 if ((error = xfs_bmap_add_extent_unwritten_real(
591 ip, idx, &cur, new, &logflags, delta))) 572 ip, idx, &cur, new, &logflags)))
592 goto done; 573 goto done;
593 } 574 }
594 ASSERT(*curp == cur || *curp == NULL); 575 ASSERT(*curp == cur || *curp == NULL);
@@ -601,7 +582,7 @@ xfs_bmap_add_extent(
601 ASSERT((cur->bc_private.b.flags & 582 ASSERT((cur->bc_private.b.flags &
602 XFS_BTCUR_BPRV_WASDEL) == 0); 583 XFS_BTCUR_BPRV_WASDEL) == 0);
603 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, 584 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
604 new, &logflags, delta, whichfork))) 585 new, &logflags, whichfork)))
605 goto done; 586 goto done;
606 } 587 }
607 } 588 }
@@ -633,7 +614,7 @@ xfs_bmap_add_extent(
633 nblks += cur->bc_private.b.allocated; 614 nblks += cur->bc_private.b.allocated;
634 ASSERT(nblks <= da_old); 615 ASSERT(nblks <= da_old);
635 if (nblks < da_old) 616 if (nblks < da_old)
636 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 617 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
637 (int64_t)(da_old - nblks), rsvd); 618 (int64_t)(da_old - nblks), rsvd);
638 } 619 }
639 /* 620 /*
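
[Note: the free-block counter updates switch from xfs_mod_incore_sb() to calling
xfs_icsb_modify_counters() directly, taking the per-cpu in-core superblock
counter path without the generic field dispatcher in the middle. The call
shape is unchanged (taken from the hunks here):

	error = xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
					 -((int64_t)diff), rsvd);
]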
@@ -666,7 +647,6 @@ xfs_bmap_add_extent_delay_real(
666 xfs_fsblock_t *first, /* pointer to firstblock variable */ 647 xfs_fsblock_t *first, /* pointer to firstblock variable */
667 xfs_bmap_free_t *flist, /* list of extents to be freed */ 648 xfs_bmap_free_t *flist, /* list of extents to be freed */
668 int *logflagsp, /* inode logging flags */ 649 int *logflagsp, /* inode logging flags */
669 xfs_extdelta_t *delta, /* Change made to incore extents */
670 int rsvd) /* OK to use reserved data block allocation */ 650 int rsvd) /* OK to use reserved data block allocation */
671{ 651{
672 xfs_btree_cur_t *cur; /* btree cursor */ 652 xfs_btree_cur_t *cur; /* btree cursor */
@@ -797,11 +777,6 @@ xfs_bmap_add_extent_delay_real(
797 goto done; 777 goto done;
798 } 778 }
799 *dnew = 0; 779 *dnew = 0;
800 /* DELTA: Three in-core extents are replaced by one. */
801 temp = LEFT.br_startoff;
802 temp2 = LEFT.br_blockcount +
803 PREV.br_blockcount +
804 RIGHT.br_blockcount;
805 break; 780 break;
806 781
807 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: 782 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -832,10 +807,6 @@ xfs_bmap_add_extent_delay_real(
832 goto done; 807 goto done;
833 } 808 }
834 *dnew = 0; 809 *dnew = 0;
835 /* DELTA: Two in-core extents are replaced by one. */
836 temp = LEFT.br_startoff;
837 temp2 = LEFT.br_blockcount +
838 PREV.br_blockcount;
839 break; 810 break;
840 811
841 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: 812 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -867,10 +838,6 @@ xfs_bmap_add_extent_delay_real(
867 goto done; 838 goto done;
868 } 839 }
869 *dnew = 0; 840 *dnew = 0;
870 /* DELTA: Two in-core extents are replaced by one. */
871 temp = PREV.br_startoff;
872 temp2 = PREV.br_blockcount +
873 RIGHT.br_blockcount;
874 break; 841 break;
875 842
876 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: 843 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -900,9 +867,6 @@ xfs_bmap_add_extent_delay_real(
900 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 867 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
901 } 868 }
902 *dnew = 0; 869 *dnew = 0;
903 /* DELTA: The in-core extent described by new changed type. */
904 temp = new->br_startoff;
905 temp2 = new->br_blockcount;
906 break; 870 break;
907 871
908 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: 872 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -942,10 +906,6 @@ xfs_bmap_add_extent_delay_real(
942 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 906 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
943 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 907 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
944 *dnew = temp; 908 *dnew = temp;
945 /* DELTA: The boundary between two in-core extents moved. */
946 temp = LEFT.br_startoff;
947 temp2 = LEFT.br_blockcount +
948 PREV.br_blockcount;
949 break; 909 break;
950 910
951 case BMAP_LEFT_FILLING: 911 case BMAP_LEFT_FILLING:
@@ -990,9 +950,6 @@ xfs_bmap_add_extent_delay_real(
990 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 950 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
991 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 951 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
992 *dnew = temp; 952 *dnew = temp;
993 /* DELTA: One in-core extent is split in two. */
994 temp = PREV.br_startoff;
995 temp2 = PREV.br_blockcount;
996 break; 953 break;
997 954
998 case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: 955 case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1031,10 +988,6 @@ xfs_bmap_add_extent_delay_real(
1031 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 988 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1032 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 989 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
1033 *dnew = temp; 990 *dnew = temp;
1034 /* DELTA: The boundary between two in-core extents moved. */
1035 temp = PREV.br_startoff;
1036 temp2 = PREV.br_blockcount +
1037 RIGHT.br_blockcount;
1038 break; 991 break;
1039 992
1040 case BMAP_RIGHT_FILLING: 993 case BMAP_RIGHT_FILLING:
@@ -1078,9 +1031,6 @@ xfs_bmap_add_extent_delay_real(
1078 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 1031 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1079 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1032 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
1080 *dnew = temp; 1033 *dnew = temp;
1081 /* DELTA: One in-core extent is split in two. */
1082 temp = PREV.br_startoff;
1083 temp2 = PREV.br_blockcount;
1084 break; 1034 break;
1085 1035
1086 case 0: 1036 case 0:
@@ -1129,7 +1079,8 @@ xfs_bmap_add_extent_delay_real(
1129 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 1079 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
1130 (cur ? cur->bc_private.b.allocated : 0)); 1080 (cur ? cur->bc_private.b.allocated : 0));
1131 if (diff > 0 && 1081 if (diff > 0 &&
1132 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { 1082 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1083 -((int64_t)diff), rsvd)) {
1133 /* 1084 /*
1134 * Ick gross gag me with a spoon. 1085 * Ick gross gag me with a spoon.
1135 */ 1086 */
@@ -1139,16 +1090,18 @@ xfs_bmap_add_extent_delay_real(
1139 temp--; 1090 temp--;
1140 diff--; 1091 diff--;
1141 if (!diff || 1092 if (!diff ||
1142 !xfs_mod_incore_sb(ip->i_mount, 1093 !xfs_icsb_modify_counters(ip->i_mount,
1143 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1094 XFS_SBS_FDBLOCKS,
1095 -((int64_t)diff), rsvd))
1144 break; 1096 break;
1145 } 1097 }
1146 if (temp2) { 1098 if (temp2) {
1147 temp2--; 1099 temp2--;
1148 diff--; 1100 diff--;
1149 if (!diff || 1101 if (!diff ||
1150 !xfs_mod_incore_sb(ip->i_mount, 1102 !xfs_icsb_modify_counters(ip->i_mount,
1151 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1103 XFS_SBS_FDBLOCKS,
1104 -((int64_t)diff), rsvd))
1152 break; 1105 break;
1153 } 1106 }
1154 } 1107 }
@@ -1161,9 +1114,6 @@ xfs_bmap_add_extent_delay_real(
1161 nullstartblock((int)temp2)); 1114 nullstartblock((int)temp2));
1162 trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); 1115 trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
1163 *dnew = temp + temp2; 1116 *dnew = temp + temp2;
1164 /* DELTA: One in-core extent is split in three. */
1165 temp = PREV.br_startoff;
1166 temp2 = PREV.br_blockcount;
1167 break; 1117 break;
1168 1118
1169 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: 1119 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1179,13 +1129,6 @@ xfs_bmap_add_extent_delay_real(
1179 ASSERT(0); 1129 ASSERT(0);
1180 } 1130 }
1181 *curp = cur; 1131 *curp = cur;
1182 if (delta) {
1183 temp2 += temp;
1184 if (delta->xed_startoff > temp)
1185 delta->xed_startoff = temp;
1186 if (delta->xed_blockcount < temp2)
1187 delta->xed_blockcount = temp2;
1188 }
1189done: 1132done:
1190 *logflagsp = rval; 1133 *logflagsp = rval;
1191 return error; 1134 return error;
@@ -1204,8 +1147,7 @@ xfs_bmap_add_extent_unwritten_real(
1204 xfs_extnum_t idx, /* extent number to update/insert */ 1147 xfs_extnum_t idx, /* extent number to update/insert */
1205 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 1148 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
1206 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1149 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1207 int *logflagsp, /* inode logging flags */ 1150 int *logflagsp) /* inode logging flags */
1208 xfs_extdelta_t *delta) /* Change made to incore extents */
1209{ 1151{
1210 xfs_btree_cur_t *cur; /* btree cursor */ 1152 xfs_btree_cur_t *cur; /* btree cursor */
1211 xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ 1153 xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
@@ -1219,8 +1161,6 @@ xfs_bmap_add_extent_unwritten_real(
1219 /* left is 0, right is 1, prev is 2 */ 1161 /* left is 0, right is 1, prev is 2 */
1220 int rval=0; /* return value (logging flags) */ 1162 int rval=0; /* return value (logging flags) */
1221 int state = 0;/* state bits, accessed thru macros */ 1163 int state = 0;/* state bits, accessed thru macros */
1222 xfs_filblks_t temp=0;
1223 xfs_filblks_t temp2=0;
1224 1164
1225#define LEFT r[0] 1165#define LEFT r[0]
1226#define RIGHT r[1] 1166#define RIGHT r[1]
@@ -1341,11 +1281,6 @@ xfs_bmap_add_extent_unwritten_real(
1341 RIGHT.br_blockcount, LEFT.br_state))) 1281 RIGHT.br_blockcount, LEFT.br_state)))
1342 goto done; 1282 goto done;
1343 } 1283 }
1344 /* DELTA: Three in-core extents are replaced by one. */
1345 temp = LEFT.br_startoff;
1346 temp2 = LEFT.br_blockcount +
1347 PREV.br_blockcount +
1348 RIGHT.br_blockcount;
1349 break; 1284 break;
1350 1285
1351 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: 1286 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1382,10 +1317,6 @@ xfs_bmap_add_extent_unwritten_real(
1382 LEFT.br_state))) 1317 LEFT.br_state)))
1383 goto done; 1318 goto done;
1384 } 1319 }
1385 /* DELTA: Two in-core extents are replaced by one. */
1386 temp = LEFT.br_startoff;
1387 temp2 = LEFT.br_blockcount +
1388 PREV.br_blockcount;
1389 break; 1320 break;
1390 1321
1391 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: 1322 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1422,10 +1353,6 @@ xfs_bmap_add_extent_unwritten_real(
1422 newext))) 1353 newext)))
1423 goto done; 1354 goto done;
1424 } 1355 }
1425 /* DELTA: Two in-core extents are replaced by one. */
1426 temp = PREV.br_startoff;
1427 temp2 = PREV.br_blockcount +
1428 RIGHT.br_blockcount;
1429 break; 1356 break;
1430 1357
1431 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: 1358 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1453,9 +1380,6 @@ xfs_bmap_add_extent_unwritten_real(
1453 newext))) 1380 newext)))
1454 goto done; 1381 goto done;
1455 } 1382 }
1456 /* DELTA: The in-core extent described by new changed type. */
1457 temp = new->br_startoff;
1458 temp2 = new->br_blockcount;
1459 break; 1383 break;
1460 1384
1461 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: 1385 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1501,10 +1425,6 @@ xfs_bmap_add_extent_unwritten_real(
1501 LEFT.br_state)) 1425 LEFT.br_state))
1502 goto done; 1426 goto done;
1503 } 1427 }
1504 /* DELTA: The boundary between two in-core extents moved. */
1505 temp = LEFT.br_startoff;
1506 temp2 = LEFT.br_blockcount +
1507 PREV.br_blockcount;
1508 break; 1428 break;
1509 1429
1510 case BMAP_LEFT_FILLING: 1430 case BMAP_LEFT_FILLING:
@@ -1544,9 +1464,6 @@ xfs_bmap_add_extent_unwritten_real(
1544 goto done; 1464 goto done;
1545 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1465 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1546 } 1466 }
1547 /* DELTA: One in-core extent is split in two. */
1548 temp = PREV.br_startoff;
1549 temp2 = PREV.br_blockcount;
1550 break; 1467 break;
1551 1468
1552 case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: 1469 case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1587,10 +1504,6 @@ xfs_bmap_add_extent_unwritten_real(
1587 newext))) 1504 newext)))
1588 goto done; 1505 goto done;
1589 } 1506 }
1590 /* DELTA: The boundary between two in-core extents moved. */
1591 temp = PREV.br_startoff;
1592 temp2 = PREV.br_blockcount +
1593 RIGHT.br_blockcount;
1594 break; 1507 break;
1595 1508
1596 case BMAP_RIGHT_FILLING: 1509 case BMAP_RIGHT_FILLING:
@@ -1630,9 +1543,6 @@ xfs_bmap_add_extent_unwritten_real(
1630 goto done; 1543 goto done;
1631 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1544 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1632 } 1545 }
1633 /* DELTA: One in-core extent is split in two. */
1634 temp = PREV.br_startoff;
1635 temp2 = PREV.br_blockcount;
1636 break; 1546 break;
1637 1547
1638 case 0: 1548 case 0:
@@ -1692,9 +1602,6 @@ xfs_bmap_add_extent_unwritten_real(
1692 goto done; 1602 goto done;
1693 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1603 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1694 } 1604 }
1695 /* DELTA: One in-core extent is split in three. */
1696 temp = PREV.br_startoff;
1697 temp2 = PREV.br_blockcount;
1698 break; 1605 break;
1699 1606
1700 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: 1607 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1710,13 +1617,6 @@ xfs_bmap_add_extent_unwritten_real(
1710 ASSERT(0); 1617 ASSERT(0);
1711 } 1618 }
1712 *curp = cur; 1619 *curp = cur;
1713 if (delta) {
1714 temp2 += temp;
1715 if (delta->xed_startoff > temp)
1716 delta->xed_startoff = temp;
1717 if (delta->xed_blockcount < temp2)
1718 delta->xed_blockcount = temp2;
1719 }
1720done: 1620done:
1721 *logflagsp = rval; 1621 *logflagsp = rval;
1722 return error; 1622 return error;
@@ -1736,7 +1636,6 @@ xfs_bmap_add_extent_hole_delay(
1736 xfs_extnum_t idx, /* extent number to update/insert */ 1636 xfs_extnum_t idx, /* extent number to update/insert */
1737 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1637 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1738 int *logflagsp, /* inode logging flags */ 1638 int *logflagsp, /* inode logging flags */
1739 xfs_extdelta_t *delta, /* Change made to incore extents */
1740 int rsvd) /* OK to allocate reserved blocks */ 1639 int rsvd) /* OK to allocate reserved blocks */
1741{ 1640{
1742 xfs_bmbt_rec_host_t *ep; /* extent record for idx */ 1641 xfs_bmbt_rec_host_t *ep; /* extent record for idx */
@@ -1747,7 +1646,6 @@ xfs_bmap_add_extent_hole_delay(
1747 xfs_bmbt_irec_t right; /* right neighbor extent entry */ 1646 xfs_bmbt_irec_t right; /* right neighbor extent entry */
1748 int state; /* state bits, accessed thru macros */ 1647 int state; /* state bits, accessed thru macros */
1749 xfs_filblks_t temp=0; /* temp for indirect calculations */ 1648 xfs_filblks_t temp=0; /* temp for indirect calculations */
1750 xfs_filblks_t temp2=0;
1751 1649
1752 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1650 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1753 ep = xfs_iext_get_ext(ifp, idx); 1651 ep = xfs_iext_get_ext(ifp, idx);
@@ -1819,9 +1717,6 @@ xfs_bmap_add_extent_hole_delay(
1819 1717
1820 xfs_iext_remove(ip, idx, 1, state); 1718 xfs_iext_remove(ip, idx, 1, state);
1821 ip->i_df.if_lastex = idx - 1; 1719 ip->i_df.if_lastex = idx - 1;
1822 /* DELTA: Two in-core extents were replaced by one. */
1823 temp2 = temp;
1824 temp = left.br_startoff;
1825 break; 1720 break;
1826 1721
1827 case BMAP_LEFT_CONTIG: 1722 case BMAP_LEFT_CONTIG:
@@ -1841,9 +1736,6 @@ xfs_bmap_add_extent_hole_delay(
1841 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1736 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
1842 1737
1843 ip->i_df.if_lastex = idx - 1; 1738 ip->i_df.if_lastex = idx - 1;
1844 /* DELTA: One in-core extent grew into a hole. */
1845 temp2 = temp;
1846 temp = left.br_startoff;
1847 break; 1739 break;
1848 1740
1849 case BMAP_RIGHT_CONTIG: 1741 case BMAP_RIGHT_CONTIG:
@@ -1862,9 +1754,6 @@ xfs_bmap_add_extent_hole_delay(
1862 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1754 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
1863 1755
1864 ip->i_df.if_lastex = idx; 1756 ip->i_df.if_lastex = idx;
1865 /* DELTA: One in-core extent grew into a hole. */
1866 temp2 = temp;
1867 temp = new->br_startoff;
1868 break; 1757 break;
1869 1758
1870 case 0: 1759 case 0:
@@ -1876,26 +1765,16 @@ xfs_bmap_add_extent_hole_delay(
1876 oldlen = newlen = 0; 1765 oldlen = newlen = 0;
1877 xfs_iext_insert(ip, idx, 1, new, state); 1766 xfs_iext_insert(ip, idx, 1, new, state);
1878 ip->i_df.if_lastex = idx; 1767 ip->i_df.if_lastex = idx;
1879 /* DELTA: A new in-core extent was added in a hole. */
1880 temp2 = new->br_blockcount;
1881 temp = new->br_startoff;
1882 break; 1768 break;
1883 } 1769 }
1884 if (oldlen != newlen) { 1770 if (oldlen != newlen) {
1885 ASSERT(oldlen > newlen); 1771 ASSERT(oldlen > newlen);
1886 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 1772 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1887 (int64_t)(oldlen - newlen), rsvd); 1773 (int64_t)(oldlen - newlen), rsvd);
1888 /* 1774 /*
1889 * Nothing to do for disk quota accounting here. 1775 * Nothing to do for disk quota accounting here.
1890 */ 1776 */
1891 } 1777 }
1892 if (delta) {
1893 temp2 += temp;
1894 if (delta->xed_startoff > temp)
1895 delta->xed_startoff = temp;
1896 if (delta->xed_blockcount < temp2)
1897 delta->xed_blockcount = temp2;
1898 }
1899 *logflagsp = 0; 1778 *logflagsp = 0;
1900 return 0; 1779 return 0;
1901} 1780}
@@ -1911,7 +1790,6 @@ xfs_bmap_add_extent_hole_real(
1911 xfs_btree_cur_t *cur, /* if null, not a btree */ 1790 xfs_btree_cur_t *cur, /* if null, not a btree */
1912 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1791 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1913 int *logflagsp, /* inode logging flags */ 1792 int *logflagsp, /* inode logging flags */
1914 xfs_extdelta_t *delta, /* Change made to incore extents */
1915 int whichfork) /* data or attr fork */ 1793 int whichfork) /* data or attr fork */
1916{ 1794{
1917 xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */ 1795 xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */
@@ -1922,8 +1800,6 @@ xfs_bmap_add_extent_hole_real(
1922 xfs_bmbt_irec_t right; /* right neighbor extent entry */ 1800 xfs_bmbt_irec_t right; /* right neighbor extent entry */
1923 int rval=0; /* return value (logging flags) */ 1801 int rval=0; /* return value (logging flags) */
1924 int state; /* state bits, accessed thru macros */ 1802 int state; /* state bits, accessed thru macros */
1925 xfs_filblks_t temp=0;
1926 xfs_filblks_t temp2=0;
1927 1803
1928 ifp = XFS_IFORK_PTR(ip, whichfork); 1804 ifp = XFS_IFORK_PTR(ip, whichfork);
1929 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 1805 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
@@ -2020,11 +1896,6 @@ xfs_bmap_add_extent_hole_real(
2020 left.br_state))) 1896 left.br_state)))
2021 goto done; 1897 goto done;
2022 } 1898 }
2023 /* DELTA: Two in-core extents were replaced by one. */
2024 temp = left.br_startoff;
2025 temp2 = left.br_blockcount +
2026 new->br_blockcount +
2027 right.br_blockcount;
2028 break; 1899 break;
2029 1900
2030 case BMAP_LEFT_CONTIG: 1901 case BMAP_LEFT_CONTIG:
@@ -2056,10 +1927,6 @@ xfs_bmap_add_extent_hole_real(
2056 left.br_state))) 1927 left.br_state)))
2057 goto done; 1928 goto done;
2058 } 1929 }
2059 /* DELTA: One in-core extent grew. */
2060 temp = left.br_startoff;
2061 temp2 = left.br_blockcount +
2062 new->br_blockcount;
2063 break; 1930 break;
2064 1931
2065 case BMAP_RIGHT_CONTIG: 1932 case BMAP_RIGHT_CONTIG:
@@ -2092,10 +1959,6 @@ xfs_bmap_add_extent_hole_real(
2092 right.br_state))) 1959 right.br_state)))
2093 goto done; 1960 goto done;
2094 } 1961 }
2095 /* DELTA: One in-core extent grew. */
2096 temp = new->br_startoff;
2097 temp2 = new->br_blockcount +
2098 right.br_blockcount;
2099 break; 1962 break;
2100 1963
2101 case 0: 1964 case 0:
@@ -2123,18 +1986,8 @@ xfs_bmap_add_extent_hole_real(
2123 goto done; 1986 goto done;
2124 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1987 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2125 } 1988 }
2126 /* DELTA: A new extent was added in a hole. */
2127 temp = new->br_startoff;
2128 temp2 = new->br_blockcount;
2129 break; 1989 break;
2130 } 1990 }
2131 if (delta) {
2132 temp2 += temp;
2133 if (delta->xed_startoff > temp)
2134 delta->xed_startoff = temp;
2135 if (delta->xed_blockcount < temp2)
2136 delta->xed_blockcount = temp2;
2137 }
2138done: 1991done:
2139 *logflagsp = rval; 1992 *logflagsp = rval;
2140 return error; 1993 return error;
@@ -2959,7 +2812,6 @@ xfs_bmap_del_extent(
2959 xfs_btree_cur_t *cur, /* if null, not a btree */ 2812 xfs_btree_cur_t *cur, /* if null, not a btree */
2960 xfs_bmbt_irec_t *del, /* data to remove from extents */ 2813 xfs_bmbt_irec_t *del, /* data to remove from extents */
2961 int *logflagsp, /* inode logging flags */ 2814 int *logflagsp, /* inode logging flags */
2962 xfs_extdelta_t *delta, /* Change made to incore extents */
2963 int whichfork, /* data or attr fork */ 2815 int whichfork, /* data or attr fork */
2964 int rsvd) /* OK to allocate reserved blocks */ 2816 int rsvd) /* OK to allocate reserved blocks */
2965{ 2817{
@@ -3262,16 +3114,9 @@ xfs_bmap_del_extent(
3262 * Nothing to do for disk quota accounting here. 3114 * Nothing to do for disk quota accounting here.
3263 */ 3115 */
3264 ASSERT(da_old >= da_new); 3116 ASSERT(da_old >= da_new);
3265 if (da_old > da_new) 3117 if (da_old > da_new) {
3266 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), 3118 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3267 rsvd); 3119 (int64_t)(da_old - da_new), rsvd);
3268 if (delta) {
3269 /* DELTA: report the original extent. */
3270 if (delta->xed_startoff > got.br_startoff)
3271 delta->xed_startoff = got.br_startoff;
3272 if (delta->xed_blockcount < got.br_startoff+got.br_blockcount)
3273 delta->xed_blockcount = got.br_startoff +
3274 got.br_blockcount;
3275 } 3120 }
3276done: 3121done:
3277 *logflagsp = flags; 3122 *logflagsp = flags;
@@ -3754,9 +3599,10 @@ xfs_bmap_add_attrfork(
3754 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 3599 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
3755 } 3600 }
3756 ASSERT(ip->i_d.di_anextents == 0); 3601 ASSERT(ip->i_d.di_anextents == 0);
3757 IHOLD(ip); 3602
3758 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 3603 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
3759 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 3604 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3605
3760 switch (ip->i_d.di_format) { 3606 switch (ip->i_d.di_format) {
3761 case XFS_DINODE_FMT_DEV: 3607 case XFS_DINODE_FMT_DEV:
3762 ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; 3608 ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
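
[Note: in xfs_bmap_add_attrfork() the reference-taking variant replaces the
open-coded pair:

	/* before: take a reference, then join */
	IHOLD(ip);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	/* after: one call grabs the reference and records the lock flags */
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
]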
@@ -4483,8 +4329,7 @@ xfs_bmapi(
4483 xfs_extlen_t total, /* total blocks needed */ 4329 xfs_extlen_t total, /* total blocks needed */
4484 xfs_bmbt_irec_t *mval, /* output: map values */ 4330 xfs_bmbt_irec_t *mval, /* output: map values */
4485 int *nmap, /* i/o: mval size/count */ 4331 int *nmap, /* i/o: mval size/count */
4486 xfs_bmap_free_t *flist, /* i/o: list extents to free */ 4332 xfs_bmap_free_t *flist) /* i/o: list extents to free */
4487 xfs_extdelta_t *delta) /* o: change made to incore extents */
4488{ 4333{
4489 xfs_fsblock_t abno; /* allocated block number */ 4334 xfs_fsblock_t abno; /* allocated block number */
4490 xfs_extlen_t alen; /* allocated extent length */ 4335 xfs_extlen_t alen; /* allocated extent length */
@@ -4596,10 +4441,7 @@ xfs_bmapi(
4596 end = bno + len; 4441 end = bno + len;
4597 obno = bno; 4442 obno = bno;
4598 bma.ip = NULL; 4443 bma.ip = NULL;
4599 if (delta) { 4444
4600 delta->xed_startoff = NULLFILEOFF;
4601 delta->xed_blockcount = 0;
4602 }
4603 while (bno < end && n < *nmap) { 4445 while (bno < end && n < *nmap) {
4604 /* 4446 /*
4605 * Reading past eof, act as though there's a hole 4447 * Reading past eof, act as though there's a hole
@@ -4620,19 +4462,13 @@ xfs_bmapi(
4620 * allocate the stuff asked for in this bmap call 4462 * allocate the stuff asked for in this bmap call
4621 * but that wouldn't be as good. 4463 * but that wouldn't be as good.
4622 */ 4464 */
4623 if (wasdelay && !(flags & XFS_BMAPI_EXACT)) { 4465 if (wasdelay) {
4624 alen = (xfs_extlen_t)got.br_blockcount; 4466 alen = (xfs_extlen_t)got.br_blockcount;
4625 aoff = got.br_startoff; 4467 aoff = got.br_startoff;
4626 if (lastx != NULLEXTNUM && lastx) { 4468 if (lastx != NULLEXTNUM && lastx) {
4627 ep = xfs_iext_get_ext(ifp, lastx - 1); 4469 ep = xfs_iext_get_ext(ifp, lastx - 1);
4628 xfs_bmbt_get_all(ep, &prev); 4470 xfs_bmbt_get_all(ep, &prev);
4629 } 4471 }
4630 } else if (wasdelay) {
4631 alen = (xfs_extlen_t)
4632 XFS_FILBLKS_MIN(len,
4633 (got.br_startoff +
4634 got.br_blockcount) - bno);
4635 aoff = bno;
4636 } else { 4472 } else {
4637 alen = (xfs_extlen_t) 4473 alen = (xfs_extlen_t)
4638 XFS_FILBLKS_MIN(len, MAXEXTLEN); 4474 XFS_FILBLKS_MIN(len, MAXEXTLEN);
@@ -4694,13 +4530,13 @@ xfs_bmapi(
4694 -((int64_t)extsz), (flags & 4530 -((int64_t)extsz), (flags &
4695 XFS_BMAPI_RSVBLOCKS)); 4531 XFS_BMAPI_RSVBLOCKS));
4696 } else { 4532 } else {
4697 error = xfs_mod_incore_sb(mp, 4533 error = xfs_icsb_modify_counters(mp,
4698 XFS_SBS_FDBLOCKS, 4534 XFS_SBS_FDBLOCKS,
4699 -((int64_t)alen), (flags & 4535 -((int64_t)alen), (flags &
4700 XFS_BMAPI_RSVBLOCKS)); 4536 XFS_BMAPI_RSVBLOCKS));
4701 } 4537 }
4702 if (!error) { 4538 if (!error) {
4703 error = xfs_mod_incore_sb(mp, 4539 error = xfs_icsb_modify_counters(mp,
4704 XFS_SBS_FDBLOCKS, 4540 XFS_SBS_FDBLOCKS,
4705 -((int64_t)indlen), (flags & 4541 -((int64_t)indlen), (flags &
4706 XFS_BMAPI_RSVBLOCKS)); 4542 XFS_BMAPI_RSVBLOCKS));
@@ -4710,7 +4546,7 @@ xfs_bmapi(
4710 (int64_t)extsz, (flags & 4546 (int64_t)extsz, (flags &
4711 XFS_BMAPI_RSVBLOCKS)); 4547 XFS_BMAPI_RSVBLOCKS));
4712 else if (error) 4548 else if (error)
4713 xfs_mod_incore_sb(mp, 4549 xfs_icsb_modify_counters(mp,
4714 XFS_SBS_FDBLOCKS, 4550 XFS_SBS_FDBLOCKS,
4715 (int64_t)alen, (flags & 4551 (int64_t)alen, (flags &
4716 XFS_BMAPI_RSVBLOCKS)); 4552 XFS_BMAPI_RSVBLOCKS));
@@ -4831,7 +4667,7 @@ xfs_bmapi(
4831 got.br_state = XFS_EXT_UNWRITTEN; 4667 got.br_state = XFS_EXT_UNWRITTEN;
4832 } 4668 }
4833 error = xfs_bmap_add_extent(ip, lastx, &cur, &got, 4669 error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
4834 firstblock, flist, &tmp_logflags, delta, 4670 firstblock, flist, &tmp_logflags,
4835 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4671 whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
4836 logflags |= tmp_logflags; 4672 logflags |= tmp_logflags;
4837 if (error) 4673 if (error)
@@ -4912,8 +4748,12 @@ xfs_bmapi(
4912 * Check if writing previously allocated but 4748 * Check if writing previously allocated but
4913 * unwritten extents. 4749 * unwritten extents.
4914 */ 4750 */
4915 if (wr && mval->br_state == XFS_EXT_UNWRITTEN && 4751 if (wr &&
4916 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { 4752 ((mval->br_state == XFS_EXT_UNWRITTEN &&
4753 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
4754 (mval->br_state == XFS_EXT_NORM &&
4755 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
4756 (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
4917 /* 4757 /*
4918 * Modify (by adding) the state flag, if writing. 4758 * Modify (by adding) the state flag, if writing.
4919 */ 4759 */
@@ -4925,9 +4765,11 @@ xfs_bmapi(
4925 *firstblock; 4765 *firstblock;
4926 cur->bc_private.b.flist = flist; 4766 cur->bc_private.b.flist = flist;
4927 } 4767 }
4928 mval->br_state = XFS_EXT_NORM; 4768 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4769 ? XFS_EXT_NORM
4770 : XFS_EXT_UNWRITTEN;
4929 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4771 error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
4930 firstblock, flist, &tmp_logflags, delta, 4772 firstblock, flist, &tmp_logflags,
4931 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4773 whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
4932 logflags |= tmp_logflags; 4774 logflags |= tmp_logflags;
 4933 		if (error)					 4775 		if (error)
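
A condensed sketch of the state selection introduced above: the conversion target now depends on the extent's current state instead of being hard-coded to XFS_EXT_NORM (assumes only the two states shown in the hunk):

	/* unwritten extents become written; written extents become
	 * unwritten when XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT is set */
	mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
				? XFS_EXT_NORM
				: XFS_EXT_UNWRITTEN;
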
@@ -5017,14 +4859,6 @@ xfs_bmapi(
5017 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || 4859 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
5018 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); 4860 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
5019 error = 0; 4861 error = 0;
5020 if (delta && delta->xed_startoff != NULLFILEOFF) {
5021 /* A change was actually made.
 5022 	 * Note that delta->xed_blockcount is an
5023 * point and needs to be converted to a block count.
5024 */
5025 ASSERT(delta->xed_blockcount > delta->xed_startoff);
5026 delta->xed_blockcount -= delta->xed_startoff;
5027 }
5028error0: 4862error0:
5029 /* 4863 /*
5030 * Log everything. Do this after conversion, there's no point in 4864 * Log everything. Do this after conversion, there's no point in
@@ -5136,8 +4970,6 @@ xfs_bunmapi(
5136 xfs_fsblock_t *firstblock, /* first allocated block 4970 xfs_fsblock_t *firstblock, /* first allocated block
5137 controls a.g. for allocs */ 4971 controls a.g. for allocs */
5138 xfs_bmap_free_t *flist, /* i/o: list extents to free */ 4972 xfs_bmap_free_t *flist, /* i/o: list extents to free */
5139 xfs_extdelta_t *delta, /* o: change made to incore
5140 extents */
5141 int *done) /* set if not done yet */ 4973 int *done) /* set if not done yet */
5142{ 4974{
5143 xfs_btree_cur_t *cur; /* bmap btree cursor */ 4975 xfs_btree_cur_t *cur; /* bmap btree cursor */
@@ -5196,10 +5028,7 @@ xfs_bunmapi(
5196 bno = start + len - 1; 5028 bno = start + len - 1;
5197 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, 5029 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
5198 &prev); 5030 &prev);
5199 if (delta) { 5031
5200 delta->xed_startoff = NULLFILEOFF;
5201 delta->xed_blockcount = 0;
5202 }
5203 /* 5032 /*
5204 * Check to see if the given block number is past the end of the 5033 * Check to see if the given block number is past the end of the
5205 * file, back up to the last block if so... 5034 * file, back up to the last block if so...
@@ -5297,7 +5126,7 @@ xfs_bunmapi(
5297 } 5126 }
5298 del.br_state = XFS_EXT_UNWRITTEN; 5127 del.br_state = XFS_EXT_UNWRITTEN;
5299 error = xfs_bmap_add_extent(ip, lastx, &cur, &del, 5128 error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
5300 firstblock, flist, &logflags, delta, 5129 firstblock, flist, &logflags,
5301 XFS_DATA_FORK, 0); 5130 XFS_DATA_FORK, 0);
5302 if (error) 5131 if (error)
5303 goto error0; 5132 goto error0;
@@ -5352,7 +5181,7 @@ xfs_bunmapi(
5352 prev.br_state = XFS_EXT_UNWRITTEN; 5181 prev.br_state = XFS_EXT_UNWRITTEN;
5353 error = xfs_bmap_add_extent(ip, lastx - 1, &cur, 5182 error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
5354 &prev, firstblock, flist, &logflags, 5183 &prev, firstblock, flist, &logflags,
5355 delta, XFS_DATA_FORK, 0); 5184 XFS_DATA_FORK, 0);
5356 if (error) 5185 if (error)
5357 goto error0; 5186 goto error0;
5358 goto nodelete; 5187 goto nodelete;
@@ -5361,7 +5190,7 @@ xfs_bunmapi(
5361 del.br_state = XFS_EXT_UNWRITTEN; 5190 del.br_state = XFS_EXT_UNWRITTEN;
5362 error = xfs_bmap_add_extent(ip, lastx, &cur, 5191 error = xfs_bmap_add_extent(ip, lastx, &cur,
5363 &del, firstblock, flist, &logflags, 5192 &del, firstblock, flist, &logflags,
5364 delta, XFS_DATA_FORK, 0); 5193 XFS_DATA_FORK, 0);
5365 if (error) 5194 if (error)
5366 goto error0; 5195 goto error0;
5367 goto nodelete; 5196 goto nodelete;
@@ -5381,7 +5210,7 @@ xfs_bunmapi(
5381 ip, -((long)del.br_blockcount), 0, 5210 ip, -((long)del.br_blockcount), 0,
5382 XFS_QMOPT_RES_RTBLKS); 5211 XFS_QMOPT_RES_RTBLKS);
5383 } else { 5212 } else {
5384 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, 5213 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5385 (int64_t)del.br_blockcount, rsvd); 5214 (int64_t)del.br_blockcount, rsvd);
5386 (void)xfs_trans_reserve_quota_nblks(NULL, 5215 (void)xfs_trans_reserve_quota_nblks(NULL,
5387 ip, -((long)del.br_blockcount), 0, 5216 ip, -((long)del.br_blockcount), 0,
@@ -5414,7 +5243,7 @@ xfs_bunmapi(
5414 goto error0; 5243 goto error0;
5415 } 5244 }
5416 error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, 5245 error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
5417 &tmp_logflags, delta, whichfork, rsvd); 5246 &tmp_logflags, whichfork, rsvd);
5418 logflags |= tmp_logflags; 5247 logflags |= tmp_logflags;
5419 if (error) 5248 if (error)
5420 goto error0; 5249 goto error0;
@@ -5471,14 +5300,6 @@ nodelete:
5471 ASSERT(ifp->if_ext_max == 5300 ASSERT(ifp->if_ext_max ==
5472 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); 5301 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5473 error = 0; 5302 error = 0;
5474 if (delta && delta->xed_startoff != NULLFILEOFF) {
5475 /* A change was actually made.
 5476 	 * Note that delta->xed_blockcount is an
5477 * point and needs to be converted to a block count.
5478 */
5479 ASSERT(delta->xed_blockcount > delta->xed_startoff);
5480 delta->xed_blockcount -= delta->xed_startoff;
5481 }
5482error0: 5303error0:
5483 /* 5304 /*
5484 * Log everything. Do this after conversion, there's no point in 5305 * Log everything. Do this after conversion, there's no point in
@@ -5605,28 +5426,6 @@ xfs_getbmap(
5605 prealloced = 0; 5426 prealloced = 0;
5606 fixlen = 1LL << 32; 5427 fixlen = 1LL << 32;
5607 } else { 5428 } else {
5608 /*
5609 * If the BMV_IF_NO_DMAPI_READ interface bit specified, do
5610 * not generate a DMAPI read event. Otherwise, if the
5611 * DM_EVENT_READ bit is set for the file, generate a read
5612 * event in order that the DMAPI application may do its thing
5613 * before we return the extents. Usually this means restoring
5614 * user file data to regions of the file that look like holes.
5615 *
5616 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
5617 * BMV_IF_NO_DMAPI_READ so that read events are generated.
5618 * If this were not true, callers of ioctl(XFS_IOC_GETBMAP)
5619 * could misinterpret holes in a DMAPI file as true holes,
5620 * when in fact they may represent offline user data.
5621 */
5622 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
5623 !(iflags & BMV_IF_NO_DMAPI_READ)) {
5624 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip,
5625 0, 0, 0, NULL);
5626 if (error)
5627 return XFS_ERROR(error);
5628 }
5629
5630 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && 5429 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
5631 ip->i_d.di_format != XFS_DINODE_FMT_BTREE && 5430 ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
5632 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) 5431 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
@@ -5713,7 +5512,7 @@ xfs_getbmap(
5713 error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), 5512 error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
5714 XFS_BB_TO_FSB(mp, bmv->bmv_length), 5513 XFS_BB_TO_FSB(mp, bmv->bmv_length),
5715 bmapi_flags, NULL, 0, map, &nmap, 5514 bmapi_flags, NULL, 0, map, &nmap,
5716 NULL, NULL); 5515 NULL);
5717 if (error) 5516 if (error)
5718 goto out_free_map; 5517 goto out_free_map;
5719 ASSERT(nmap <= subnex); 5518 ASSERT(nmap <= subnex);
@@ -5744,12 +5543,24 @@ xfs_getbmap(
5744 map[i].br_startblock)) 5543 map[i].br_startblock))
5745 goto out_free_map; 5544 goto out_free_map;
5746 5545
5747 nexleft--;
5748 bmv->bmv_offset = 5546 bmv->bmv_offset =
5749 out[cur_ext].bmv_offset + 5547 out[cur_ext].bmv_offset +
5750 out[cur_ext].bmv_length; 5548 out[cur_ext].bmv_length;
5751 bmv->bmv_length = 5549 bmv->bmv_length =
5752 max_t(__int64_t, 0, bmvend - bmv->bmv_offset); 5550 max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
5551
5552 /*
5553 * In case we don't want to return the hole,
5554 * don't increase cur_ext so that we can reuse
 5555 					 * it in the next loop iteration.
5556 */
5557 if ((iflags & BMV_IF_NO_HOLES) &&
5558 map[i].br_startblock == HOLESTARTBLOCK) {
5559 memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
5560 continue;
5561 }
5562
5563 nexleft--;
5753 bmv->bmv_entries++; 5564 bmv->bmv_entries++;
5754 cur_ext++; 5565 cur_ext++;
5755 } 5566 }
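
The reordered loop tail above first advances the request window, then decides whether the just-built entry is kept; a minimal sketch of the new control flow (field names as in the hunk):

	bmv->bmv_offset = out[cur_ext].bmv_offset + out[cur_ext].bmv_length;
	bmv->bmv_length = max_t(__int64_t, 0, bmvend - bmv->bmv_offset);

	/* a hole under BMV_IF_NO_HOLES is wiped and its slot reused */
	if ((iflags & BMV_IF_NO_HOLES) &&
	    map[i].br_startblock == HOLESTARTBLOCK) {
		memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
		continue;
	}
	nexleft--;		/* only consumed for entries we keep */
	bmv->bmv_entries++;
	cur_ext++;
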
@@ -5859,66 +5670,34 @@ xfs_bmap_eof(
5859} 5670}
5860 5671
5861#ifdef DEBUG 5672#ifdef DEBUG
5862STATIC 5673STATIC struct xfs_buf *
5863xfs_buf_t *
5864xfs_bmap_get_bp( 5674xfs_bmap_get_bp(
5865 xfs_btree_cur_t *cur, 5675 struct xfs_btree_cur *cur,
5866 xfs_fsblock_t bno) 5676 xfs_fsblock_t bno)
5867{ 5677{
5868 int i; 5678 struct xfs_log_item_desc *lidp;
5869 xfs_buf_t *bp; 5679 int i;
5870 5680
5871 if (!cur) 5681 if (!cur)
5872 return(NULL); 5682 return NULL;
5873
5874 bp = NULL;
5875 for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
5876 bp = cur->bc_bufs[i];
5877 if (!bp) break;
5878 if (XFS_BUF_ADDR(bp) == bno)
5879 break; /* Found it */
5880 }
5881 if (i == XFS_BTREE_MAXLEVELS)
5882 bp = NULL;
5883
5884 if (!bp) { /* Chase down all the log items to see if the bp is there */
5885 xfs_log_item_chunk_t *licp;
5886 xfs_trans_t *tp;
5887
5888 tp = cur->bc_tp;
5889 licp = &tp->t_items;
5890 while (!bp && licp != NULL) {
5891 if (xfs_lic_are_all_free(licp)) {
5892 licp = licp->lic_next;
5893 continue;
5894 }
5895 for (i = 0; i < licp->lic_unused; i++) {
5896 xfs_log_item_desc_t *lidp;
5897 xfs_log_item_t *lip;
5898 xfs_buf_log_item_t *bip;
5899 xfs_buf_t *lbp;
5900
5901 if (xfs_lic_isfree(licp, i)) {
5902 continue;
5903 }
5904
5905 lidp = xfs_lic_slot(licp, i);
5906 lip = lidp->lid_item;
5907 if (lip->li_type != XFS_LI_BUF)
5908 continue;
5909 5683
5910 bip = (xfs_buf_log_item_t *)lip; 5684 for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
5911 lbp = bip->bli_buf; 5685 if (!cur->bc_bufs[i])
5686 break;
5687 if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
5688 return cur->bc_bufs[i];
5689 }
5912 5690
5913 if (XFS_BUF_ADDR(lbp) == bno) { 5691 /* Chase down all the log items to see if the bp is there */
5914 bp = lbp; 5692 list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
5915 break; /* Found it */ 5693 struct xfs_buf_log_item *bip;
5916 } 5694 bip = (struct xfs_buf_log_item *)lidp->lid_item;
5917 } 5695 if (bip->bli_item.li_type == XFS_LI_BUF &&
5918 licp = licp->lic_next; 5696 XFS_BUF_ADDR(bip->bli_buf) == bno)
5919 } 5697 return bip->bli_buf;
5920 } 5698 }
5921 return(bp); 5699
5700 return NULL;
5922} 5701}
5923 5702
5924STATIC void 5703STATIC void
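
With the old per-chunk log item walk gone, the DEBUG-only buffer lookup reduces to two passes; a condensed sketch of the rewritten shape (types as in the hunk):

	struct xfs_log_item_desc	*lidp;
	int				i;

	/* 1) the cursor's own buffers */
	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
		if (cur->bc_bufs[i] && XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
			return cur->bc_bufs[i];

	/* 2) every buf log item joined to the transaction */
	list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
		struct xfs_buf_log_item *bip =
			(struct xfs_buf_log_item *)lidp->lid_item;
		if (bip->bli_item.li_type == XFS_LI_BUF &&
		    XFS_BUF_ADDR(bip->bli_buf) == bno)
			return bip->bli_buf;
	}
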
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 419dafb9d87..71ec9b6ecdf 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -28,20 +28,6 @@ struct xfs_trans;
28extern kmem_zone_t *xfs_bmap_free_item_zone; 28extern kmem_zone_t *xfs_bmap_free_item_zone;
29 29
30/* 30/*
31 * DELTA: describe a change to the in-core extent list.
32 *
   33 * Internally the use of xed_blockcount is somewhat funky.
34 * xed_blockcount contains an offset much of the time because this
35 * makes merging changes easier. (xfs_fileoff_t and xfs_filblks_t are
36 * the same underlying type).
37 */
38typedef struct xfs_extdelta
39{
40 xfs_fileoff_t xed_startoff; /* offset of range */
41 xfs_filblks_t xed_blockcount; /* blocks in range */
42} xfs_extdelta_t;
43
44/*
45 * List of extents to be free "later". 31 * List of extents to be free "later".
46 * The list is kept sorted on xbf_startblock. 32 * The list is kept sorted on xbf_startblock.
47 */ 33 */
@@ -82,27 +68,25 @@ typedef struct xfs_bmap_free
82#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ 68#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */
83#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ 69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
84#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ 70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
85#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */ 71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
86#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */ 72#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
87#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */ 73#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
88#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */ 74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
89#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */
90#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */
91 /* combine contig. space */ 75 /* combine contig. space */
92#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ 76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
93/* XFS_BMAPI_DIRECT_IO 0x800 */ 77/*
94#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */ 78 * unwritten extent conversion - this needs write cache flushing and no additional
95 /* need write cache flushing and no */ 79 * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
   96 					/* additional allocation alignments */	   80 * from written to unwritten, otherwise converts from unwritten to written.
81 */
82#define XFS_BMAPI_CONVERT 0x200
97 83
98#define XFS_BMAPI_FLAGS \ 84#define XFS_BMAPI_FLAGS \
99 { XFS_BMAPI_WRITE, "WRITE" }, \ 85 { XFS_BMAPI_WRITE, "WRITE" }, \
100 { XFS_BMAPI_DELAY, "DELAY" }, \ 86 { XFS_BMAPI_DELAY, "DELAY" }, \
101 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 87 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
102 { XFS_BMAPI_METADATA, "METADATA" }, \ 88 { XFS_BMAPI_METADATA, "METADATA" }, \
103 { XFS_BMAPI_EXACT, "EXACT" }, \
104 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ 89 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
105 { XFS_BMAPI_ASYNC, "ASYNC" }, \
106 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ 90 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
107 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 91 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
108 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 92 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
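
Under the renumbered flags, written-to-unwritten conversion is requested by combining PREALLOC with CONVERT; a hypothetical call site under the trimmed prototype (start_fsb, len_fsb, firstblock, map, nmap and free_list are illustrative names, not from this patch):

	error = xfs_bmapi(tp, ip, start_fsb, len_fsb,
			  XFS_BMAPI_WRITE | XFS_BMAPI_PREALLOC |
			  XFS_BMAPI_CONVERT,
			  &firstblock, 0, &map, &nmap, &free_list);
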
@@ -310,9 +294,7 @@ xfs_bmapi(
310 xfs_extlen_t total, /* total blocks needed */ 294 xfs_extlen_t total, /* total blocks needed */
311 struct xfs_bmbt_irec *mval, /* output: map values */ 295 struct xfs_bmbt_irec *mval, /* output: map values */
312 int *nmap, /* i/o: mval size/count */ 296 int *nmap, /* i/o: mval size/count */
313 xfs_bmap_free_t *flist, /* i/o: list extents to free */ 297 xfs_bmap_free_t *flist); /* i/o: list extents to free */
314 xfs_extdelta_t *delta); /* o: change made to incore
315 extents */
316 298
317/* 299/*
318 * Map file blocks to filesystem blocks, simple version. 300 * Map file blocks to filesystem blocks, simple version.
@@ -346,8 +328,6 @@ xfs_bunmapi(
346 xfs_fsblock_t *firstblock, /* first allocated block 328 xfs_fsblock_t *firstblock, /* first allocated block
347 controls a.g. for allocs */ 329 controls a.g. for allocs */
348 xfs_bmap_free_t *flist, /* i/o: list extents to free */ 330 xfs_bmap_free_t *flist, /* i/o: list extents to free */
349 xfs_extdelta_t *delta, /* o: change made to incore
350 extents */
351 int *done); /* set if not done yet */ 331 int *done); /* set if not done yet */
352 332
353/* 333/*
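
For comparison, a read-only mapping lookup now ends in a single NULL for the free list, as in the xfs_getbmap() call site earlier in this patch; a minimal sketch (offset_fsb and count_fsb are illustrative names):

	struct xfs_bmbt_irec	map;
	int			nmap = 1;

	/* no transaction, no allocation: tp, firstblock and flist are NULL */
	error = xfs_bmapi(NULL, ip, offset_fsb, count_fsb,
			  XFS_BMAPI_ENTIRE, NULL, 0, &map, &nmap, NULL);
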
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 416e47e54b8..87d3c10b695 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,21 +24,16 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
38#include "xfs_alloc.h" 34#include "xfs_alloc.h"
39#include "xfs_btree.h" 35#include "xfs_btree.h"
40#include "xfs_btree_trace.h" 36#include "xfs_btree_trace.h"
41#include "xfs_ialloc.h"
42#include "xfs_itable.h" 37#include "xfs_itable.h"
43#include "xfs_bmap.h" 38#include "xfs_bmap.h"
44#include "xfs_error.h" 39#include "xfs_error.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 96be4b0f249..04f9cca8da7 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -24,20 +24,15 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
38#include "xfs_btree.h" 34#include "xfs_btree.h"
39#include "xfs_btree_trace.h" 35#include "xfs_btree_trace.h"
40#include "xfs_ialloc.h"
41#include "xfs_error.h" 36#include "xfs_error.h"
42#include "xfs_trace.h" 37#include "xfs_trace.h"
43 38
@@ -222,7 +217,7 @@ xfs_btree_del_cursor(
222 */ 217 */
223 for (i = 0; i < cur->bc_nlevels; i++) { 218 for (i = 0; i < cur->bc_nlevels; i++) {
224 if (cur->bc_bufs[i]) 219 if (cur->bc_bufs[i])
225 xfs_btree_setbuf(cur, i, NULL); 220 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
226 else if (!error) 221 else if (!error)
227 break; 222 break;
228 } 223 }
@@ -661,7 +656,7 @@ xfs_btree_reada_bufl(
661 656
662 ASSERT(fsbno != NULLFSBLOCK); 657 ASSERT(fsbno != NULLFSBLOCK);
663 d = XFS_FSB_TO_DADDR(mp, fsbno); 658 d = XFS_FSB_TO_DADDR(mp, fsbno);
664 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 659 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
665} 660}
666 661
667/* 662/*
@@ -681,7 +676,7 @@ xfs_btree_reada_bufs(
681 ASSERT(agno != NULLAGNUMBER); 676 ASSERT(agno != NULLAGNUMBER);
682 ASSERT(agbno != NULLAGBLOCK); 677 ASSERT(agbno != NULLAGBLOCK);
683 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 678 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
684 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 679 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
685} 680}
686 681
687STATIC int 682STATIC int
@@ -768,22 +763,19 @@ xfs_btree_readahead(
768 * Set the buffer for level "lev" in the cursor to bp, releasing 763 * Set the buffer for level "lev" in the cursor to bp, releasing
769 * any previous buffer. 764 * any previous buffer.
770 */ 765 */
771void 766STATIC void
772xfs_btree_setbuf( 767xfs_btree_setbuf(
773 xfs_btree_cur_t *cur, /* btree cursor */ 768 xfs_btree_cur_t *cur, /* btree cursor */
774 int lev, /* level in btree */ 769 int lev, /* level in btree */
775 xfs_buf_t *bp) /* new buffer to set */ 770 xfs_buf_t *bp) /* new buffer to set */
776{ 771{
777 struct xfs_btree_block *b; /* btree block */ 772 struct xfs_btree_block *b; /* btree block */
778 xfs_buf_t *obp; /* old buffer pointer */
779 773
780 obp = cur->bc_bufs[lev]; 774 if (cur->bc_bufs[lev])
781 if (obp) 775 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
782 xfs_trans_brelse(cur->bc_tp, obp);
783 cur->bc_bufs[lev] = bp; 776 cur->bc_bufs[lev] = bp;
784 cur->bc_ra[lev] = 0; 777 cur->bc_ra[lev] = 0;
785 if (!bp) 778
786 return;
787 b = XFS_BUF_TO_BLOCK(bp); 779 b = XFS_BUF_TO_BLOCK(bp);
788 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 780 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
789 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) 781 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
@@ -3016,6 +3008,43 @@ out0:
3016 return 0; 3008 return 0;
3017} 3009}
3018 3010
3011/*
 3012 * Kill the current root node, and replace it with its only child node.
3013 */
3014STATIC int
3015xfs_btree_kill_root(
3016 struct xfs_btree_cur *cur,
3017 struct xfs_buf *bp,
3018 int level,
3019 union xfs_btree_ptr *newroot)
3020{
3021 int error;
3022
3023 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3024 XFS_BTREE_STATS_INC(cur, killroot);
3025
3026 /*
3027 * Update the root pointer, decreasing the level by 1 and then
3028 * free the old root.
3029 */
3030 cur->bc_ops->set_root(cur, newroot, -1);
3031
3032 error = cur->bc_ops->free_block(cur, bp);
3033 if (error) {
3034 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3035 return error;
3036 }
3037
3038 XFS_BTREE_STATS_INC(cur, free);
3039
3040 cur->bc_bufs[level] = NULL;
3041 cur->bc_ra[level] = 0;
3042 cur->bc_nlevels--;
3043
3044 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3045 return 0;
3046}
3047
3019STATIC int 3048STATIC int
3020xfs_btree_dec_cursor( 3049xfs_btree_dec_cursor(
3021 struct xfs_btree_cur *cur, 3050 struct xfs_btree_cur *cur,
@@ -3200,7 +3229,7 @@ xfs_btree_delrec(
3200 * Make it the new root of the btree. 3229 * Make it the new root of the btree.
3201 */ 3230 */
3202 pp = xfs_btree_ptr_addr(cur, 1, block); 3231 pp = xfs_btree_ptr_addr(cur, 1, block);
3203 error = cur->bc_ops->kill_root(cur, bp, level, pp); 3232 error = xfs_btree_kill_root(cur, bp, level, pp);
3204 if (error) 3233 if (error)
3205 goto error0; 3234 goto error0;
3206 } else if (level > 0) { 3235 } else if (level > 0) {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7fa07062bdd..82fafc66bd1 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -152,9 +152,7 @@ struct xfs_btree_ops {
152 152
153 /* update btree root pointer */ 153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur, 154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change); 155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158 156
159 /* block allocation / freeing */ 157 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur, 158 int (*alloc_block)(struct xfs_btree_cur *cur,
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs(
399 xfs_agblock_t agbno, /* allocation group block number */ 397 xfs_agblock_t agbno, /* allocation group block number */
400 xfs_extlen_t count); /* count of filesystem blocks */ 398 xfs_extlen_t count); /* count of filesystem blocks */
401 399
402/*
403 * Set the buffer for level "lev" in the cursor to bp, releasing
404 * any previous buffer.
405 */
406void
407xfs_btree_setbuf(
408 xfs_btree_cur_t *cur, /* btree cursor */
409 int lev, /* level in btree */
410 struct xfs_buf *bp); /* new buffer to set */
411
412 400
413/* 401/*
414 * Common btree core entry points. 402 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 02a80984aa0..2686d0d54c5 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -24,7 +24,6 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_buf_item.h" 28#include "xfs_buf_item.h"
30#include "xfs_trans_priv.h" 29#include "xfs_trans_priv.h"
@@ -34,6 +33,12 @@
34 33
35kmem_zone_t *xfs_buf_item_zone; 34kmem_zone_t *xfs_buf_item_zone;
36 35
36static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
37{
38 return container_of(lip, struct xfs_buf_log_item, bli_item);
39}
40
41
37#ifdef XFS_TRANS_DEBUG 42#ifdef XFS_TRANS_DEBUG
38/* 43/*
39 * This function uses an alternate strategy for tracking the bytes 44 * This function uses an alternate strategy for tracking the bytes
@@ -151,12 +156,13 @@ STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
151 */ 156 */
152STATIC uint 157STATIC uint
153xfs_buf_item_size( 158xfs_buf_item_size(
154 xfs_buf_log_item_t *bip) 159 struct xfs_log_item *lip)
155{ 160{
156 uint nvecs; 161 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
157 int next_bit; 162 struct xfs_buf *bp = bip->bli_buf;
158 int last_bit; 163 uint nvecs;
159 xfs_buf_t *bp; 164 int next_bit;
165 int last_bit;
160 166
161 ASSERT(atomic_read(&bip->bli_refcount) > 0); 167 ASSERT(atomic_read(&bip->bli_refcount) > 0);
162 if (bip->bli_flags & XFS_BLI_STALE) { 168 if (bip->bli_flags & XFS_BLI_STALE) {
@@ -170,7 +176,6 @@ xfs_buf_item_size(
170 return 1; 176 return 1;
171 } 177 }
172 178
173 bp = bip->bli_buf;
174 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 179 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
175 nvecs = 1; 180 nvecs = 1;
176 last_bit = xfs_next_bit(bip->bli_format.blf_data_map, 181 last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
@@ -219,13 +224,13 @@ xfs_buf_item_size(
219 */ 224 */
220STATIC void 225STATIC void
221xfs_buf_item_format( 226xfs_buf_item_format(
222 xfs_buf_log_item_t *bip, 227 struct xfs_log_item *lip,
223 xfs_log_iovec_t *log_vector) 228 struct xfs_log_iovec *vecp)
224{ 229{
230 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
231 struct xfs_buf *bp = bip->bli_buf;
225 uint base_size; 232 uint base_size;
226 uint nvecs; 233 uint nvecs;
227 xfs_log_iovec_t *vecp;
228 xfs_buf_t *bp;
229 int first_bit; 234 int first_bit;
230 int last_bit; 235 int last_bit;
231 int next_bit; 236 int next_bit;
@@ -235,8 +240,6 @@ xfs_buf_item_format(
235 ASSERT(atomic_read(&bip->bli_refcount) > 0); 240 ASSERT(atomic_read(&bip->bli_refcount) > 0);
236 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 241 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
237 (bip->bli_flags & XFS_BLI_STALE)); 242 (bip->bli_flags & XFS_BLI_STALE));
238 bp = bip->bli_buf;
239 vecp = log_vector;
240 243
241 /* 244 /*
242 * The size of the base structure is the size of the 245 * The size of the base structure is the size of the
@@ -248,7 +251,7 @@ xfs_buf_item_format(
248 base_size = 251 base_size =
249 (uint)(sizeof(xfs_buf_log_format_t) + 252 (uint)(sizeof(xfs_buf_log_format_t) +
250 ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); 253 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
251 vecp->i_addr = (xfs_caddr_t)&bip->bli_format; 254 vecp->i_addr = &bip->bli_format;
252 vecp->i_len = base_size; 255 vecp->i_len = base_size;
253 vecp->i_type = XLOG_REG_TYPE_BFORMAT; 256 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
254 vecp++; 257 vecp++;
@@ -263,7 +266,7 @@ xfs_buf_item_format(
263 */ 266 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) { 267 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 268 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item))) 269 xfs_log_item_in_current_chkpt(lip)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; 270 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF; 271 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 } 272 }
@@ -356,66 +359,90 @@ xfs_buf_item_format(
356 359
357/* 360/*
358 * This is called to pin the buffer associated with the buf log item in memory 361 * This is called to pin the buffer associated with the buf log item in memory
359 * so it cannot be written out. Simply call bpin() on the buffer to do this. 362 * so it cannot be written out.
360 * 363 *
361 * We also always take a reference to the buffer log item here so that the bli 364 * We also always take a reference to the buffer log item here so that the bli
362 * is held while the item is pinned in memory. This means that we can 365 * is held while the item is pinned in memory. This means that we can
363 * unconditionally drop the reference count a transaction holds when the 366 * unconditionally drop the reference count a transaction holds when the
364 * transaction is completed. 367 * transaction is completed.
365 */ 368 */
366
367STATIC void 369STATIC void
368xfs_buf_item_pin( 370xfs_buf_item_pin(
369 xfs_buf_log_item_t *bip) 371 struct xfs_log_item *lip)
370{ 372{
371 xfs_buf_t *bp; 373 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
372 374
373 bp = bip->bli_buf; 375 ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
374 ASSERT(XFS_BUF_ISBUSY(bp));
375 ASSERT(atomic_read(&bip->bli_refcount) > 0); 376 ASSERT(atomic_read(&bip->bli_refcount) > 0);
376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 377 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
377 (bip->bli_flags & XFS_BLI_STALE)); 378 (bip->bli_flags & XFS_BLI_STALE));
378 atomic_inc(&bip->bli_refcount); 379
379 trace_xfs_buf_item_pin(bip); 380 trace_xfs_buf_item_pin(bip);
380 xfs_bpin(bp);
381}
382 381
382 atomic_inc(&bip->bli_refcount);
383 atomic_inc(&bip->bli_buf->b_pin_count);
384}
383 385
384/* 386/*
385 * This is called to unpin the buffer associated with the buf log 387 * This is called to unpin the buffer associated with the buf log
386 * item which was previously pinned with a call to xfs_buf_item_pin(). 388 * item which was previously pinned with a call to xfs_buf_item_pin().
387 * Just call bunpin() on the buffer to do this.
388 * 389 *
389 * Also drop the reference to the buf item for the current transaction. 390 * Also drop the reference to the buf item for the current transaction.
390 * If the XFS_BLI_STALE flag is set and we are the last reference, 391 * If the XFS_BLI_STALE flag is set and we are the last reference,
391 * then free up the buf log item and unlock the buffer. 392 * then free up the buf log item and unlock the buffer.
393 *
394 * If the remove flag is set we are called from uncommit in the
395 * forced-shutdown path. If that is true and the reference count on
  396 * the log item is going to drop to zero, we need to free the item's
397 * descriptor in the transaction.
392 */ 398 */
393STATIC void 399STATIC void
394xfs_buf_item_unpin( 400xfs_buf_item_unpin(
395 xfs_buf_log_item_t *bip) 401 struct xfs_log_item *lip,
402 int remove)
396{ 403{
397 struct xfs_ail *ailp; 404 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
398 xfs_buf_t *bp; 405 xfs_buf_t *bp = bip->bli_buf;
399 int freed; 406 struct xfs_ail *ailp = lip->li_ailp;
400 int stale = bip->bli_flags & XFS_BLI_STALE; 407 int stale = bip->bli_flags & XFS_BLI_STALE;
408 int freed;
401 409
402 bp = bip->bli_buf;
403 ASSERT(bp != NULL);
404 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); 410 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
405 ASSERT(atomic_read(&bip->bli_refcount) > 0); 411 ASSERT(atomic_read(&bip->bli_refcount) > 0);
412
406 trace_xfs_buf_item_unpin(bip); 413 trace_xfs_buf_item_unpin(bip);
407 414
408 freed = atomic_dec_and_test(&bip->bli_refcount); 415 freed = atomic_dec_and_test(&bip->bli_refcount);
409 ailp = bip->bli_item.li_ailp; 416
410 xfs_bunpin(bp); 417 if (atomic_dec_and_test(&bp->b_pin_count))
418 wake_up_all(&bp->b_waiters);
419
411 if (freed && stale) { 420 if (freed && stale) {
412 ASSERT(bip->bli_flags & XFS_BLI_STALE); 421 ASSERT(bip->bli_flags & XFS_BLI_STALE);
413 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 422 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
414 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 423 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
415 ASSERT(XFS_BUF_ISSTALE(bp)); 424 ASSERT(XFS_BUF_ISSTALE(bp));
416 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); 425 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
426
417 trace_xfs_buf_item_unpin_stale(bip); 427 trace_xfs_buf_item_unpin_stale(bip);
418 428
429 if (remove) {
430 /*
431 * We have to remove the log item from the transaction
432 * as we are about to release our reference to the
433 * buffer. If we don't, the unlock that occurs later
  434 			 * in xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on.
436 */
437 xfs_trans_del_item(lip);
438
439 /*
440 * Since the transaction no longer refers to the buffer,
441 * the buffer should no longer refer to the transaction.
442 */
443 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
444 }
445
419 /* 446 /*
420 * If we get called here because of an IO error, we may 447 * If we get called here because of an IO error, we may
421 * or may not have the item on the AIL. xfs_trans_ail_delete() 448 * or may not have the item on the AIL. xfs_trans_ail_delete()
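
The pin/unpin pair now manipulates the buffer's pin count directly instead of going through xfs_bpin()/xfs_bunpin(); the essential pairing, condensed from the two hunks above:

	/* pin: hold the bli while the buffer is pinned in memory */
	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);

	/* unpin: drop both, waking anyone waiting on the buffer */
	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);
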
@@ -437,48 +464,6 @@ xfs_buf_item_unpin(
437} 464}
438 465
439/* 466/*
440 * this is called from uncommit in the forced-shutdown path.
441 * we need to check to see if the reference count on the log item
442 * is going to drop to zero. If so, unpin will free the log item
443 * so we need to free the item's descriptor (that points to the item)
444 * in the transaction.
445 */
446STATIC void
447xfs_buf_item_unpin_remove(
448 xfs_buf_log_item_t *bip,
449 xfs_trans_t *tp)
450{
451 /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
452 if ((atomic_read(&bip->bli_refcount) == 1) &&
453 (bip->bli_flags & XFS_BLI_STALE)) {
454 /*
455 * yes -- We can safely do some work here and then call
456 * buf_item_unpin to do the rest because we are
  457 		 * holding the buffer locked so no one else will be
458 * able to bump up the refcount. We have to remove the
459 * log item from the transaction as we are about to release
460 * our reference to the buffer. If we don't, the unlock that
461 * occurs later in the xfs_trans_uncommit() will try to
462 * reference the buffer which we no longer have a hold on.
463 */
464 struct xfs_log_item_desc *lidp;
465
466 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
467 trace_xfs_buf_item_unpin_stale(bip);
468
469 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
470 xfs_trans_free_item(tp, lidp);
471
472 /*
473 * Since the transaction no longer refers to the buffer, the
474 * buffer should no longer refer to the transaction.
475 */
476 XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
477 }
478 xfs_buf_item_unpin(bip);
479}
480
481/*
482 * This is called to attempt to lock the buffer associated with this 467 * This is called to attempt to lock the buffer associated with this
483 * buf log item. Don't sleep on the buffer lock. If we can't get 468 * buf log item. Don't sleep on the buffer lock. If we can't get
484 * the lock right away, return 0. If we can get the lock, take a 469 * the lock right away, return 0. If we can get the lock, take a
@@ -488,11 +473,11 @@ xfs_buf_item_unpin_remove(
488 */ 473 */
489STATIC uint 474STATIC uint
490xfs_buf_item_trylock( 475xfs_buf_item_trylock(
491 xfs_buf_log_item_t *bip) 476 struct xfs_log_item *lip)
492{ 477{
493 xfs_buf_t *bp; 478 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
479 struct xfs_buf *bp = bip->bli_buf;
494 480
495 bp = bip->bli_buf;
496 if (XFS_BUF_ISPINNED(bp)) 481 if (XFS_BUF_ISPINNED(bp))
497 return XFS_ITEM_PINNED; 482 return XFS_ITEM_PINNED;
498 if (!XFS_BUF_CPSEMA(bp)) 483 if (!XFS_BUF_CPSEMA(bp))
@@ -529,13 +514,12 @@ xfs_buf_item_trylock(
529 */ 514 */
530STATIC void 515STATIC void
531xfs_buf_item_unlock( 516xfs_buf_item_unlock(
532 xfs_buf_log_item_t *bip) 517 struct xfs_log_item *lip)
533{ 518{
534 int aborted; 519 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
535 xfs_buf_t *bp; 520 struct xfs_buf *bp = bip->bli_buf;
536 uint hold; 521 int aborted;
537 522 uint hold;
538 bp = bip->bli_buf;
539 523
540 /* Clear the buffer's association with this transaction. */ 524 /* Clear the buffer's association with this transaction. */
541 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 525 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
@@ -546,7 +530,7 @@ xfs_buf_item_unlock(
546 * (cancelled) buffers at unpin time, but we'll never go through the 530 * (cancelled) buffers at unpin time, but we'll never go through the
547 * pin/unpin cycle if we abort inside commit. 531 * pin/unpin cycle if we abort inside commit.
548 */ 532 */
549 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 533 aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
550 534
551 /* 535 /*
552 * Before possibly freeing the buf item, determine if we should 536 * Before possibly freeing the buf item, determine if we should
@@ -607,16 +591,16 @@ xfs_buf_item_unlock(
607 */ 591 */
608STATIC xfs_lsn_t 592STATIC xfs_lsn_t
609xfs_buf_item_committed( 593xfs_buf_item_committed(
610 xfs_buf_log_item_t *bip, 594 struct xfs_log_item *lip,
611 xfs_lsn_t lsn) 595 xfs_lsn_t lsn)
612{ 596{
597 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
598
613 trace_xfs_buf_item_committed(bip); 599 trace_xfs_buf_item_committed(bip);
614 600
615 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 601 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
616 (bip->bli_item.li_lsn != 0)) { 602 return lip->li_lsn;
617 return bip->bli_item.li_lsn; 603 return lsn;
618 }
619 return (lsn);
620} 604}
621 605
622/* 606/*
@@ -626,15 +610,16 @@ xfs_buf_item_committed(
626 */ 610 */
627STATIC void 611STATIC void
628xfs_buf_item_push( 612xfs_buf_item_push(
629 xfs_buf_log_item_t *bip) 613 struct xfs_log_item *lip)
630{ 614{
631 xfs_buf_t *bp; 615 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
616 struct xfs_buf *bp = bip->bli_buf;
632 617
633 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 618 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
619 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
620
634 trace_xfs_buf_item_push(bip); 621 trace_xfs_buf_item_push(bip);
635 622
636 bp = bip->bli_buf;
637 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
638 xfs_buf_relse(bp); 623 xfs_buf_relse(bp);
639} 624}
640 625
@@ -646,22 +631,24 @@ xfs_buf_item_push(
646 */ 631 */
647STATIC void 632STATIC void
648xfs_buf_item_pushbuf( 633xfs_buf_item_pushbuf(
649 xfs_buf_log_item_t *bip) 634 struct xfs_log_item *lip)
650{ 635{
651 xfs_buf_t *bp; 636 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
637 struct xfs_buf *bp = bip->bli_buf;
652 638
653 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 639 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
640 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
641
654 trace_xfs_buf_item_pushbuf(bip); 642 trace_xfs_buf_item_pushbuf(bip);
655 643
656 bp = bip->bli_buf;
657 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
658 xfs_buf_delwri_promote(bp); 644 xfs_buf_delwri_promote(bp);
659 xfs_buf_relse(bp); 645 xfs_buf_relse(bp);
660} 646}
661 647
662/* ARGSUSED */
663STATIC void 648STATIC void
664xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) 649xfs_buf_item_committing(
650 struct xfs_log_item *lip,
651 xfs_lsn_t commit_lsn)
665{ 652{
666} 653}
667 654
@@ -669,21 +656,16 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
669 * This is the ops vector shared by all buf log items. 656 * This is the ops vector shared by all buf log items.
670 */ 657 */
671static struct xfs_item_ops xfs_buf_item_ops = { 658static struct xfs_item_ops xfs_buf_item_ops = {
672 .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, 659 .iop_size = xfs_buf_item_size,
673 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 660 .iop_format = xfs_buf_item_format,
674 xfs_buf_item_format, 661 .iop_pin = xfs_buf_item_pin,
675 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 662 .iop_unpin = xfs_buf_item_unpin,
676 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin, 663 .iop_trylock = xfs_buf_item_trylock,
677 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 664 .iop_unlock = xfs_buf_item_unlock,
678 xfs_buf_item_unpin_remove, 665 .iop_committed = xfs_buf_item_committed,
679 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 666 .iop_push = xfs_buf_item_push,
680 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, 667 .iop_pushbuf = xfs_buf_item_pushbuf,
681 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 668 .iop_committing = xfs_buf_item_committing
682 xfs_buf_item_committed,
683 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
684 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
685 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
686 xfs_buf_item_committing
687}; 669};
688 670
689 671
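
The cast-free ops vector works because every handler recovers its containing item with container_of(); a standalone userspace illustration of the pattern (hypothetical types, not kernel code):

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct log_item { int li_type; };
	struct buf_log_item {
		int		bli_flags;
		struct log_item	bli_item;	/* embedded base item */
	};

	static struct buf_log_item *BUF_ITEM(struct log_item *lip)
	{
		return container_of(lip, struct buf_log_item, bli_item);
	}

	int main(void)
	{
		struct buf_log_item bli = { .bli_flags = 42 };
		struct log_item *lip = &bli.bli_item;	/* generic view */

		/* recover the container from the embedded member: prints 42 */
		printf("%d\n", BUF_ITEM(lip)->bli_flags);
		return 0;
	}
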
@@ -710,9 +692,7 @@ xfs_buf_item_init(
710 * the first. If we do already have one, there is 692 * the first. If we do already have one, there is
711 * nothing to do here so return. 693 * nothing to do here so return.
712 */ 694 */
713 if (bp->b_mount != mp) 695 ASSERT(bp->b_target->bt_mount == mp);
714 bp->b_mount = mp;
715 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
716 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 696 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
717 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 697 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
718 if (lip->li_type == XFS_LI_BUF) { 698 if (lip->li_type == XFS_LI_BUF) {
@@ -993,7 +973,7 @@ xfs_buf_iodone_callbacks(
993 xfs_buf_do_callbacks(bp, lip); 973 xfs_buf_do_callbacks(bp, lip);
994 XFS_BUF_SET_FSPRIVATE(bp, NULL); 974 XFS_BUF_SET_FSPRIVATE(bp, NULL);
995 XFS_BUF_CLR_IODONE_FUNC(bp); 975 XFS_BUF_CLR_IODONE_FUNC(bp);
996 xfs_biodone(bp); 976 xfs_buf_ioend(bp, 0);
997 return; 977 return;
998 } 978 }
999 979
@@ -1052,7 +1032,7 @@ xfs_buf_iodone_callbacks(
1052 xfs_buf_do_callbacks(bp, lip); 1032 xfs_buf_do_callbacks(bp, lip);
1053 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1033 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1054 XFS_BUF_CLR_IODONE_FUNC(bp); 1034 XFS_BUF_CLR_IODONE_FUNC(bp);
1055 xfs_biodone(bp); 1035 xfs_buf_ioend(bp, 0);
1056} 1036}
1057 1037
1058/* 1038/*
@@ -1098,15 +1078,14 @@ xfs_buf_error_relse(
1098 * It is called by xfs_buf_iodone_callbacks() above which will take 1078 * It is called by xfs_buf_iodone_callbacks() above which will take
1099 * care of cleaning up the buffer itself. 1079 * care of cleaning up the buffer itself.
1100 */ 1080 */
1101/* ARGSUSED */
1102void 1081void
1103xfs_buf_iodone( 1082xfs_buf_iodone(
1104 xfs_buf_t *bp, 1083 struct xfs_buf *bp,
1105 xfs_buf_log_item_t *bip) 1084 struct xfs_log_item *lip)
1106{ 1085{
1107 struct xfs_ail *ailp = bip->bli_item.li_ailp; 1086 struct xfs_ail *ailp = lip->li_ailp;
1108 1087
1109 ASSERT(bip->bli_buf == bp); 1088 ASSERT(BUF_ITEM(lip)->bli_buf == bp);
1110 1089
1111 xfs_buf_rele(bp); 1090 xfs_buf_rele(bp);
1112 1091
@@ -1120,6 +1099,6 @@ xfs_buf_iodone(
1120 * Either way, AIL is useless if we're forcing a shutdown. 1099 * Either way, AIL is useless if we're forcing a shutdown.
1121 */ 1100 */
1122 spin_lock(&ailp->xa_lock); 1101 spin_lock(&ailp->xa_lock);
1123 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 1102 xfs_trans_ail_delete(ailp, lip);
1124 xfs_buf_item_free(bip); 1103 xfs_buf_item_free(BUF_ITEM(lip));
1125} 1104}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index f20bb472d58..0e2ed43f16c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -124,7 +124,7 @@ void xfs_buf_attach_iodone(struct xfs_buf *,
124 void(*)(struct xfs_buf *, xfs_log_item_t *), 124 void(*)(struct xfs_buf *, xfs_log_item_t *),
125 xfs_log_item_t *); 125 xfs_log_item_t *);
126void xfs_buf_iodone_callbacks(struct xfs_buf *); 126void xfs_buf_iodone_callbacks(struct xfs_buf *);
127void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *); 127void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
128 128
129#ifdef XFS_TRANS_DEBUG 129#ifdef XFS_TRANS_DEBUG
130void 130void
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 0ca556b4bf3..1c00bedb317 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -25,19 +25,14 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h" 31#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
39#include "xfs_alloc.h" 35#include "xfs_alloc.h"
40#include "xfs_btree.h"
41#include "xfs_bmap.h" 36#include "xfs_bmap.h"
42#include "xfs_attr.h" 37#include "xfs_attr.h"
43#include "xfs_attr_leaf.h" 38#include "xfs_attr_leaf.h"
@@ -581,16 +576,14 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
581 xfs_da_intnode_t *node; 576 xfs_da_intnode_t *node;
582 xfs_da_node_entry_t *btree; 577 xfs_da_node_entry_t *btree;
583 int tmp; 578 int tmp;
584 xfs_mount_t *mp;
585 579
586 node = oldblk->bp->data; 580 node = oldblk->bp->data;
587 mp = state->mp;
588 ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); 581 ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
589 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); 582 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
590 ASSERT(newblk->blkno != 0); 583 ASSERT(newblk->blkno != 0);
591 if (state->args->whichfork == XFS_DATA_FORK) 584 if (state->args->whichfork == XFS_DATA_FORK)
592 ASSERT(newblk->blkno >= mp->m_dirleafblk && 585 ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
593 newblk->blkno < mp->m_dirfreeblk); 586 newblk->blkno < state->mp->m_dirfreeblk);
594 587
595 /* 588 /*
596 * We may need to make some room before we insert the new node. 589 * We may need to make some room before we insert the new node.
@@ -1601,7 +1594,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1601 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| 1594 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
1602 XFS_BMAPI_CONTIG, 1595 XFS_BMAPI_CONTIG,
1603 args->firstblock, args->total, &map, &nmap, 1596 args->firstblock, args->total, &map, &nmap,
1604 args->flist, NULL))) { 1597 args->flist))) {
1605 return error; 1598 return error;
1606 } 1599 }
1607 ASSERT(nmap <= 1); 1600 ASSERT(nmap <= 1);
@@ -1622,8 +1615,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1622 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| 1615 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
1623 XFS_BMAPI_METADATA, 1616 XFS_BMAPI_METADATA,
1624 args->firstblock, args->total, 1617 args->firstblock, args->total,
1625 &mapp[mapi], &nmap, args->flist, 1618 &mapp[mapi], &nmap, args->flist))) {
1626 NULL))) {
1627 kmem_free(mapp); 1619 kmem_free(mapp);
1628 return error; 1620 return error;
1629 } 1621 }
@@ -1884,7 +1876,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
1884 */ 1876 */
1885 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, 1877 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
1886 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, 1878 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
1887 0, args->firstblock, args->flist, NULL, 1879 0, args->firstblock, args->flist,
1888 &done)) == ENOSPC) { 1880 &done)) == ENOSPC) {
1889 if (w != XFS_DATA_FORK) 1881 if (w != XFS_DATA_FORK)
1890 break; 1882 break;
@@ -1989,7 +1981,7 @@ xfs_da_do_buf(
1989 nfsb, 1981 nfsb,
1990 XFS_BMAPI_METADATA | 1982 XFS_BMAPI_METADATA |
1991 xfs_bmapi_aflag(whichfork), 1983 xfs_bmapi_aflag(whichfork),
1992 NULL, 0, mapp, &nmap, NULL, NULL))) 1984 NULL, 0, mapp, &nmap, NULL)))
1993 goto exit0; 1985 goto exit0;
1994 } 1986 }
1995 } else { 1987 } else {
@@ -2050,7 +2042,7 @@ xfs_da_do_buf(
2050 mappedbno, nmapped, 0, &bp); 2042 mappedbno, nmapped, 0, &bp);
2051 break; 2043 break;
2052 case 3: 2044 case 3:
2053 xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); 2045 xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
2054 error = 0; 2046 error = 0;
2055 bp = NULL; 2047 bp = NULL;
2056 break; 2048 break;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 7f159d2a429..3b9582c60a2 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,24 +24,15 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 29#include "xfs_dinode.h"
36#include "xfs_inode.h" 30#include "xfs_inode.h"
37#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
38#include "xfs_bmap.h" 32#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_itable.h" 33#include "xfs_itable.h"
42#include "xfs_dfrag.h" 34#include "xfs_dfrag.h"
43#include "xfs_error.h" 35#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_vnodeops.h" 36#include "xfs_vnodeops.h"
46#include "xfs_trace.h" 37#include "xfs_trace.h"
47 38
@@ -425,11 +416,8 @@ xfs_swap_extents(
425 } 416 }
426 417
427 418
428 IHOLD(ip); 419 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
429 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 420 xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
430
431 IHOLD(tip);
432 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
433 421
434 xfs_trans_log_inode(tp, ip, ilf_fields); 422 xfs_trans_log_inode(tp, ip, ilf_fields);
435 xfs_trans_log_inode(tp, tip, tilf_fields); 423 xfs_trans_log_inode(tp, tip, tilf_fields);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5b153b2e6a..dffba9ba0db 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -49,8 +49,9 @@ typedef struct xfs_dinode {
49 __be32 di_uid; /* owner's user id */ 49 __be32 di_uid; /* owner's user id */
50 __be32 di_gid; /* owner's group id */ 50 __be32 di_gid; /* owner's group id */
51 __be32 di_nlink; /* number of links to file */ 51 __be32 di_nlink; /* number of links to file */
52 __be16 di_projid; /* owner's project id */ 52 __be16 di_projid_lo; /* lower part of owner's project id */
   53 	__u8		di_pad[8];	/* unused, zeroed space */	   53 	__be16		di_projid_hi;	/* higher part of owner's project id */
54 __u8 di_pad[6]; /* unused, zeroed space */
54 __be16 di_flushiter; /* incremented on flush */ 55 __be16 di_flushiter; /* incremented on flush */
55 xfs_timestamp_t di_atime; /* time last accessed */ 56 xfs_timestamp_t di_atime; /* time last accessed */
56 xfs_timestamp_t di_mtime; /* time last modified */ 57 xfs_timestamp_t di_mtime; /* time last modified */
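
With the project id now split across two on-disk fields, a consumer would reassemble the 32-bit value roughly like this (hypothetical helper, not part of this patch; be16_to_cpu converts the big-endian on-disk halves):

	static inline __uint32_t
	xfs_dinode_projid(struct xfs_dinode *dip)
	{
		return (__uint32_t)be16_to_cpu(dip->di_projid_hi) << 16 |
			be16_to_cpu(dip->di_projid_lo);
	}
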
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 42520f04126..a1321bc7f19 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -25,13 +25,11 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -382,7 +380,7 @@ xfs_readdir(
 	int		rval;		/* return value */
 	int		v;		/* type-checking value */
 
-	xfs_itrace_entry(dp);
+	trace_xfs_readdir(dp);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return XFS_ERROR(EIO);
@@ -549,7 +547,7 @@ xfs_dir2_grow_inode(
 	if ((error = xfs_bmapi(tp, dp, bno, count,
 			XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
-			args->flist, NULL)))
+			args->flist)))
 		return error;
 	ASSERT(nmap <= 1);
 	if (nmap == 1) {
@@ -581,8 +579,7 @@ xfs_dir2_grow_inode(
 		if ((error = xfs_bmapi(tp, dp, b, c,
 				XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
 				args->firstblock, args->total,
-				&mapp[mapi], &nmap, args->flist,
-				NULL))) {
+				&mapp[mapi], &nmap, args->flist))) {
 			kmem_free(mapp);
 			return error;
 		}
@@ -715,7 +712,7 @@ xfs_dir2_shrink_inode(
 	 */
 	if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
 			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
-			NULL, &done))) {
+			&done))) {
 		/*
 		 * ENOSPC actually can happen if we're in a removename with
 		 * no space reservation, and the resulting block removal
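Each xfs_bmapi()/xfs_bunmapi() call above loses its trailing NULL argument; that slot appears to be the unused extent-delta tracking pointer this series removes from the prototypes (always NULL in these callers). A before/after sketch with the argument list abbreviated:

    /* before: an unused tracking parameter had to be passed explicitly */
    error = xfs_bmapi(tp, dp, bno, count, flags,
    		args->firstblock, args->total, &map, &nmap,
    		args->flist, NULL);

    /* after: the parameter is gone from the prototype entirely */
    error = xfs_bmapi(tp, dp, bno, count, flags,
    		args->firstblock, args->total, &map, &nmap,
    		args->flist);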
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 779a267b0a8..580d99cef9e 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -1073,10 +1071,10 @@ xfs_dir2_sf_to_block(
 	 */
 
 	buf_len = dp->i_df.if_bytes;
-	buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
+	buf = kmem_alloc(buf_len, KM_SLEEP);
 
-	memcpy(buf, sfp, dp->i_df.if_bytes);
-	xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+	memcpy(buf, sfp, buf_len);
+	xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK);
 	dp->i_d.di_size = 0;
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 	/*
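The hunk above snapshots if_bytes into buf_len before the inline fork is released; once xfs_idata_realloc() shrinks the fork, dp->i_df.if_bytes no longer holds the original size. A reduced sketch of the ordering that matters (names from the hunk above):

    buf_len = dp->i_df.if_bytes;	/* snapshot the shortform size */
    buf = kmem_alloc(buf_len, KM_SLEEP);
    memcpy(buf, sfp, buf_len);		/* copy the shortform directory out */
    xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK);
    /* dp->i_df.if_bytes is no longer the old size; use buf_len from here on */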
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 498f8d69433..921595b84f5 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_dir2_data.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e2d89854ec9..ae891223be9 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -25,11 +25,9 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -875,7 +873,7 @@ xfs_dir2_leaf_getdents(
 					xfs_dir2_byte_to_da(mp,
 						XFS_DIR2_LEAF_OFFSET) - map_off,
 					XFS_BMAPI_METADATA, NULL, 0,
-					&map[map_valid], &nmap, NULL, NULL);
+					&map[map_valid], &nmap, NULL);
 				/*
 				 * Don't know if we should ignore this or
 				 * try to return an error.
@@ -963,7 +961,7 @@ xfs_dir2_leaf_getdents(
 			if (i > ra_current &&
 			    map[ra_index].br_blockcount >=
 			    mp->m_dirblkfsbs) {
-				xfs_baread(mp->m_ddev_targp,
+				xfs_buf_readahead(mp->m_ddev_targp,
 					XFS_FSB_TO_DADDR(mp,
 					   map[ra_index].br_startblock +
 					   ra_offset),
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 78fc4d9ae75..f9a0864b696 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index c1a5945d463..b1bae6b1eed 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
deleted file mode 100644
index 2813cdd7237..00000000000
--- a/fs/xfs/xfs_dmapi.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DMAPI_H__
-#define __XFS_DMAPI_H__
-
-/* Values used to define the on-disk version of dm_attrname_t. All
- * on-disk attribute names start with the 8-byte string "SGI_DMI_".
- *
- * In the on-disk inode, DMAPI attribute names consist of the user-provided
- * name with the DMATTR_PREFIXSTRING pre-pended.  This string must NEVER be
- * changed.
- */
-
-#define DMATTR_PREFIXLEN	8
-#define DMATTR_PREFIXSTRING	"SGI_DMI_"
-
-typedef enum {
-	DM_EVENT_INVALID	= -1,
-	DM_EVENT_CANCEL		= 0,		/* not supported */
-	DM_EVENT_MOUNT		= 1,
-	DM_EVENT_PREUNMOUNT	= 2,
-	DM_EVENT_UNMOUNT	= 3,
-	DM_EVENT_DEBUT		= 4,		/* not supported */
-	DM_EVENT_CREATE		= 5,
-	DM_EVENT_CLOSE		= 6,		/* not supported */
-	DM_EVENT_POSTCREATE	= 7,
-	DM_EVENT_REMOVE		= 8,
-	DM_EVENT_POSTREMOVE	= 9,
-	DM_EVENT_RENAME		= 10,
-	DM_EVENT_POSTRENAME	= 11,
-	DM_EVENT_LINK		= 12,
-	DM_EVENT_POSTLINK	= 13,
-	DM_EVENT_SYMLINK	= 14,
-	DM_EVENT_POSTSYMLINK	= 15,
-	DM_EVENT_READ		= 16,
-	DM_EVENT_WRITE		= 17,
-	DM_EVENT_TRUNCATE	= 18,
-	DM_EVENT_ATTRIBUTE	= 19,
-	DM_EVENT_DESTROY	= 20,
-	DM_EVENT_NOSPACE	= 21,
-	DM_EVENT_USER		= 22,
-	DM_EVENT_MAX		= 23
-} dm_eventtype_t;
-#define HAVE_DM_EVENTTYPE_T
-
-typedef enum {
-	DM_RIGHT_NULL,
-	DM_RIGHT_SHARED,
-	DM_RIGHT_EXCL
-} dm_right_t;
-#define HAVE_DM_RIGHT_T
-
-/* Defines for determining if an event message should be sent. */
-#ifdef HAVE_DMAPI
-#define	DM_EVENT_ENABLED(ip, event) ( \
-	unlikely ((ip)->i_mount->m_flags & XFS_MOUNT_DMAPI) && \
-		( ((ip)->i_d.di_dmevmask & (1 << event)) || \
-		  ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
-	)
-#else
-#define DM_EVENT_ENABLED(ip, event)	(0)
-#endif
-
-#define DM_XFS_VALID_FS_EVENTS		( \
-	(1 << DM_EVENT_PREUNMOUNT) |	\
-	(1 << DM_EVENT_UNMOUNT) |	\
-	(1 << DM_EVENT_NOSPACE) |	\
-	(1 << DM_EVENT_DEBUT) |		\
-	(1 << DM_EVENT_CREATE) |	\
-	(1 << DM_EVENT_POSTCREATE) |	\
-	(1 << DM_EVENT_REMOVE) |	\
-	(1 << DM_EVENT_POSTREMOVE) |	\
-	(1 << DM_EVENT_RENAME) |	\
-	(1 << DM_EVENT_POSTRENAME) |	\
-	(1 << DM_EVENT_LINK) |		\
-	(1 << DM_EVENT_POSTLINK) |	\
-	(1 << DM_EVENT_SYMLINK) |	\
-	(1 << DM_EVENT_POSTSYMLINK) |	\
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a regular file or a symlink.  These events are persistent.
-*/
-
-#define	DM_XFS_VALID_FILE_EVENTS	( \
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a directory.  These events are persistent.
-*/
-
-#define	DM_XFS_VALID_DIRECTORY_EVENTS	( \
-	(1 << DM_EVENT_CREATE) |	\
-	(1 << DM_EVENT_POSTCREATE) |	\
-	(1 << DM_EVENT_REMOVE) |	\
-	(1 << DM_EVENT_POSTREMOVE) |	\
-	(1 << DM_EVENT_RENAME) |	\
-	(1 << DM_EVENT_POSTRENAME) |	\
-	(1 << DM_EVENT_LINK) |		\
-	(1 << DM_EVENT_POSTLINK) |	\
-	(1 << DM_EVENT_SYMLINK) |	\
-	(1 << DM_EVENT_POSTSYMLINK) |	\
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-/* Events supported by the XFS filesystem. */
-#define	DM_XFS_SUPPORTED_EVENTS		( \
-	(1 << DM_EVENT_MOUNT) |		\
-	(1 << DM_EVENT_PREUNMOUNT) |	\
-	(1 << DM_EVENT_UNMOUNT) |	\
-	(1 << DM_EVENT_NOSPACE) |	\
-	(1 << DM_EVENT_CREATE) |	\
-	(1 << DM_EVENT_POSTCREATE) |	\
-	(1 << DM_EVENT_REMOVE) |	\
-	(1 << DM_EVENT_POSTREMOVE) |	\
-	(1 << DM_EVENT_RENAME) |	\
-	(1 << DM_EVENT_POSTRENAME) |	\
-	(1 << DM_EVENT_LINK) |		\
-	(1 << DM_EVENT_POSTLINK) |	\
-	(1 << DM_EVENT_SYMLINK) |	\
-	(1 << DM_EVENT_POSTSYMLINK) |	\
-	(1 << DM_EVENT_READ) |		\
-	(1 << DM_EVENT_WRITE) |		\
-	(1 << DM_EVENT_TRUNCATE) |	\
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-
-/*
- *	Definitions used for the flags field on dm_send_*_event().
- */
-
-#define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
-#define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
-#define DM_FLAGS_IMUX		0x004	/* thread holds i_mutex */
-#define DM_FLAGS_IALLOCSEM_RD	0x010	/* thread holds i_alloc_sem rd */
-#define DM_FLAGS_IALLOCSEM_WR	0x020	/* thread holds i_alloc_sem wr */
-
-/*
- *	Pull in platform specific event flags defines
- */
-#include "xfs_dmapi_priv.h"
-
-/*
- *	Macros to turn caller specified delay/block flags into
- *	dm_send_xxxx_event flag DM_FLAGS_NDELAY.
- */
-
-#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
-			DM_FLAGS_NDELAY : 0)
-#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
-
-#endif  /* __XFS_DMAPI_H__ */
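The deleted header encoded each DMAPI event set as a bitmask over dm_eventtype_t, tested one bit at a time by DM_EVENT_ENABLED(). For reference, a minimal sketch of that bit test in isolation (mirroring the macro above, not kernel API):

    /* Sketch: is one DMAPI event enabled in a persistent event mask? */
    static int
    dm_event_wanted(unsigned int evmask, dm_eventtype_t event)
    {
    	return (evmask & (1 << event)) != 0;
    }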
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
deleted file mode 100644
index e71e2581c0c..00000000000
--- a/fs/xfs/xfs_dmops.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dmapi.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-
-
-static struct xfs_dmops xfs_dmcore_stub = {
-	.xfs_send_data		= (xfs_send_data_t)fs_nosys,
-	.xfs_send_mmap		= (xfs_send_mmap_t)fs_noerr,
-	.xfs_send_destroy	= (xfs_send_destroy_t)fs_nosys,
-	.xfs_send_namesp	= (xfs_send_namesp_t)fs_nosys,
-	.xfs_send_mount		= (xfs_send_mount_t)fs_nosys,
-	.xfs_send_unmount	= (xfs_send_unmount_t)fs_noerr,
-};
-
-int
-xfs_dmops_get(struct xfs_mount *mp)
-{
-	if (mp->m_flags & XFS_MOUNT_DMAPI) {
-		cmn_err(CE_WARN,
-			"XFS: dmapi support not available in this kernel.");
-		return EINVAL;
-	}
-
-	mp->m_dm_ops = &xfs_dmcore_stub;
-	return 0;
-}
-
-void
-xfs_dmops_put(struct xfs_mount *mp)
-{
-}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 047b8a8e5c2..ed999026766 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -23,12 +23,8 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_utils.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 409fe81585f..a55e687bf56 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -24,7 +24,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
@@ -33,18 +32,19 @@
 kmem_zone_t	*xfs_efi_zone;
 kmem_zone_t	*xfs_efd_zone;
 
-STATIC void	xfs_efi_item_unlock(xfs_efi_log_item_t *);
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
 
 void
-xfs_efi_item_free(xfs_efi_log_item_t *efip)
+xfs_efi_item_free(
+	struct xfs_efi_log_item	*efip)
 {
-	int nexts = efip->efi_format.efi_nextents;
-
-	if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+	if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
 		kmem_free(efip);
-	} else {
+	else
 		kmem_zone_free(xfs_efi_zone, efip);
-	}
 }
 
 /*
@@ -52,9 +52,9 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip)
  * We only need 1 iovec for an efi item. It just logs the efi_log_format
  * structure.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_size(xfs_efi_log_item_t *efip)
+xfs_efi_item_size(
	struct xfs_log_item	*lip)
 {
 	return 1;
 }
@@ -67,10 +67,12 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip)
  * slots in the efi item have been filled.
  */
 STATIC void
-xfs_efi_item_format(xfs_efi_log_item_t *efip,
-		    xfs_log_iovec_t	*log_vector)
+xfs_efi_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
 {
-	uint	size;
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	uint	size;
 
 	ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
 
@@ -80,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
 	size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
 	efip->efi_format.efi_size = 1;
 
-	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
+	log_vector->i_addr = &efip->efi_format;
 	log_vector->i_len = size;
 	log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
 	ASSERT(size >= sizeof(xfs_efi_log_format_t));
@@ -90,60 +92,33 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
 /*
  * Pinning has no meaning for an efi item, so just return.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_pin(xfs_efi_log_item_t *efip)
+xfs_efi_item_pin(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
-
 /*
  * While EFIs cannot really be pinned, the unpin operation is the
  * last place at which the EFI is manipulated during a transaction.
  * Here we coordinate with xfs_efi_cancel() to determine who gets to
  * free the EFI.
  */
-/*ARGSUSED*/
-STATIC void
-xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
-{
-	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
-
-	spin_lock(&ailp->xa_lock);
-	if (efip->efi_flags & XFS_EFI_CANCELED) {
-		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
-		xfs_efi_item_free(efip);
-	} else {
-		efip->efi_flags |= XFS_EFI_COMMITTED;
-		spin_unlock(&ailp->xa_lock);
-	}
-}
-
-/*
- * like unpin only we have to also clear the xaction descriptor
- * pointing the log item if we free the item. This routine duplicates
- * unpin because efi_flags is protected by the AIL lock. Freeing
- * the descriptor and then calling unpin would force us to drop the AIL
- * lock which would open up a race condition.
- */
 STATIC void
-xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
+xfs_efi_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	struct xfs_ail		*ailp = lip->li_ailp;
 
 	spin_lock(&ailp->xa_lock);
 	if (efip->efi_flags & XFS_EFI_CANCELED) {
-		/*
-		 * free the xaction descriptor pointing to this item
-		 */
-		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
-		xfs_trans_free_item(tp, lidp);
+		if (remove)
+			xfs_trans_del_item(lip);
 
 		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
+		xfs_trans_ail_delete(ailp, lip);
 		xfs_efi_item_free(efip);
 	} else {
 		efip->efi_flags |= XFS_EFI_COMMITTED;
@@ -158,9 +133,9 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
  * XFS_ITEM_PINNED so that the caller will eventually flush the log.
  * This should help in getting the EFI out of the AIL.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
+xfs_efi_item_trylock(
+	struct xfs_log_item	*lip)
 {
 	return XFS_ITEM_PINNED;
 }
@@ -168,13 +143,12 @@ xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
 /*
  * Efi items have no locking, so just return.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
+xfs_efi_item_unlock(
+	struct xfs_log_item	*lip)
 {
-	if (efip->efi_item.li_flags & XFS_LI_ABORTED)
-		xfs_efi_item_free(efip);
-	return;
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efi_item_free(EFI_ITEM(lip));
 }
 
 /*
@@ -183,9 +157,10 @@ xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
  * flag is not paid any attention here. Checking for that is delayed
 * until the EFI is unpinned.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
 	return lsn;
 }
@@ -195,11 +170,10 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
  * stuck waiting for all of its corresponding efd items to be
 * committed to disk.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_push(xfs_efi_log_item_t *efip)
+xfs_efi_item_push(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 /*
@@ -209,61 +183,55 @@ xfs_efi_item_push(xfs_efi_log_item_t *efip)
  * example, for inodes, the inode is locked throughout the extent freeing
  * so the dependency should be recorded there.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	return;
 }
 
 /*
  * This is the ops vector shared by all efi log items.
  */
 static struct xfs_item_ops xfs_efi_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_efi_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
-					xfs_efi_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efi_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efi_item_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efi_item_committing
+	.iop_size	= xfs_efi_item_size,
+	.iop_format	= xfs_efi_item_format,
+	.iop_pin	= xfs_efi_item_pin,
+	.iop_unpin	= xfs_efi_item_unpin,
+	.iop_trylock	= xfs_efi_item_trylock,
+	.iop_unlock	= xfs_efi_item_unlock,
+	.iop_committed	= xfs_efi_item_committed,
+	.iop_push	= xfs_efi_item_push,
+	.iop_committing = xfs_efi_item_committing
 };
 
 
 /*
  * Allocate and initialize an efi item with the given number of extents.
  */
-xfs_efi_log_item_t *
-xfs_efi_init(xfs_mount_t	*mp,
-	     uint		nextents)
+struct xfs_efi_log_item *
+xfs_efi_init(
+	struct xfs_mount	*mp,
+	uint			nextents)
 
 {
-	xfs_efi_log_item_t	*efip;
+	struct xfs_efi_log_item	*efip;
 	uint			size;
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efi_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+		efip = kmem_zalloc(size, KM_SLEEP);
 	} else {
-		efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
-				KM_SLEEP);
+		efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
 	}
 
 	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
 	efip->efi_format.efi_nextents = nextents;
 	efip->efi_format.efi_id = (__psint_t)(void*)efip;
 
-	return (efip);
+	return efip;
 }
 
 /*
@@ -276,7 +244,7 @@ xfs_efi_init(xfs_mount_t *mp,
 int
 xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 {
-	xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr;
+	xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
 	uint i;
 	uint len = sizeof(xfs_efi_log_format_t) +
 		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
@@ -289,8 +257,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
 		return 0;
 	} else if (buf->i_len == len32) {
-		xfs_efi_log_format_32_t *src_efi_fmt_32 =
-			(xfs_efi_log_format_32_t *)buf->i_addr;
+		xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
 
 		dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
 		dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -304,8 +271,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		}
 		return 0;
 	} else if (buf->i_len == len64) {
-		xfs_efi_log_format_64_t *src_efi_fmt_64 =
-			(xfs_efi_log_format_64_t *)buf->i_addr;
+		xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
 
 		dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
 		dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
@@ -356,16 +322,18 @@ xfs_efi_release(xfs_efi_log_item_t *efip,
 	}
 }
 
-STATIC void
-xfs_efd_item_free(xfs_efd_log_item_t *efdp)
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 {
-	int nexts = efdp->efd_format.efd_nextents;
+	return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
 
-	if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+	if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
 		kmem_free(efdp);
-	} else {
+	else
 		kmem_zone_free(xfs_efd_zone, efdp);
-	}
 }
 
 /*
@@ -373,9 +341,9 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp)
  * We only need 1 iovec for an efd item. It just logs the efd_log_format
  * structure.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_size(xfs_efd_log_item_t *efdp)
+xfs_efd_item_size(
+	struct xfs_log_item	*lip)
 {
 	return 1;
 }
@@ -388,10 +356,12 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp)
  * slots in the efd item have been filled.
  */
 STATIC void
-xfs_efd_item_format(xfs_efd_log_item_t *efdp,
-		    xfs_log_iovec_t	*log_vector)
+xfs_efd_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
 {
-	uint	size;
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+	uint	size;
 
 	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
 
@@ -401,48 +371,38 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp,
 	size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
 	efdp->efd_format.efd_size = 1;
 
-	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
+	log_vector->i_addr = &efdp->efd_format;
 	log_vector->i_len = size;
 	log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
 	ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
 
-
 /*
  * Pinning has no meaning for an efd item, so just return.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
+xfs_efd_item_pin(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
-
 /*
  * Since pinning has no meaning for an efd item, unpinning does
  * not either.
  */
-/*ARGSUSED*/
-STATIC void
-xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
-{
-	return;
-}
-
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
+xfs_efd_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	return;
 }
 
 /*
  * Efd items have no locking, so just return success.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_trylock(
+	struct xfs_log_item	*lip)
 {
 	return XFS_ITEM_LOCKED;
 }
@@ -451,13 +411,12 @@ xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
 * Efd items have no locking or pushing, so return failure
 * so that the caller doesn't bother with us.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_unlock(
+	struct xfs_log_item	*lip)
 {
-	if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
-		xfs_efd_item_free(efdp);
-	return;
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efd_item_free(EFD_ITEM(lip));
 }
 
 /*
@@ -467,15 +426,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
 * return -1 to keep the transaction code from further referencing
 * this item.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+xfs_efd_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+
 	/*
 	 * If we got a log I/O error, it's always the case that the LR with the
 	 * EFI got unpinned and freed before the EFD got aborted.
 	 */
-	if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+	if (!(lip->li_flags & XFS_LI_ABORTED))
 		xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
 
 	xfs_efd_item_free(efdp);
@@ -486,11 +448,10 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
 * There isn't much you can do to push on an efd item.  It is simply
 * stuck waiting for the log to be flushed to disk.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_push(xfs_efd_log_item_t *efdp)
+xfs_efd_item_push(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 /*
@@ -500,55 +461,48 @@ xfs_efd_item_push(xfs_efd_log_item_t *efdp)
 * example, for inodes, the inode is locked throughout the extent freeing
 * so the dependency should be recorded there.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efd_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	return;
 }
 
 /*
 * This is the ops vector shared by all efd log items.
 */
 static struct xfs_item_ops xfs_efd_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_efd_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
-					xfs_efd_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efd_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efd_item_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efd_item_committing
+	.iop_size	= xfs_efd_item_size,
+	.iop_format	= xfs_efd_item_format,
+	.iop_pin	= xfs_efd_item_pin,
+	.iop_unpin	= xfs_efd_item_unpin,
+	.iop_trylock	= xfs_efd_item_trylock,
+	.iop_unlock	= xfs_efd_item_unlock,
+	.iop_committed	= xfs_efd_item_committed,
+	.iop_push	= xfs_efd_item_push,
+	.iop_committing = xfs_efd_item_committing
 };
 
-
 /*
 * Allocate and initialize an efd item with the given number of extents.
 */
-xfs_efd_log_item_t *
-xfs_efd_init(xfs_mount_t	*mp,
-	     xfs_efi_log_item_t	*efip,
-	     uint		nextents)
+struct xfs_efd_log_item *
+xfs_efd_init(
+	struct xfs_mount	*mp,
+	struct xfs_efi_log_item	*efip,
+	uint			nextents)
 
 {
-	xfs_efd_log_item_t	*efdp;
+	struct xfs_efd_log_item	*efdp;
 	uint			size;
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efd_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+		efdp = kmem_zalloc(size, KM_SLEEP);
 	} else {
-		efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
-				KM_SLEEP);
+		efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
 	}
 
 	xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
@@ -556,5 +510,5 @@ xfs_efd_init(xfs_mount_t *mp,
 	efdp->efd_format.efd_nextents = nextents;
 	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
 
-	return (efdp);
+	return efdp;
 }
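The conversion above threads every EFI/EFD handler through the generic struct xfs_log_item and recovers the containing item with container_of(), replacing the old vector of casted function pointers. A self-contained sketch of the pattern with simplified types (not the kernel's own):

    #include <stddef.h>

    struct log_item { int li_flags; };		/* generic part */
    struct efi_item {
    	unsigned int	nextents;
    	struct log_item	item;			/* embedded generic item */
    };

    #define container_of(ptr, type, member) \
    	((type *)((char *)(ptr) - offsetof(type, member)))

    /* Upcast from the embedded generic item back to its container. */
    static struct efi_item *EFI_ITEM(struct log_item *lip)
    {
    	return container_of(lip, struct efi_item, item);
    }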
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 390850ee660..9b715dce569 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -18,13 +18,9 @@
 #include "xfs.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
@@ -127,6 +123,82 @@ typedef struct fstrm_item
 	xfs_inode_t	*pip;	/* Parent directory inode pointer. */
 } fstrm_item_t;
 
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it. The mount
+ * point's m_peraglock is used to protect these counters from per-ag array
+ * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
+ * about to reallocate the array, it calls xfs_filestream_flush() with the
+ * m_peraglock held in write mode.
+ *
+ * Since xfs_mru_cache_flush() guarantees that all the free functions for all
+ * the cache elements have finished executing before it returns, it's safe for
+ * the free functions to use the atomic counters without m_peraglock protection.
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
+ * whether it was called with the m_peraglock held in read mode, write mode or
+ * not held at all.  The race condition this addresses is the following:
+ *
+ *  - The work queue scheduler fires and pulls a filestream directory cache
+ *    element off the LRU end of the cache for deletion, then gets pre-empted.
+ *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
+ *    remaining items from the cache and reallocates the mount point's per-ag
+ *    array, resetting all the counters to zero.
+ *  - The work queue thread resumes and calls the free function for the element
+ *    it started cleaning up earlier.  In the process it decrements the
+ *    filestreams counter for an AG that now has no references.
+ *
+ * With a shrinkfs feature, the above scenario could panic the system.
+ *
+ * All other uses of the following macros should be protected by either the
+ * m_peraglock held in read mode, or the cache's internal locking exposed by the
+ * interval between a call to xfs_mru_cache_lookup() and a call to
+ * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
+ * when new elements are added to the cache.
+ *
+ * Combined, these locking rules ensure that no associations will ever exist in
+ * the cache that reference per-ag array elements that have since been
+ * reallocated.
+ */
+static int
+xfs_filestream_peek_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_read(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static int
+xfs_filestream_get_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_inc_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static void
+xfs_filestream_put_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, agno);
+	atomic_dec(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+}
 
 /*
  * Scan the AGs starting at startag looking for an AG that isn't in use and has
@@ -355,16 +427,14 @@ xfs_fstrm_free_func(
 {
 	fstrm_item_t	*item = (fstrm_item_t *)data;
 	xfs_inode_t	*ip = item->ip;
-	int ref;
 
 	ASSERT(ip->i_ino == ino);
 
 	xfs_iflags_clear(ip, XFS_IFILESTREAM);
 
 	/* Drop the reference taken on the AG when the item was added. */
-	ref = xfs_filestream_put_ag(ip->i_mount, item->ag);
+	xfs_filestream_put_ag(ip->i_mount, item->ag);
 
-	ASSERT(ref >= 0);
 	TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
 		xfs_filestream_peek_ag(ip->i_mount, item->ag));
 
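The helpers moved into this file wrap each counter access in xfs_perag_get()/xfs_perag_put() so the per-ag structure stays pinned for the duration. A hedged usage sketch; the AG selector here is hypothetical:

    xfs_agnumber_t	agno;

    agno = pick_candidate_ag(mp);		/* hypothetical selector */
    xfs_filestream_get_ag(mp, agno);	/* count the new association */
    /* ... store the association in the MRU cache ... */
    xfs_filestream_put_ag(mp, agno);	/* drop it when the item is freed */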
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 260f757bbc5..09dd9af4543 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -42,88 +42,6 @@ extern ktrace_t *xfs_filestreams_trace_buf;
 
 #endif
 
-/*
- * Allocation group filestream associations are tracked with per-ag atomic
- * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all.  The race condition this addresses is the following:
- *
- *  - The work queue scheduler fires and pulls a filestream directory cache
- *    element off the LRU end of the cache for deletion, then gets pre-empted.
- *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
- *    remaining items from the cache and reallocates the mount point's per-ag
- *    array, resetting all the counters to zero.
- *  - The work queue thread resumes and calls the free function for the element
- *    it started cleaning up earlier.  In the process it decrements the
- *    filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
- */
-/*
- * xfs_filestream_peek_ag is only used in tracing code
- */
-static inline int
-xfs_filestream_peek_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_read(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
-static inline int
-xfs_filestream_get_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_inc_return(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
-static inline int
-xfs_filestream_put_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_dec_return(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
 /* allocation selection flags */
 typedef enum xfs_fstrm_alloc {
 	XFS_PICK_USERDATA = 1,
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5..8f6fc1a9638 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
 #define BMV_IF_NO_DMAPI_READ	0x2	/* Do not generate DMAPI read event */
 #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES		0x10	/* Do not return holes */
 #define BMV_IF_VALID	\
-	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
+	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\
+	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
 
 /* bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
@@ -291,9 +293,11 @@ typedef struct xfs_bstat {
 	__s32		bs_extsize;	/* extent size */
 	__s32		bs_extents;	/* number of extents */
 	__u32		bs_gen;		/* generation count */
-	__u16		bs_projid;	/* project id */
+	__u16		bs_projid_lo;	/* lower part of project id */
+#define	bs_projid	bs_projid_lo	/* (previously just bs_projid) */
 	__u16		bs_forkoff;	/* inode fork offset in bytes */
-	unsigned char	bs_pad[12];	/* pad space, unused */
+	__u16		bs_projid_hi;	/* higher part of project id */
+	unsigned char	bs_pad[10];	/* pad space, unused */
 	__u32		bs_dmevmask;	/* DMIG event mask */
 	__u16		bs_dmstate;	/* DMIG state info */
 	__u16		bs_aextents;	/* attribute number of extents */
@@ -446,6 +450,7 @@ typedef struct xfs_handle {
 /*	XFS_IOC_SETBIOSIZE ---- deprecated 46	   */
 /*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
+#define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
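BMV_IF_NO_HOLES lets a GETBMAPX caller skip hole segments entirely. A hedged userspace sketch of such a call (zeroed struct, one extent slot, minimal error handling):

    struct getbmapx	bmx[2] = { 0 };

    bmx[0].bmv_count = 2;			/* header plus one extent slot */
    bmx[0].bmv_length = -1;			/* map from offset 0 to EOF */
    bmx[0].bmv_iflags = BMV_IF_NO_HOLES;	/* do not report holes */

    if (ioctl(fd, XFS_IOC_GETBMAPX, bmx) < 0)
    	perror("XFS_IOC_GETBMAPX");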
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 37a6f62c57b..a7c116e814a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -148,12 +144,11 @@ xfs_growfs_data_private(
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
 	dpct = pct - mp->m_sb.sb_imax_pct;
-	error = xfs_read_buf(mp, mp->m_ddev_targp,
-			XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp);
-	if (error)
-		return error;
-	ASSERT(bp);
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+				BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+	if (!bp)
+		return EIO;
 	xfs_buf_relse(bp);
 
 	new = nb;	/* use new as a temporary here */
@@ -601,39 +596,44 @@ out:
 	 * the extra reserve blocks from the reserve.....
 	 */
 	int error;
-	error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0);
+	error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+					 fdblks_delta, 0);
 	if (error == ENOSPC)
 		goto retry;
 	}
 	return 0;
 }
 
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead.
+ */
 int
 xfs_fs_log_dummy(
-	xfs_mount_t	*mp)
+	xfs_mount_t	*mp,
+	int		flags)
 {
 	xfs_trans_t	*tp;
-	xfs_inode_t	*ip;
 	int		error;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+					XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return error;
 	}
 
-	ip = mp->m_rootip;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
+	/* log the UUID because it is an unchanging field */
+	xfs_mod_sb(tp, XFS_SB_UUID);
+	if (flags & SYNC_WAIT)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
 }
 
 int
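With the new flags argument, the caller decides whether the dummy transaction is forced out synchronously. A hedged sketch of what a covering-time call site might look like; the surrounding condition is illustrative only:

    /* Sketch: cover an idle log with a dummy superblock transaction. */
    if (xfs_log_need_covered(mp))
    	error = xfs_fs_log_dummy(mp, SYNC_WAIT);	/* wait for the commit */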
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c..a786c5212c1 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
 
 #endif	/* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c7142a064c4..0626a32c344 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -216,7 +212,7 @@ xfs_ialloc_inode_init(
 	 * to log a whole cluster of inodes instead of all the
 	 * individual transactions causing a lot of log traffic.
 	 */
-	xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+	xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
 	for (i = 0; i < ninodes; i++) {
 		int	ioffset = i << mp->m_sb.sb_inodelog;
 		uint	isize = sizeof(struct xfs_dinode);
@@ -1217,7 +1213,6 @@ xfs_imap_lookup(
 	struct xfs_inobt_rec_incore rec;
 	struct xfs_btree_cur	*cur;
 	struct xfs_buf		*agbp;
-	xfs_agino_t		startino;
 	int			error;
 	int			i;
 
@@ -1231,13 +1226,13 @@ xfs_imap_lookup(
 	}
 
 	/*
-	 * derive and lookup the exact inode record for the given agino. If the
-	 * record cannot be found, then it's an invalid inode number and we
-	 * should abort.
+	 * Lookup the inode record for the given agino. If the record cannot be
+	 * found, then it's an invalid inode number and we should abort. Once
+	 * we have a record, we need to ensure it contains the inode number
+	 * we are looking up.
	 */
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-	startino = agino & ~(XFS_IALLOC_INODES(mp) - 1);
-	error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
 	if (!error) {
 		if (i)
 			error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1250,6 +1245,11 @@ xfs_imap_lookup(
 	if (error)
 		return error;
 
+	/* check that the returned record contains the required inode */
+	if (rec.ir_startino > agino ||
+	    rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+		return EINVAL;
+
 	/* for untrusted inodes check it is allocated first */
 	if ((flags & XFS_IGET_UNTRUSTED) &&
 	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
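Because the lookup is now XFS_LOOKUP_LE, it can land on an earlier record, so the containment check added above is what rejects invalid inode numbers. The predicate, isolated as a sketch:

    /* Sketch: does the inode chunk starting at ir_startino cover agino? */
    static int
    chunk_contains(xfs_agino_t ir_startino, unsigned int inodes_per_chunk,
    	       xfs_agino_t agino)
    {
    	return agino >= ir_startino &&
    	       agino < ir_startino + inodes_per_chunk;
    }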
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c282a9af539..16921f55c54 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -187,38 +183,6 @@ xfs_inobt_key_diff(
 			  cur->bc_rec.i.ir_startino;
 }
 
-STATIC int
-xfs_inobt_kill_root(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	int			level,
-	union xfs_btree_ptr	*newroot)
-{
-	int			error;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_STATS_INC(cur, killroot);
-
-	/*
-	 * Update the root pointer, decreasing the level by 1 and then
-	 * free the old root.
-	 */
-	xfs_inobt_set_root(cur, newroot, -1);
-	error = xfs_inobt_free_block(cur, bp);
-	if (error) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-		return error;
-	}
-
-	XFS_BTREE_STATS_INC(cur, free);
-
-	cur->bc_bufs[level] = NULL;
-	cur->bc_nlevels--;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-}
-
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -313,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 
 	.dup_cursor		= xfs_inobt_dup_cursor,
 	.set_root		= xfs_inobt_set_root,
-	.kill_root		= xfs_inobt_kill_root,
 	.alloc_block		= xfs_inobt_alloc_block,
 	.free_block		= xfs_inobt_free_block,
 	.get_minrecs		= xfs_inobt_get_minrecs,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 8f8b91be2c9..0cdd26932d8 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -25,14 +25,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -95,7 +91,7 @@ xfs_inode_alloc(
 	return ip;
 }
 
-STATIC void
+void
 xfs_inode_free(
 	struct xfs_inode	*ip)
 {
@@ -212,7 +208,7 @@ xfs_iget_cache_hit(
 			ip->i_flags &= ~XFS_INEW;
 			ip->i_flags |= XFS_IRECLAIMABLE;
 			__xfs_inode_set_reclaim_tag(pag, ip);
-			trace_xfs_iget_reclaim(ip);
+			trace_xfs_iget_reclaim_fail(ip);
 			goto out_error;
 		}
 
@@ -227,6 +223,7 @@ xfs_iget_cache_hit(
 	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
 		if (!igrab(inode)) {
+			trace_xfs_iget_skip(ip);
 			error = EAGAIN;
 			goto out_error;
 		}
@@ -234,6 +231,7 @@ xfs_iget_cache_hit(
 		/* We've got a live one. */
 		spin_unlock(&ip->i_flags_lock);
 		read_unlock(&pag->pag_ici_lock);
+		trace_xfs_iget_hit(ip);
 	}
 
 	if (lock_flags != 0)
@@ -242,7 +240,6 @@ xfs_iget_cache_hit(
 	xfs_iflags_clear(ip, XFS_ISTALE);
 	XFS_STATS_INC(xs_ig_found);
 
-	trace_xfs_iget_found(ip);
 	return 0;
 
 out_error:
@@ -264,7 +261,6 @@ xfs_iget_cache_miss(
 {
 	struct xfs_inode	*ip;
 	int			error;
-	unsigned long		first_index, mask;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
 
 	ip = xfs_inode_alloc(mp, ino);
@@ -275,7 +271,7 @@ xfs_iget_cache_miss(
 	if (error)
 		goto out_destroy;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_iget_miss(ip);
 
 	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
 		error = ENOENT;
@@ -301,8 +297,6 @@ xfs_iget_cache_miss(
 		BUG();
 	}
 
-	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
-	first_index = agino & mask;
 	write_lock(&pag->pag_ici_lock);
 
 	/* insert the new inode */
@@ -321,7 +315,6 @@ xfs_iget_cache_miss(
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
 
-	trace_xfs_iget_alloc(ip);
 	*ipp = ip;
 	return 0;
 
@@ -372,8 +365,8 @@ xfs_iget(
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 
-	/* the radix tree exists only in inode capable AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
+	/* reject inode numbers outside existing AGs */
+	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
@@ -422,97 +415,6 @@ out_error_or_again:
 }
 
 /*
- * Decrement reference count of an inode structure and unlock it.
- *
- * ip -- the inode being released
- * lock_flags -- this parameter indicates the inode's locks to be
- *	 to be released.  See the comment on xfs_iunlock() for a list
- *	 of valid values.
- */
-void
-xfs_iput(xfs_inode_t	*ip,
-	 uint		lock_flags)
-{
-	xfs_itrace_entry(ip);
-	xfs_iunlock(ip, lock_flags);
-	IRELE(ip);
-}
-
-/*
- * Special iput for brand-new inodes that are still locked
- */
-void
-xfs_iput_new(
-	xfs_inode_t	*ip,
-	uint		lock_flags)
-{
-	struct inode	*inode = VFS_I(ip);
-
-	xfs_itrace_entry(ip);
-
-	if ((ip->i_d.di_mode == 0)) {
-		ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-		make_bad_inode(inode);
-	}
-	if (inode->i_state & I_NEW)
-		unlock_new_inode(inode);
-	if (lock_flags)
-		xfs_iunlock(ip, lock_flags);
-	IRELE(ip);
-}
-
-/*
- * This is called free all the memory associated with an inode.
- * It must free the inode itself and any buffers allocated for
- * if_extents/if_data and if_broot.  It must also free the lock
- * associated with the inode.
- *
- * Note: because we don't initialise everything on reallocation out
- * of the zone, we must ensure we nullify everything correctly before
- * freeing the structure.
- */
-void
-xfs_ireclaim(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_perag	*pag;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-
-	XFS_STATS_INC(xs_ig_reclaims);
-
-	/*
-	 * Remove the inode from the per-AG radix tree.
-	 *
-	 * Because radix_tree_delete won't complain even if the item was never
-	 * added to the tree assert that it's been there before to catch
-	 * problems with the inode life time early on.
-	 */
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	write_lock(&pag->pag_ici_lock);
-	if (!radix_tree_delete(&pag->pag_ici_root, agino))
-		ASSERT(0);
-	write_unlock(&pag->pag_ici_lock);
-	xfs_perag_put(pag);
-
-	/*
-	 * Here we do an (almost) spurious inode lock in order to coordinate
-	 * with inode cache radix tree lookups.  This is because the lookup
-	 * can reference the inodes in the cache without taking references.
-	 *
-	 * We make that OK here by ensuring that we wait until the inode is
-	 * unlocked after the lookup before we go ahead and free it.  We get
-	 * both the ilock and the iolock because the code may need to drop the
-	 * ilock one but will still hold the iolock.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_qm_dqdetach(ip);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-	xfs_inode_free(ip);
-}
-
-/*
  * This is a wrapper routine around the xfs_ilock() routine
  * used to centralize some grungy code. It is used in places
  * that wish to lock the inode solely for reading the extents.
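
Aside: the xfs_iget() hunk above swaps m_maxagi for sb_agcount when bounds-checking the AG derived from an externally supplied inode number (e.g. from bulkstat or an NFS file handle). A hedged standalone sketch of that decompose-and-range-check step follows; the field width used here is invented for illustration, whereas the kernel derives the split from the superblock geometry.

#include <stdint.h>
#include <stdio.h>

#define AGINO_BITS	24	/* illustrative: bits for the in-AG part */

/* an XFS inode number encodes (AG number, AG-relative inode number) */
static inline uint32_t ino_to_agno(uint64_t ino)
{
	return (uint32_t)(ino >> AGINO_BITS);
}

/* reject inode numbers pointing past the last allocation group */
static int validate_ino(uint64_t ino, uint32_t agcount)
{
	return ino_to_agno(ino) < agcount ? 0 : -1 /* EINVAL */;
}

int main(void)
{
	uint32_t agcount = 4;
	uint64_t good = ((uint64_t)2 << AGINO_BITS) | 42;	/* AG 2: ok */
	uint64_t bad  = ((uint64_t)9 << AGINO_BITS) | 42;	/* AG 9: out of range */

	printf("%d %d\n", validate_ino(good, agcount),
			  validate_ino(bad, agcount));
	return 0;
}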
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b76a829d7e2..108c7a085f9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -27,13 +27,10 @@
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -44,7 +41,6 @@
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
-#include "xfs_rw.h"
 #include "xfs_error.h"
 #include "xfs_utils.h"
 #include "xfs_quota.h"
@@ -426,7 +422,7 @@ xfs_iformat(
 	if (!XFS_DFORK_Q(dip))
 		return 0;
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
 	ip->i_afp->if_ext_max =
 		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 	switch (dip->di_aformat) {
@@ -509,7 +505,7 @@ xfs_iformat_local(
 		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
 	else {
 		real_size = roundup(size, 4);
-		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
 	}
 	ifp->if_bytes = size;
 	ifp->if_real_bytes = real_size;
@@ -636,7 +632,7 @@ xfs_iformat_btree(
 	}
 
 	ifp->if_broot_bytes = size;
-	ifp->if_broot = kmem_alloc(size, KM_SLEEP);
+	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
 	ASSERT(ifp->if_broot != NULL);
 	/*
 	 * Copy and convert from the on-disk structure
@@ -664,7 +660,8 @@ xfs_dinode_from_disk(
 	to->di_uid = be32_to_cpu(from->di_uid);
 	to->di_gid = be32_to_cpu(from->di_gid);
 	to->di_nlink = be32_to_cpu(from->di_nlink);
-	to->di_projid = be16_to_cpu(from->di_projid);
+	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 	to->di_flushiter = be16_to_cpu(from->di_flushiter);
 	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
@@ -699,7 +696,8 @@ xfs_dinode_to_disk(
 	to->di_uid = cpu_to_be32(from->di_uid);
 	to->di_gid = cpu_to_be32(from->di_gid);
 	to->di_nlink = cpu_to_be32(from->di_nlink);
-	to->di_projid = cpu_to_be16(from->di_projid);
+	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 	to->di_flushiter = cpu_to_be16(from->di_flushiter);
 	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
@@ -878,7 +876,7 @@ xfs_iread(
 	if (ip->i_d.di_version == 1) {
 		ip->i_d.di_nlink = ip->i_d.di_onlink;
 		ip->i_d.di_onlink = 0;
-		ip->i_d.di_projid = 0;
+		xfs_set_projid(ip, 0);
 	}
 
 	ip->i_delayed_blks = 0;
@@ -922,7 +920,6 @@ xfs_iread_extents(
 	int		error;
 	xfs_ifork_t	*ifp;
 	xfs_extnum_t	nextents;
-	size_t		size;
 
 	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
 		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
@@ -930,7 +927,6 @@ xfs_iread_extents(
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
-	size = nextents * sizeof(xfs_bmbt_rec_t);
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 
 	/*
@@ -988,8 +984,7 @@ xfs_ialloc(
 	mode_t		mode,
 	xfs_nlink_t	nlink,
 	xfs_dev_t	rdev,
-	cred_t		*cr,
-	xfs_prid_t	prid,
+	prid_t		prid,
 	int		okalloc,
 	xfs_buf_t	**ialloc_context,
 	boolean_t	*call_again,
@@ -1033,7 +1028,7 @@ xfs_ialloc(
 	ASSERT(ip->i_d.di_nlink == nlink);
 	ip->i_d.di_uid = current_fsuid();
 	ip->i_d.di_gid = current_fsgid();
-	ip->i_d.di_projid = prid;
+	xfs_set_projid(ip, prid);
 	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 
 	/*
@@ -1226,7 +1221,7 @@ xfs_isize_check(
 			  (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
 			  map_first),
 			 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
-			 NULL, NULL))
+			 NULL))
 		return;
 	ASSERT(nimaps == 1);
 	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
@@ -1460,7 +1455,7 @@ xfs_itruncate_finish(
 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
 	ASSERT(ip->i_transp == *tp);
 	ASSERT(ip->i_itemp != NULL);
-	ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
+	ASSERT(ip->i_itemp->ili_lock_flags == 0);
 
 
 	ntp = *tp;
@@ -1589,11 +1584,10 @@ xfs_itruncate_finish(
 		xfs_bmap_init(&free_list, &first_block);
 		error = xfs_bunmapi(ntp, ip,
 				    first_unmap_block, unmap_len,
-				    xfs_bmapi_aflag(fork) |
-				      (sync ? 0 : XFS_BMAPI_ASYNC),
+				    xfs_bmapi_aflag(fork),
 				    XFS_ITRUNC_MAX_EXTENTS,
 				    &first_block, &free_list,
-				    NULL, &done);
+				    &done);
 		if (error) {
 			/*
			 * If the bunmapi call encounters an error,
@@ -1612,12 +1606,8 @@ xfs_itruncate_finish(
 		 */
 		error = xfs_bmap_finish(tp, &free_list, &committed);
 		ntp = *tp;
-		if (committed) {
-			/* link the inode into the next xact in the chain */
-			xfs_trans_ijoin(ntp, ip,
-					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-			xfs_trans_ihold(ntp, ip);
-		}
+		if (committed)
+			xfs_trans_ijoin(ntp, ip);
 
 		if (error) {
 			/*
@@ -1646,9 +1636,7 @@ xfs_itruncate_finish(
 		error = xfs_trans_commit(*tp, 0);
 		*tp = ntp;
 
-		/* link the inode into the next transaction in the chain */
-		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ihold(ntp, ip);
+		xfs_trans_ijoin(ntp, ip);
 
 		if (error)
 			return error;
@@ -1927,6 +1915,11 @@ xfs_iunlink_remove(
 	return 0;
 }
 
+/*
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
 STATIC void
 xfs_ifree_cluster(
 	xfs_inode_t	*free_ip,
@@ -1958,8 +1951,6 @@ xfs_ifree_cluster(
 	}
 
 	for (j = 0; j < nbufs; j++, inum += ninodes) {
-		int	found = 0;
-
 		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
 					 XFS_INO_TO_AGBNO(mp, inum));
 
@@ -1978,23 +1969,25 @@ xfs_ifree_cluster(
 		/*
 		 * Walk the inodes already attached to the buffer and mark them
 		 * stale. These will all have the flush locks held, so an
-		 * in-memory inode walk can't lock them.
+		 * in-memory inode walk can't lock them. By marking them all
+		 * stale first, we will not attempt to lock them in the loop
+		 * below as the XFS_ISTALE flag will be set.
 		 */
 		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
 		while (lip) {
 			if (lip->li_type == XFS_LI_INODE) {
 				iip = (xfs_inode_log_item_t *)lip;
 				ASSERT(iip->ili_logged == 1);
-				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
+				lip->li_cb = xfs_istale_done;
 				xfs_trans_ail_copy_lsn(mp->m_ail,
 							&iip->ili_flush_lsn,
 							&iip->ili_item.li_lsn);
 				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-				found++;
 			}
 			lip = lip->li_bio_list;
 		}
 
+
 		/*
 		 * For each inode in memory attempt to add it to the inode
 		 * buffer and set it up for being staled on buffer IO
@@ -2006,6 +1999,7 @@ xfs_ifree_cluster(
 		 * even trying to lock them.
 		 */
 		for (i = 0; i < ninodes; i++) {
+retry:
 			read_lock(&pag->pag_ici_lock);
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2016,38 +2010,36 @@ xfs_ifree_cluster(
 				continue;
 			}
 
-			/* don't try to lock/unlock the current inode */
+			/*
+			 * Don't try to lock/unlock the current inode, but we
+			 * _cannot_ skip the other inodes that we did not find
+			 * in the list attached to the buffer and are not
+			 * already marked stale. If we can't lock it, back off
+			 * and retry.
+			 */
 			if (ip != free_ip &&
 			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
 				read_unlock(&pag->pag_ici_lock);
-				continue;
+				delay(1);
+				goto retry;
 			}
 			read_unlock(&pag->pag_ici_lock);
 
-			if (!xfs_iflock_nowait(ip)) {
-				if (ip != free_ip)
-					xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				continue;
-			}
-
+			xfs_iflock(ip);
 			xfs_iflags_set(ip, XFS_ISTALE);
-			if (xfs_inode_clean(ip)) {
-				ASSERT(ip != free_ip);
-				xfs_ifunlock(ip);
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				continue;
-			}
 
+			/*
+			 * we don't need to attach clean inodes or those only
+			 * with unlogged changes (which we throw away, anyway).
+			 */
 			iip = ip->i_itemp;
-			if (!iip) {
-				/* inode with unlogged changes only */
+			if (!iip || xfs_inode_clean(ip)) {
 				ASSERT(ip != free_ip);
 				ip->i_update_core = 0;
 				xfs_ifunlock(ip);
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 				continue;
 			}
-			found++;
 
 			iip->ili_last_fields = iip->ili_format.ilf_fields;
 			iip->ili_format.ilf_fields = 0;
@@ -2055,16 +2047,14 @@ xfs_ifree_cluster(
 			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
 						&iip->ili_item.li_lsn);
 
-			xfs_buf_attach_iodone(bp,
-				(void(*)(xfs_buf_t*,xfs_log_item_t*))
-				xfs_istale_done, (xfs_log_item_t *)iip);
+			xfs_buf_attach_iodone(bp, xfs_istale_done,
+						  &iip->ili_item);
 
 			if (ip != free_ip)
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		}
 
-		if (found)
-			xfs_trans_stale_inode_buf(tp, bp);
+		xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
 
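Aside: the xfs_ifree_cluster() hunks above replace "trylock or skip" with "trylock or back off and retry", because every in-memory inode in the freed cluster must end up marked XFS_ISTALE. A toy pthread illustration of the same shape follows; it is only an analogy for the kernel's xfs_ilock_nowait()/delay() pair, not kernel code.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;

static void mark_stale_with_retry(void)
{
retry:
	/* trylock stands in for xfs_ilock_nowait() */
	if (pthread_mutex_trylock(&obj_lock) != 0) {
		/* we cannot skip this object: back off briefly and try
		 * again, like the patch's delay(1); goto retry; */
		usleep(1000);
		goto retry;
	}

	puts("object locked; mark it stale");	/* xfs_iflags_set(ISTALE) */
	pthread_mutex_unlock(&obj_lock);
}

int main(void)
{
	mark_stale_with_retry();
	return 0;
}
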
@@ -2203,7 +2193,7 @@ xfs_iroot_realloc(
 	 */
 	if (ifp->if_broot_bytes == 0) {
 		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
-		ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
+		ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
 		ifp->if_broot_bytes = (int)new_size;
 		return;
 	}
@@ -2219,7 +2209,7 @@ xfs_iroot_realloc(
 		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
 		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
 				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
-				KM_SLEEP);
+				KM_SLEEP | KM_NOFS);
 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -2245,7 +2235,7 @@ xfs_iroot_realloc(
 	else
 		new_size = 0;
 	if (new_size > 0) {
-		new_broot = kmem_alloc(new_size, KM_SLEEP);
+		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
 		/*
 		 * First copy over the btree block header.
 		 */
@@ -2349,7 +2339,8 @@ xfs_idata_realloc(
 		real_size = roundup(new_size, 4);
 		if (ifp->if_u1.if_data == NULL) {
 			ASSERT(ifp->if_real_bytes == 0);
-			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+							KM_SLEEP | KM_NOFS);
 		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
 			/*
 			 * Only do the realloc if the underlying size
@@ -2360,11 +2351,12 @@ xfs_idata_realloc(
 					kmem_realloc(ifp->if_u1.if_data,
 							real_size,
 							ifp->if_real_bytes,
-							KM_SLEEP);
+							KM_SLEEP | KM_NOFS);
 			}
 		} else {
 			ASSERT(ifp->if_real_bytes == 0);
-			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+							KM_SLEEP | KM_NOFS);
 			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
 			       ifp->if_bytes);
 		}
@@ -2731,11 +2723,10 @@ cluster_corrupt_out:
 	 * mark it as stale and brelse.
 	 */
 	if (XFS_BUF_IODONE_FUNC(bp)) {
-		XFS_BUF_CLR_BDSTRAT_FUNC(bp);
 		XFS_BUF_UNDONE(bp);
 		XFS_BUF_STALE(bp);
 		XFS_BUF_ERROR(bp,EIO);
-		xfs_biodone(bp);
+		xfs_buf_ioend(bp, 0);
 	} else {
 		XFS_BUF_STALE(bp);
 		xfs_buf_relse(bp);
@@ -3018,7 +3009,7 @@ xfs_iflush_int(
 			memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 			memset(&(dip->di_pad[0]), 0,
 			       sizeof(dip->di_pad));
-			ASSERT(ip->i_d.di_projid == 0);
+			ASSERT(xfs_get_projid(ip) == 0);
 		}
 	}
 
@@ -3069,8 +3060,7 @@ xfs_iflush_int(
 	 * and unlock the inode's flush lock when the inode is
 	 * completely written to disk.
 	 */
-	xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
-			      xfs_iflush_done, (xfs_log_item_t *)iip);
+	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
 
 	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
 	ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
@@ -3514,13 +3504,11 @@ xfs_iext_remove_indirect(
 	xfs_extnum_t	ext_diff;	/* extents to remove in current list */
 	xfs_extnum_t	nex1;		/* number of extents before idx */
 	xfs_extnum_t	nex2;		/* extents after idx + count */
-	int		nlists;		/* entries in indirection array */
 	int		page_idx = idx;	/* index in target extent list */
 
 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
 	erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
 	ASSERT(erp != NULL);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
 	nex1 = page_idx;
 	ext_cnt = count;
 	while (ext_cnt) {
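
Aside: the recurring KM_SLEEP -> KM_SLEEP | KM_NOFS changes in this file keep blocking allocations from recursing into filesystem reclaim while inode locks are held, which could otherwise deadlock (KM_NOFS maps to GFP_NOFS underneath). The toy allocator below models that flag purely as "do not re-enter the filesystem on memory pressure"; it is an analogy, not the kmem/GFP implementation.

#include <stdlib.h>
#include <stdio.h>

#define TOY_SLEEP	0x1	/* may block and run reclaim */
#define TOY_NOFS	0x2	/* must not re-enter the filesystem */

static void fs_reclaim(void)
{
	puts("reclaim: writing back fs data (would need fs locks!)");
}

static void *toy_alloc(size_t size, int flags)
{
	void *p = malloc(size);

	/* under pressure, a TOY_SLEEP allocation may trigger reclaim;
	 * TOY_NOFS suppresses the filesystem part of that reclaim */
	if (p == NULL && (flags & TOY_SLEEP) && !(flags & TOY_NOFS))
		fs_reclaim();
	return p;
}

int main(void)
{
	void *buf = toy_alloc(64, TOY_SLEEP | TOY_NOFS);

	printf("allocated: %p\n", buf);
	free(buf);
	return 0;
}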
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 78550df13cd..fb2ca2e4cdc 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -134,8 +134,9 @@ typedef struct xfs_icdinode {
 	__uint32_t	di_uid;		/* owner's user id */
 	__uint32_t	di_gid;		/* owner's group id */
 	__uint32_t	di_nlink;	/* number of links to file */
-	__uint16_t	di_projid;	/* owner's project id */
-	__uint8_t	di_pad[8];	/* unused, zeroed space */
+	__uint16_t	di_projid_lo;	/* lower part of owner's project id */
+	__uint16_t	di_projid_hi;	/* higher part of owner's project id */
+	__uint8_t	di_pad[6];	/* unused, zeroed space */
 	__uint16_t	di_flushiter;	/* incremented on flush */
 	xfs_ictimestamp_t di_atime;	/* time last accessed */
 	xfs_ictimestamp_t di_mtime;	/* time last modified */
@@ -212,7 +213,6 @@ typedef struct xfs_icdinode {
 #ifdef __KERNEL__
 
 struct bhv_desc;
-struct cred;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -335,6 +335,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 }
 
 /*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was chosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline prid_t
+xfs_get_projid(struct xfs_inode *ip)
+{
+	return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
+}
+
+static inline void
+xfs_set_projid(struct xfs_inode *ip,
+		prid_t projid)
+{
+	ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
+	ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
+}
+
+/*
  * Manage the i_flush queue embedded in the inode.  This completion
  * queue synchronizes processes attempting to flush the in-core
  * inode back to disk.
@@ -443,8 +462,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 */
 int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
			 uint, uint, xfs_inode_t **);
-void		xfs_iput(xfs_inode_t *, uint);
-void		xfs_iput_new(xfs_inode_t *, uint);
 void		xfs_ilock(xfs_inode_t *, uint);
 int		xfs_ilock_nowait(xfs_inode_t *, uint);
 void		xfs_iunlock(xfs_inode_t *, uint);
@@ -452,14 +469,14 @@ void		xfs_ilock_demote(xfs_inode_t *, uint);
 int		xfs_isilocked(xfs_inode_t *, uint);
 uint		xfs_ilock_map_shared(xfs_inode_t *);
 void		xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void		xfs_ireclaim(xfs_inode_t *);
+void		xfs_inode_free(struct xfs_inode *ip);
 
 /*
 * xfs_inode.c prototypes.
 */
 int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
-			   xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
-			   int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
+			   xfs_nlink_t, xfs_dev_t, prid_t, int,
+			   struct xfs_buf **, boolean_t *, xfs_inode_t **);
 
 uint		xfs_ip2xflags(struct xfs_inode *);
 uint		xfs_dic2xflags(struct xfs_dinode *);
@@ -473,7 +490,6 @@ int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
 void		xfs_iunpin_wait(xfs_inode_t *);
 int		xfs_iflush(xfs_inode_t *, uint);
-void		xfs_ichgtime(xfs_inode_t *, int);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
@@ -484,7 +500,7 @@ void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 #define IHOLD(ip) \
 do { \
	ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
-	atomic_inc(&(VFS_I(ip)->i_count)); \
+	ihold(VFS_I(ip)); \
	trace_xfs_ihold(ip, _THIS_IP_); \
 } while (0)
 
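Aside: the xfs_get_projid()/xfs_set_projid() helpers added above split a 32-bit project id across two 16-bit on-disk fields so that old filesystems, which only had the low field, remain compatible. A standalone round-trip test of the same arithmetic (local struct, not the kernel header):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct icdinode {
	uint16_t di_projid_lo;	/* lower part of owner's project id */
	uint16_t di_projid_hi;	/* higher part of owner's project id */
};

static void set_projid(struct icdinode *d, uint32_t projid)
{
	d->di_projid_hi = (uint16_t)(projid >> 16);
	d->di_projid_lo = (uint16_t)(projid & 0xffff);
}

static uint32_t get_projid(const struct icdinode *d)
{
	return (uint32_t)d->di_projid_hi << 16 | d->di_projid_lo;
}

int main(void)
{
	struct icdinode d;
	uint32_t prid = 0x00123456;	/* needs more than 16 bits */

	set_projid(&d, prid);
	assert(get_projid(&d) == prid);	/* lossless round-trip */
	printf("hi=0x%04x lo=0x%04x -> 0x%08x\n",
	       d.di_projid_hi, d.di_projid_lo, get_projid(&d));
	return 0;
}
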
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cf8249a6000..c7ac020705d 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -22,30 +22,26 @@
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rw.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
 
 
 kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
 
+static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_inode_log_item, ili_item);
+}
+
+
 /*
  * This returns the number of iovecs needed to log the given inode item.
 *
@@ -55,13 +51,11 @@ kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
 */
 STATIC uint
 xfs_inode_item_size(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	uint		nvecs;
-	xfs_inode_t	*ip;
-
-	ip = iip->ili_inode;
-	nvecs = 2;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	uint			nvecs = 2;
 
 	/*
	 * Only log the data/extents/b-tree root if there is something
@@ -212,36 +206,23 @@ xfs_inode_item_size(
 */
 STATIC void
 xfs_inode_item_format(
-	xfs_inode_log_item_t	*iip,
-	xfs_log_iovec_t		*log_vector)
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*vecp)
 {
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
 	uint			nvecs;
-	xfs_log_iovec_t		*vecp;
-	xfs_inode_t		*ip;
 	size_t			data_bytes;
 	xfs_bmbt_rec_t		*ext_buffer;
-	int			nrecs;
 	xfs_mount_t		*mp;
 
-	ip = iip->ili_inode;
-	vecp = log_vector;
-
-	vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
+	vecp->i_addr = &iip->ili_format;
 	vecp->i_len  = sizeof(xfs_inode_log_format_t);
 	vecp->i_type = XLOG_REG_TYPE_IFORMAT;
 	vecp++;
 	nvecs	     = 1;
 
 	/*
-	 * Make sure the linux inode is dirty. We do this before
-	 * clearing i_update_core as the VFS will call back into
-	 * XFS here and set i_update_core, so we need to dirty the
-	 * inode first so that the ordering of i_update_core and
-	 * unlogged modifications still works as described below.
-	 */
-	xfs_mark_inode_dirty_sync(ip);
-
-	/*
 	 * Clear i_update_core if the timestamps (or any other
 	 * non-transactional modification) need flushing/logging
 	 * and we're about to log them with the rest of the core.
@@ -277,7 +258,7 @@ xfs_inode_item_format(
 	 */
 	xfs_synchronize_times(ip);
 
-	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
+	vecp->i_addr = &ip->i_d;
 	vecp->i_len  = sizeof(struct xfs_icdinode);
 	vecp->i_type = XLOG_REG_TYPE_ICORE;
 	vecp++;
@@ -323,18 +304,17 @@ xfs_inode_item_format(
 			ASSERT(ip->i_df.if_u1.if_extents != NULL);
 			ASSERT(ip->i_d.di_nextents > 0);
 			ASSERT(iip->ili_extents_buf == NULL);
-			nrecs = ip->i_df.if_bytes /
-				(uint)sizeof(xfs_bmbt_rec_t);
-			ASSERT(nrecs > 0);
+			ASSERT((ip->i_df.if_bytes /
+				(uint)sizeof(xfs_bmbt_rec_t)) > 0);
 #ifdef XFS_NATIVE_HOST
-			if (nrecs == ip->i_d.di_nextents) {
+			if (ip->i_d.di_nextents == ip->i_df.if_bytes /
+					(uint)sizeof(xfs_bmbt_rec_t)) {
 				/*
 				 * There are no delayed allocation
 				 * extents, so just point to the
 				 * real extents array.
 				 */
-				vecp->i_addr =
-					(char *)(ip->i_df.if_u1.if_extents);
+				vecp->i_addr = ip->i_df.if_u1.if_extents;
 				vecp->i_len = ip->i_df.if_bytes;
 				vecp->i_type = XLOG_REG_TYPE_IEXT;
 			} else
@@ -352,7 +332,7 @@ xfs_inode_item_format(
 				ext_buffer = kmem_alloc(ip->i_df.if_bytes,
 						KM_SLEEP);
 				iip->ili_extents_buf = ext_buffer;
-				vecp->i_addr = (xfs_caddr_t)ext_buffer;
+				vecp->i_addr = ext_buffer;
 				vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 						XFS_DATA_FORK);
 				vecp->i_type = XLOG_REG_TYPE_IEXT;
@@ -371,7 +351,7 @@ xfs_inode_item_format(
 		if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
 			ASSERT(ip->i_df.if_broot_bytes > 0);
 			ASSERT(ip->i_df.if_broot != NULL);
-			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
+			vecp->i_addr = ip->i_df.if_broot;
 			vecp->i_len = ip->i_df.if_broot_bytes;
 			vecp->i_type = XLOG_REG_TYPE_IBROOT;
 			vecp++;
@@ -389,7 +369,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_df.if_u1.if_data != NULL);
 			ASSERT(ip->i_d.di_size > 0);
 
-			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
+			vecp->i_addr = ip->i_df.if_u1.if_data;
 			/*
 			 * Round i_bytes up to a word boundary.
 			 * The underlying memory is guaranteed to
@@ -437,7 +417,7 @@ xfs_inode_item_format(
 	 * Assert that no attribute-related log flags are set.
 	 */
 	if (!XFS_IFORK_Q(ip)) {
-		ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+		ASSERT(nvecs == lip->li_desc->lid_size);
 		iip->ili_format.ilf_size = nvecs;
 		ASSERT(!(iip->ili_format.ilf_fields &
 			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
@@ -449,21 +429,21 @@ xfs_inode_item_format(
 		ASSERT(!(iip->ili_format.ilf_fields &
 			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
 		if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
-			ASSERT(ip->i_afp->if_bytes > 0);
-			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
-			ASSERT(ip->i_d.di_anextents > 0);
 #ifdef DEBUG
-			nrecs = ip->i_afp->if_bytes /
+			int nrecs = ip->i_afp->if_bytes /
 				(uint)sizeof(xfs_bmbt_rec_t);
-#endif
 			ASSERT(nrecs > 0);
 			ASSERT(nrecs == ip->i_d.di_anextents);
+			ASSERT(ip->i_afp->if_bytes > 0);
+			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+			ASSERT(ip->i_d.di_anextents > 0);
+#endif
 #ifdef XFS_NATIVE_HOST
 			/*
 			 * There are not delayed allocation extents
 			 * for attributes, so just point at the array.
 			 */
-			vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
+			vecp->i_addr = ip->i_afp->if_u1.if_extents;
 			vecp->i_len = ip->i_afp->if_bytes;
 #else
 			ASSERT(iip->ili_aextents_buf == NULL);
@@ -473,7 +453,7 @@ xfs_inode_item_format(
 			ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
 						KM_SLEEP);
 			iip->ili_aextents_buf = ext_buffer;
-			vecp->i_addr = (xfs_caddr_t)ext_buffer;
+			vecp->i_addr = ext_buffer;
 			vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 					XFS_ATTR_FORK);
 #endif
@@ -490,7 +470,7 @@ xfs_inode_item_format(
 		if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
 			ASSERT(ip->i_afp->if_broot_bytes > 0);
 			ASSERT(ip->i_afp->if_broot != NULL);
-			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
+			vecp->i_addr = ip->i_afp->if_broot;
 			vecp->i_len = ip->i_afp->if_broot_bytes;
 			vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
 			vecp++;
@@ -506,7 +486,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_afp->if_bytes > 0);
 			ASSERT(ip->i_afp->if_u1.if_data != NULL);
 
-			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
+			vecp->i_addr = ip->i_afp->if_u1.if_data;
 			/*
 			 * Round i_bytes up to a word boundary.
 			 * The underlying memory is guaranteed to
@@ -528,7 +508,7 @@ xfs_inode_item_format(
 		break;
 	}
 
-	ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+	ASSERT(nvecs == lip->li_desc->lid_size);
 	iip->ili_format.ilf_size = nvecs;
 }
 
@@ -539,12 +519,14 @@ xfs_inode_item_format(
 */
 STATIC void
 xfs_inode_item_pin(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
+	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
-	trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
-	atomic_inc(&iip->ili_inode->i_pincount);
+	trace_xfs_inode_pin(ip, _RET_IP_);
+	atomic_inc(&ip->i_pincount);
 }
 
 
@@ -554,12 +536,12 @@ xfs_inode_item_pin(
 *
 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
 */
-/* ARGSUSED */
 STATIC void
 xfs_inode_item_unpin(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	struct xfs_inode	*ip = iip->ili_inode;
+	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;
 
 	trace_xfs_inode_unpin(ip, _RET_IP_);
 	ASSERT(atomic_read(&ip->i_pincount) > 0);
@@ -567,15 +549,6 @@ xfs_inode_item_unpin(
 	wake_up(&ip->i_ipin_wait);
 }
 
-/* ARGSUSED */
-STATIC void
-xfs_inode_item_unpin_remove(
-	xfs_inode_log_item_t	*iip,
-	xfs_trans_t		*tp)
-{
-	xfs_inode_item_unpin(iip);
-}
-
 /*
  * This is called to attempt to lock the inode associated with this
  * inode log item, in preparation for the push routine which does the actual
@@ -591,19 +564,16 @@ xfs_inode_item_unpin_remove(
 */
 STATIC uint
 xfs_inode_item_trylock(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	register xfs_inode_t	*ip;
-
-	ip = iip->ili_inode;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
 
-	if (xfs_ipincount(ip) > 0) {
+	if (xfs_ipincount(ip) > 0)
 		return XFS_ITEM_PINNED;
-	}
 
-	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 		return XFS_ITEM_LOCKED;
-	}
 
 	if (!xfs_iflock_nowait(ip)) {
 		/*
@@ -629,7 +599,7 @@ xfs_inode_item_trylock(
 	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		ASSERT(iip->ili_format.ilf_fields != 0);
 		ASSERT(iip->ili_logged == 0);
-		ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
+		ASSERT(lip->li_flags & XFS_LI_IN_AIL);
 	}
 #endif
 	return XFS_ITEM_SUCCESS;
@@ -643,26 +613,18 @@ xfs_inode_item_trylock(
 */
 STATIC void
 xfs_inode_item_unlock(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	uint		hold;
-	uint		iolocked;
-	uint		lock_flags;
-	xfs_inode_t	*ip;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	unsigned short		lock_flags;
 
-	ASSERT(iip != NULL);
 	ASSERT(iip->ili_inode->i_itemp != NULL);
 	ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
-	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
-		  XFS_ILI_IOLOCKED_EXCL)) ||
-	       xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL));
-	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
-		  XFS_ILI_IOLOCKED_SHARED)) ||
-	       xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED));
+
 	/*
 	 * Clear the transaction pointer in the inode.
 	 */
-	ip = iip->ili_inode;
 	ip->i_transp = NULL;
 
 	/*
@@ -686,34 +648,11 @@ xfs_inode_item_unlock(
 		iip->ili_aextents_buf = NULL;
 	}
 
-	/*
-	 * Figure out if we should unlock the inode or not.
-	 */
-	hold = iip->ili_flags & XFS_ILI_HOLD;
-
-	/*
-	 * Before clearing out the flags, remember whether we
-	 * are holding the inode's IO lock.
-	 */
-	iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
-
-	/*
-	 * Clear out the fields of the inode log item particular
-	 * to the current transaction.
-	 */
-	iip->ili_flags = 0;
-
-	/*
-	 * Unlock the inode if XFS_ILI_HOLD was not set.
-	 */
-	if (!hold) {
-		lock_flags = XFS_ILOCK_EXCL;
-		if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
-			lock_flags |= XFS_IOLOCK_EXCL;
-		} else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
-			lock_flags |= XFS_IOLOCK_SHARED;
-		}
-		xfs_iput(iip->ili_inode, lock_flags);
+	lock_flags = iip->ili_lock_flags;
+	iip->ili_lock_flags = 0;
+	if (lock_flags) {
+		xfs_iunlock(iip->ili_inode, lock_flags);
+		IRELE(iip->ili_inode);
 	}
 }
 
@@ -725,13 +664,12 @@ xfs_inode_item_unlock(
 * is the only one that matters.  Therefore, simply return the
 * given lsn.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
 xfs_inode_item_committed(
-	xfs_inode_log_item_t	*iip,
+	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn)
 {
-	return (lsn);
+	return lsn;
 }
 
 /*
@@ -743,13 +681,12 @@ xfs_inode_item_committed(
 */
 STATIC void
 xfs_inode_item_pushbuf(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	xfs_inode_t	*ip;
-	xfs_mount_t	*mp;
-	xfs_buf_t	*bp;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	struct xfs_buf		*bp;
 
-	ip = iip->ili_inode;
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
 
 	/*
@@ -757,14 +694,13 @@ xfs_inode_item_pushbuf(
 	 * inode was taken off the AIL. So, just get out.
 	 */
 	if (completion_done(&ip->i_flush) ||
-	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+	    !(lip->li_flags & XFS_LI_IN_AIL)) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		return;
 	}
 
-	mp = ip->i_mount;
-	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
-			iip->ili_format.ilf_len, XBF_TRYLOCK);
+	bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
+			iip->ili_format.ilf_len, XBF_TRYLOCK);
 
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	if (!bp)
@@ -772,10 +708,8 @@ xfs_inode_item_pushbuf(
 	if (XFS_BUF_ISDELAYWRITE(bp))
 		xfs_buf_delwri_promote(bp);
 	xfs_buf_relse(bp);
-	return;
 }
 
-
 /*
  * This is called to asynchronously write the inode associated with this
  * inode log item out to disk. The inode will already have been locked by
@@ -783,14 +717,14 @@ xfs_inode_item_pushbuf(
 */
 STATIC void
 xfs_inode_item_push(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	xfs_inode_t	*ip;
-
-	ip = iip->ili_inode;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
 	ASSERT(!completion_done(&ip->i_flush));
+
 	/*
 	 * Since we were able to lock the inode's flush lock and
 	 * we found it on the AIL, the inode must be dirty. This
@@ -813,43 +747,34 @@ xfs_inode_item_push(
 	 */
 	(void) xfs_iflush(ip, 0);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	return;
 }
 
 /*
 * XXX rcc - this one really has to do something.  Probably needs
 * to stamp in a new field in the incore inode.
 */
-/* ARGSUSED */
 STATIC void
 xfs_inode_item_committing(
-	xfs_inode_log_item_t	*iip,
+	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn)
 {
-	iip->ili_last_lsn = lsn;
-	return;
+	INODE_ITEM(lip)->ili_last_lsn = lsn;
 }
 
 /*
 * This is the ops vector shared by all buf log items.
 */
 static struct xfs_item_ops xfs_inode_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_inode_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
-					xfs_inode_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_inode_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_inode_item_push,
-	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_inode_item_committing
+	.iop_size	= xfs_inode_item_size,
+	.iop_format	= xfs_inode_item_format,
+	.iop_pin	= xfs_inode_item_pin,
+	.iop_unpin	= xfs_inode_item_unpin,
+	.iop_trylock	= xfs_inode_item_trylock,
+	.iop_unlock	= xfs_inode_item_unlock,
+	.iop_committed	= xfs_inode_item_committed,
+	.iop_push	= xfs_inode_item_push,
+	.iop_pushbuf	= xfs_inode_item_pushbuf,
+	.iop_committing = xfs_inode_item_committing
 };
 
 
@@ -858,10 +783,10 @@ static struct xfs_item_ops xfs_inode_item_ops = {
 */
 void
 xfs_inode_item_init(
-	xfs_inode_t	*ip,
-	xfs_mount_t	*mp)
+	struct xfs_inode	*ip,
+	struct xfs_mount	*mp)
 {
-	xfs_inode_log_item_t	*iip;
+	struct xfs_inode_log_item *iip;
 
 	ASSERT(ip->i_itemp == NULL);
 	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
@@ -899,14 +824,14 @@ xfs_inode_item_destroy(
 * from the AIL if it has not been re-logged, and unlocking the inode's
 * flush lock.
 */
-/*ARGSUSED*/
 void
 xfs_iflush_done(
-	xfs_buf_t		*bp,
-	xfs_inode_log_item_t	*iip)
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
 {
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	xfs_inode_t		*ip = iip->ili_inode;
-	struct xfs_ail		*ailp = iip->ili_item.li_ailp;
+	struct xfs_ail		*ailp = lip->li_ailp;
 
 	/*
 	 * We only want to pull the item from the AIL if it is
@@ -917,12 +842,11 @@ xfs_iflush_done(
 	 * the lock since it's cheaper, and then we recheck while
 	 * holding the lock before removing the inode from the AIL.
 	 */
-	if (iip->ili_logged &&
-	    (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
+	if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
 		spin_lock(&ailp->xa_lock);
-		if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
+		if (lip->li_lsn == iip->ili_flush_lsn) {
 			/* xfs_trans_ail_delete() drops the AIL lock. */
-			xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
+			xfs_trans_ail_delete(ailp, lip);
 		} else {
 			spin_unlock(&ailp->xa_lock);
 		}
@@ -940,8 +864,6 @@ xfs_iflush_done(
 	 * Release the inode's flush lock since we're done with it.
 	 */
 	xfs_ifunlock(ip);
-
-	return;
 }
 
 /*
@@ -957,10 +879,8 @@ xfs_iflush_abort(
 	xfs_inode_t		*ip)
 {
 	xfs_inode_log_item_t	*iip = ip->i_itemp;
-	xfs_mount_t		*mp;
 
 	iip = ip->i_itemp;
-	mp = ip->i_mount;
 	if (iip) {
 		struct xfs_ail	*ailp = iip->ili_item.li_ailp;
 		if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
@@ -991,10 +911,10 @@ xfs_iflush_abort(
 
 void
 xfs_istale_done(
-	xfs_buf_t		*bp,
-	xfs_inode_log_item_t	*iip)
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
 {
-	xfs_iflush_abort(iip->ili_inode);
+	xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
 }
 
 /*
@@ -1007,9 +927,8 @@ xfs_inode_item_format_convert(
 	xfs_inode_log_format_t	*in_f)
 {
 	if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
-		xfs_inode_log_format_32_t *in_f32;
+		xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
 
-		in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
 		in_f->ilf_type = in_f32->ilf_type;
 		in_f->ilf_size = in_f32->ilf_size;
 		in_f->ilf_fields = in_f32->ilf_fields;
@@ -1025,9 +944,8 @@ xfs_inode_item_format_convert(
 		in_f->ilf_boffset = in_f32->ilf_boffset;
 		return 0;
 	} else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
-		xfs_inode_log_format_64_t *in_f64;
+		xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
 
-		in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
 		in_f->ilf_type = in_f64->ilf_type;
 		in_f->ilf_size = in_f64->ilf_size;
 		in_f->ilf_fields = in_f64->ilf_fields;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9a467958ecd..d3dee61e6d9 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -103,12 +103,6 @@ typedef struct xfs_inode_log_format_64 {
 				 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
 				 XFS_ILOG_ABROOT)
 
-#define	XFS_ILI_HOLD		0x1
-#define	XFS_ILI_IOLOCKED_EXCL	0x2
-#define	XFS_ILI_IOLOCKED_SHARED	0x4
-
-#define	XFS_ILI_IOLOCKED_ANY	(XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
-
 static inline int xfs_ilog_fbroot(int w)
 {
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
@@ -137,7 +131,7 @@ typedef struct xfs_inode_log_item {
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
-	unsigned short		ili_flags;	   /* misc flags */
+	unsigned short		ili_lock_flags;	   /* lock flags */
 	unsigned short		ili_logged;	   /* flushed logged data */
 	unsigned int		ili_last_fields;   /* fields when flushed */
 	struct xfs_bmbt_rec	*ili_extents_buf;  /* array of logged
@@ -161,8 +155,8 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
 
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
 extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
-extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
+extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
 extern void xfs_iflush_abort(struct xfs_inode *);
 extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
 					 xfs_inode_log_format_t *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ef14943829d..20576146369 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -23,19 +23,14 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -123,7 +118,7 @@ xfs_iomap(
 		error = xfs_bmapi(NULL, ip, offset_fsb,
 				(xfs_filblks_t)(end_fsb - offset_fsb),
 				bmapi_flags, NULL, 0, imap,
-				nimaps, NULL, NULL);
+				nimaps, NULL);
 
 		if (error)
 			goto out;
@@ -138,7 +133,7 @@ xfs_iomap(
 		break;
 	}
 
-	if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
+	if (flags & BMAPI_DIRECT) {
 		error = xfs_iomap_write_direct(ip, offset, count, flags,
 					       imap, nimaps);
 	} else {
@@ -247,7 +242,7 @@ xfs_iomap_write_direct(
 	xfs_off_t	offset,
 	size_t		count,
 	int		flags,
-	xfs_bmbt_irec_t *ret_imap,
+	xfs_bmbt_irec_t *imap,
 	int		*nmaps)
 {
 	xfs_mount_t	*mp = ip->i_mount;
@@ -261,7 +256,6 @@ xfs_iomap_write_direct(
 	int		quota_flag;
 	int		rt;
 	xfs_trans_t	*tp;
-	xfs_bmbt_irec_t imap;
 	xfs_bmap_free_t free_list;
 	uint		qblocks, resblks, resrtextents;
 	int		committed;
@@ -285,10 +279,10 @@ xfs_iomap_write_direct(
 		if (error)
 			goto error_out;
 	} else {
-		if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
+		if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
-					ret_imap->br_blockcount +
-					ret_imap->br_startoff);
+					imap->br_blockcount +
+					imap->br_startoff);
 	}
 	count_fsb = last_fsb - offset_fsb;
 	ASSERT(count_fsb > 0);
@@ -334,20 +328,22 @@ xfs_iomap_write_direct(
 	if (error)
 		goto error1;
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	bmapi_flag = XFS_BMAPI_WRITE;
 	if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
 		bmapi_flag |= XFS_BMAPI_PREALLOC;
 
 	/*
-	 * Issue the xfs_bmapi() call to allocate the blocks
+	 * Issue the xfs_bmapi() call to allocate the blocks.
+	 *
+	 * From this point onwards we overwrite the imap pointer that the
+	 * caller gave to us.
 	 */
 	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
 	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
-		&firstfsb, 0, &imap, &nimaps, &free_list, NULL);
+		&firstfsb, 0, imap, &nimaps, &free_list);
 	if (error)
 		goto error0;
 
@@ -369,12 +365,11 @@ xfs_iomap_write_direct(
 		goto error_out;
 	}
 
-	if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) {
-		error = xfs_cmn_err_fsblock_zero(ip, &imap);
+	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+		error = xfs_cmn_err_fsblock_zero(ip, imap);
 		goto error_out;
 	}
 
-	*ret_imap = imap;
 	*nmaps = 1;
 	return 0;
 
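With the local xfs_bmbt_irec_t copy gone, xfs_iomap_write_direct() now writes the allocation result straight into the mapping its caller passed in. A sketch of the resulting in/out contract, using hypothetical surrounding code rather than anything taken from this diff:

	struct xfs_bmbt_irec	imap;	/* carries any prior read-mapping in */
	int			nimaps = 1;
	int			error;

	error = xfs_iomap_write_direct(ip, offset, count, flags, &imap, &nimaps);
	if (!error) {
		/* imap has been overwritten and now describes the real
		 * extent allocated to back the requested range */
		ASSERT(nimaps == 1);
	}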
@@ -425,7 +420,7 @@ xfs_iomap_eof_want_preallocate(
 		imaps = nimaps;
 		firstblock = NULLFSBLOCK;
 		error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
-				  &firstblock, 0, imap, &imaps, NULL, NULL);
+				  &firstblock, 0, imap, &imaps, NULL);
 		if (error)
 			return error;
 		for (n = 0; n < imaps; n++) {
@@ -500,7 +495,7 @@ retry:
 			(xfs_filblks_t)(last_fsb - offset_fsb),
 			XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
 			XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-			&nimaps, NULL, NULL);
+			&nimaps, NULL);
 	if (error && (error != ENOSPC))
 		return XFS_ERROR(error);
 
@@ -548,7 +543,7 @@ xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
-	xfs_bmbt_irec_t *map,
+	xfs_bmbt_irec_t *imap,
 	int		*retmap)
 {
 	xfs_mount_t	*mp = ip->i_mount;
@@ -557,7 +552,6 @@ xfs_iomap_write_allocate(
 	xfs_fsblock_t	first_block;
 	xfs_bmap_free_t	free_list;
 	xfs_filblks_t	count_fsb;
-	xfs_bmbt_irec_t	imap;
 	xfs_trans_t	*tp;
 	int		nimaps, committed;
 	int		error = 0;
@@ -573,8 +567,8 @@ xfs_iomap_write_allocate(
 		return XFS_ERROR(error);
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-	count_fsb = map->br_blockcount;
-	map_start_fsb = map->br_startoff;
+	count_fsb = imap->br_blockcount;
+	map_start_fsb = imap->br_startoff;
 
 	XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
 
@@ -602,8 +596,7 @@ xfs_iomap_write_allocate(
 				return XFS_ERROR(error);
 			}
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(tp, ip);
+			xfs_trans_ijoin(tp, ip);
 
 			xfs_bmap_init(&free_list, &first_block);
 
@@ -654,10 +647,15 @@ xfs_iomap_write_allocate(
 				}
 			}
 
-			/* Go get the actual blocks */
+			/*
+			 * Go get the actual blocks.
+			 *
+			 * From this point onwards we overwrite the imap
+			 * pointer that the caller gave to us.
+			 */
 			error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
 					XFS_BMAPI_WRITE, &first_block, 1,
-					&imap, &nimaps, &free_list, NULL);
+					imap, &nimaps, &free_list);
 			if (error)
 				goto trans_cancel;
 
@@ -676,13 +674,12 @@ xfs_iomap_write_allocate(
 		 * See if we were able to allocate an extent that
 		 * covers at least part of the caller's request
 		 */
-		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
-			return xfs_cmn_err_fsblock_zero(ip, &imap);
+		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+			return xfs_cmn_err_fsblock_zero(ip, imap);
 
-		if ((offset_fsb >= imap.br_startoff) &&
-		    (offset_fsb < (imap.br_startoff +
-				   imap.br_blockcount))) {
-			*map = imap;
+		if ((offset_fsb >= imap->br_startoff) &&
+		    (offset_fsb < (imap->br_startoff +
+				   imap->br_blockcount))) {
 			*retmap = 1;
 			XFS_STATS_INC(xs_xstrat_quick);
 			return 0;
@@ -692,8 +689,8 @@ xfs_iomap_write_allocate(
 		 * So far we have not mapped the requested part of the
 		 * file, just surrounding data, try again.
 		 */
-		count_fsb -= imap.br_blockcount;
-		map_start_fsb = imap.br_startoff + imap.br_blockcount;
+		count_fsb -= imap->br_blockcount;
+		map_start_fsb = imap->br_startoff + imap->br_blockcount;
 	}
 
 trans_cancel:
@@ -766,8 +763,7 @@ xfs_iomap_write_unwritten(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	/*
 	 * Modify the unwritten extent state of the buffer.
@@ -776,7 +772,7 @@ xfs_iomap_write_unwritten(
 		nimaps = 1;
 		error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
 				  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
-				  1, &imap, &nimaps, &free_list, NULL);
+				  1, &imap, &nimaps, &free_list);
 		if (error)
 			goto error_on_bmapi_transaction;
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 81ac4afd45b..7748a430f50 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,17 +18,16 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
-typedef enum {
-	/* base extent manipulation calls */
-	BMAPI_READ = (1 << 0),		/* read extents */
-	BMAPI_WRITE = (1 << 1),		/* create extents */
-	BMAPI_ALLOCATE = (1 << 2),	/* delayed allocate to real extents */
-	/* modifiers */
-	BMAPI_IGNSTATE = (1 << 4),	/* ignore unwritten state on read */
-	BMAPI_DIRECT = (1 << 5),	/* direct instead of buffered write */
-	BMAPI_MMAP = (1 << 6),		/* allocate for mmap write */
-	BMAPI_TRYLOCK = (1 << 7),	/* non-blocking request */
-} bmapi_flags_t;
+/* base extent manipulation calls */
+#define BMAPI_READ	(1 << 0)	/* read extents */
+#define BMAPI_WRITE	(1 << 1)	/* create extents */
+#define BMAPI_ALLOCATE	(1 << 2)	/* delayed allocate to real extents */
+
+/* modifiers */
+#define BMAPI_IGNSTATE	(1 << 4)	/* ignore unwritten state on read */
+#define BMAPI_DIRECT	(1 << 5)	/* direct instead of buffered write */
+#define BMAPI_MMAP	(1 << 6)	/* allocate for mmap write */
+#define BMAPI_TRYLOCK	(1 << 7)	/* non-blocking request */
 
 #define BMAPI_FLAGS \
 	{ BMAPI_READ,		"READ" }, \
@@ -36,7 +35,6 @@ typedef enum {
 	{ BMAPI_ALLOCATE,	"ALLOCATE" }, \
 	{ BMAPI_IGNSTATE,	"IGNSTATE" }, \
 	{ BMAPI_DIRECT,		"DIRECT" }, \
-	{ BMAPI_MMAP,		"MMAP" }, \
 	{ BMAPI_TRYLOCK,	"TRYLOCK" }
 
 struct xfs_inode;
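Converting the bmapi_flags_t enum to plain defines keeps the BMAPI_FLAGS name table usable by the tracing code; the { value, "NAME" } pairs are the shape the ftrace flag decoder expects. A sketch of how a tracepoint format string might consume it, assuming a call site of this kind exists in xfs_trace.h (none is shown in this diff):

	/* inside a hypothetical TP_printk() format clause */
	__print_flags(__entry->flags, "|", BMAPI_FLAGS)
	/* e.g. flags == BMAPI_WRITE|BMAPI_DIRECT prints as "WRITE|DIRECT" */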
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2b86f861051..dc1882adaf5 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -24,20 +24,17 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
 #include "xfs_btree.h"
+#include "xfs_trace.h"
 
 STATIC int
 xfs_internal_inum(
@@ -95,7 +92,8 @@ xfs_bulkstat_one_int(
 	 * further change.
 	 */
 	buf->bs_nlink = dic->di_nlink;
-	buf->bs_projid = dic->di_projid;
+	buf->bs_projid_lo = dic->di_projid_lo;
+	buf->bs_projid_hi = dic->di_projid_hi;
 	buf->bs_ino = ino;
 	buf->bs_mode = dic->di_mode;
 	buf->bs_uid = dic->di_uid;
@@ -143,7 +141,8 @@ xfs_bulkstat_one_int(
 		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
 		break;
 	}
-	xfs_iput(ip, XFS_ILOCK_SHARED);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	IRELE(ip);
 
 	error = formatter(buffer, ubsize, ubused, buf);
 
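The xfs_iput() call is open-coded here as an explicit unlock followed by a reference drop. Reconstructed from how the call site changes, the wrapper being dissolved was presumably equivalent to the following sketch; the original definition is not part of this diff:

void
xfs_iput(
	xfs_inode_t	*ip,
	uint		lock_flags)
{
	/* release the held lock flags first, then the inode reference */
	xfs_iunlock(ip, lock_flags);
	IRELE(ip);
}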
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 5215abc8023..cee4ab9f8a9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,8 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_log_priv.h"
@@ -35,8 +33,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_log_recover.h"
 #include "xfs_trans_priv.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_rw.h"
@@ -337,7 +333,6 @@ xfs_log_reserve(
 	int		retval = 0;
 
 	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
 
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return XFS_ERROR(EIO);
@@ -552,7 +547,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		.magic = XLOG_UNMOUNT_TYPE,
 	};
 	struct xfs_log_iovec reg = {
-		.i_addr = (void *)&magic,
+		.i_addr = &magic,
 		.i_len = sizeof(magic),
 		.i_type = XLOG_REG_TYPE_UNMOUNT,
 	};
@@ -922,19 +917,6 @@ xlog_iodone(xfs_buf_t *bp)
 	l = iclog->ic_log;
 
 	/*
-	 * If the _XFS_BARRIER_FAILED flag was set by a lower
-	 * layer, it means the underlying device no longer supports
-	 * barrier I/O. Warn loudly and turn off barriers.
-	 */
-	if (bp->b_flags & _XFS_BARRIER_FAILED) {
-		bp->b_flags &= ~_XFS_BARRIER_FAILED;
-		l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		xfs_fs_cmn_err(CE_WARN, l->l_mp,
-			"xlog_iodone: Barriers are no longer supported"
-			" by device. Disabling barriers\n");
-	}
-
-	/*
 	 * Race to shutdown the filesystem if we see an error.
 	 */
 	if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
@@ -1047,7 +1029,6 @@ xlog_alloc_log(xfs_mount_t *mp,
 	xlog_in_core_t		*iclog, *prev_iclog=NULL;
 	xfs_buf_t		*bp;
 	int			i;
-	int			iclogsize;
 	int			error = ENOMEM;
 	uint			log2_size = 0;
 
@@ -1127,7 +1108,6 @@ xlog_alloc_log(xfs_mount_t *mp,
 	 * with different amounts of memory.  See the definition of
 	 * xlog_in_core_t in xfs_log_priv.h for details.
 	 */
-	iclogsize = log->l_iclog_size;
 	ASSERT(log->l_iclog_size >= 4096);
 	for (i=0; i < log->l_iclog_bufs; i++) {
 		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
@@ -1138,7 +1118,8 @@ xlog_alloc_log(xfs_mount_t *mp,
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
 
-		bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+		bp = xfs_buf_get_uncached(mp->m_logdev_targp,
+						log->l_iclog_size, 0);
 		if (!bp)
 			goto out_free_iclog;
 		if (!XFS_BUF_CPSEMA(bp))
@@ -1316,7 +1297,7 @@ xlog_bdstrat(
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		XFS_BUF_ERROR(bp, EIO);
 		XFS_BUF_STALE(bp);
-		xfs_biodone(bp);
+		xfs_buf_ioend(bp, 0);
 		/*
 		 * It would seem logical to return EIO here, but we rely on
 		 * the log state machine to propagate I/O errors instead of
@@ -1428,11 +1409,8 @@ xlog_sync(xlog_t *log,
 	XFS_BUF_BUSY(bp);
 	XFS_BUF_ASYNC(bp);
 	bp->b_flags |= XBF_LOG_BUFFER;
-	/*
-	 * Do an ordered write for the log block.
-	 * Its unnecessary to flush the first split block in the log wrap case.
-	 */
-	if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER))
+
+	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
 		XFS_BUF_ORDERED(bp);
 
 	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
@@ -3025,7 +3003,8 @@ _xfs_log_force(
 
 	XFS_STATS_INC(xs_log_force);
 
-	xlog_cil_push(log, 1);
+	if (log->l_cilp)
+		xlog_cil_force(log);
 
 	spin_lock(&log->l_icloglock);
 
@@ -3177,7 +3156,7 @@ _xfs_log_force_lsn(
 	XFS_STATS_INC(xs_log_force);
 
 	if (log->l_cilp) {
-		lsn = xlog_cil_push_lsn(log, lsn);
+		lsn = xlog_cil_force_lsn(log, lsn);
 		if (lsn == NULLCOMMITLSN)
 			return 0;
 	}
@@ -3734,7 +3713,7 @@ xfs_log_force_umount(
 	 * call below.
 	 */
 	if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
-		xlog_cil_push(log, 1);
+		xlog_cil_force(log);
 
 	/*
 	 * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 04c78e642cc..916eb7db14d 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -55,14 +55,10 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 /*
  * Flags to xfs_log_reserve()
  *
- * XFS_LOG_SLEEP:	If space is not available, sleep (default)
- * XFS_LOG_NOSLEEP:	If space is not available, return error
  * XFS_LOG_PERM_RESERV: Permanent reservation.  When writes are
  *	performed against this type of reservation, the reservation
  *	is not decreased.  Long running transactions should use this.
  */
-#define XFS_LOG_SLEEP		0x0
-#define XFS_LOG_NOSLEEP		0x1
 #define XFS_LOG_PERM_RESERV	0x2
 
 /*
@@ -104,7 +100,7 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XLOG_REG_TYPE_MAX	19
 
 typedef struct xfs_log_iovec {
-	xfs_caddr_t	i_addr;		/* beginning address of region */
+	void		*i_addr;	/* beginning address of region */
 	int		i_len;		/* length in bytes of region */
 	uint		i_type;		/* type of region */
 } xfs_log_iovec_t;
@@ -201,9 +197,4 @@ int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 #endif
-
-
-extern int xlog_debug;		/* set to 1 to enable real log */
-
-
 #endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index bb17cc044bf..23d6ceb5e97 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -26,8 +26,6 @@
 #include "xfs_log_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_alloc.h"
@@ -70,6 +68,7 @@ xlog_cil_init(
 	ctx->sequence = 1;
 	ctx->cil = cil;
 	cil->xc_ctx = ctx;
+	cil->xc_current_sequence = ctx->sequence;
 
 	cil->xc_log = log;
 	log->l_cilp = cil;
@@ -147,102 +146,6 @@ xlog_cil_init_post_recovery(
 }
 
 /*
- * Insert the log item into the CIL and calculate the difference in space
- * consumed by the item. Add the space to the checkpoint ticket and calculate
- * if the change requires additional log metadata. If it does, take that space
- * as well. Remove the amount of space we addded to the checkpoint ticket from
- * the current transaction ticket so that the accounting works out correctly.
- *
- * If this is the first time the item is being placed into the CIL in this
- * context, pin it so it can't be written to disk until the CIL is flushed to
- * the iclog and the iclog written to disk.
- */
-static void
-xlog_cil_insert(
-	struct log		*log,
-	struct xlog_ticket	*ticket,
-	struct xfs_log_item	*item,
-	struct xfs_log_vec	*lv)
-{
-	struct xfs_cil		*cil = log->l_cilp;
-	struct xfs_log_vec	*old = lv->lv_item->li_lv;
-	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
-	int			len;
-	int			diff_iovecs;
-	int			iclog_space;
-
-	if (old) {
-		/* existing lv on log item, space used is a delta */
-		ASSERT(!list_empty(&item->li_cil));
-		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
-
-		len = lv->lv_buf_len - old->lv_buf_len;
-		diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
-		kmem_free(old->lv_buf);
-		kmem_free(old);
-	} else {
-		/* new lv, must pin the log item */
-		ASSERT(!lv->lv_item->li_lv);
-		ASSERT(list_empty(&item->li_cil));
-
-		len = lv->lv_buf_len;
-		diff_iovecs = lv->lv_niovecs;
-		IOP_PIN(lv->lv_item);
-
-	}
-	len += diff_iovecs * sizeof(xlog_op_header_t);
-
-	/* attach new log vector to log item */
-	lv->lv_item->li_lv = lv;
-
-	spin_lock(&cil->xc_cil_lock);
-	list_move_tail(&item->li_cil, &cil->xc_cil);
-	ctx->nvecs += diff_iovecs;
-
-	/*
-	 * If this is the first time the item is being committed to the CIL,
-	 * store the sequence number on the log item so we can tell
-	 * in future commits whether this is the first checkpoint the item is
-	 * being committed into.
-	 */
-	if (!item->li_seq)
-		item->li_seq = ctx->sequence;
-
-	/*
-	 * Now transfer enough transaction reservation to the context ticket
-	 * for the checkpoint. The context ticket is special - the unit
-	 * reservation has to grow as well as the current reservation as we
-	 * steal from tickets so we can correctly determine the space used
-	 * during the transaction commit.
-	 */
-	if (ctx->ticket->t_curr_res == 0) {
-		/* first commit in checkpoint, steal the header reservation */
-		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
-		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
-		ticket->t_curr_res -= ctx->ticket->t_unit_res;
-	}
-
-	/* do we need space for more log record headers? */
-	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
-	if (len > 0 && (ctx->space_used / iclog_space !=
-				(ctx->space_used + len) / iclog_space)) {
-		int hdrs;
-
-		hdrs = (len + iclog_space - 1) / iclog_space;
-		/* need to take into account split region headers, too */
-		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
-		ctx->ticket->t_unit_res += hdrs;
-		ctx->ticket->t_curr_res += hdrs;
-		ticket->t_curr_res -= hdrs;
-		ASSERT(ticket->t_curr_res >= len);
-	}
-	ticket->t_curr_res -= len;
-	ctx->space_used += len;
-
-	spin_unlock(&cil->xc_cil_lock);
-}
-
-/*
  * Format log items into flat buffers
  *
  * For delayed logging, we need to hold a formatted buffer containing all the
@@ -271,15 +174,10 @@ xlog_cil_insert(
 static void
 xlog_cil_format_items(
 	struct log		*log,
-	struct xfs_log_vec	*log_vector,
-	struct xlog_ticket	*ticket,
-	xfs_lsn_t		*start_lsn)
+	struct xfs_log_vec	*log_vector)
 {
 	struct xfs_log_vec *lv;
 
-	if (start_lsn)
-		*start_lsn = log->l_cilp->xc_ctx->sequence;
-
 	ASSERT(log_vector);
 	for (lv = log_vector; lv; lv = lv->lv_next) {
 		void	*ptr;
@@ -292,7 +190,7 @@ xlog_cil_format_items(
 			len += lv->lv_iovecp[index].i_len;
 
 		lv->lv_buf_len = len;
-		lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+		lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
 		ptr = lv->lv_buf;
 
 		for (index = 0; index < lv->lv_niovecs; index++) {
@@ -303,97 +201,153 @@ xlog_cil_format_items(
 			ptr += vec->i_len;
 		}
 		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
-
-		xlog_cil_insert(log, ticket, lv->lv_item, lv);
 	}
 }
 
-static void
-xlog_cil_free_logvec(
-	struct xfs_log_vec	*log_vector)
-{
-	struct xfs_log_vec	*lv;
-
-	for (lv = log_vector; lv; ) {
-		struct xfs_log_vec *next = lv->lv_next;
-		kmem_free(lv->lv_buf);
-		kmem_free(lv);
-		lv = next;
-	}
-}
-
-/*
- * Commit a transaction with the given vector to the Committed Item List.
- *
- * To do this, we need to format the item, pin it in memory if required and
- * account for the space used by the transaction. Once we have done that we
- * need to release the unused reservation for the transaction, attach the
- * transaction to the checkpoint context so we carry the busy extents through
- * to checkpoint completion, and then unlock all the items in the transaction.
- *
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
- * Called with the context lock already held in read mode to lock out
- * background commit, returns without it held once background commits are
- * allowed again.
- */
-int
-xfs_log_commit_cil(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	struct xfs_log_vec	*log_vector,
-	xfs_lsn_t		*commit_lsn,
-	int			flags)
-{
-	struct log		*log = mp->m_log;
-	int			log_flags = 0;
-	int			push = 0;
-
-	if (flags & XFS_TRANS_RELEASE_LOG_RES)
-		log_flags = XFS_LOG_REL_PERM_RESERV;
-
-	if (XLOG_FORCED_SHUTDOWN(log)) {
-		xlog_cil_free_logvec(log_vector);
-		return XFS_ERROR(EIO);
-	}
-
-	/* lock out background commit */
-	down_read(&log->l_cilp->xc_ctx_lock);
-	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
-
-	/* check we didn't blow the reservation */
-	if (tp->t_ticket->t_curr_res < 0)
-		xlog_print_tic_res(log->l_mp, tp->t_ticket);
-
-	/* attach the transaction to the CIL if it has any busy extents */
-	if (!list_empty(&tp->t_busy)) {
-		spin_lock(&log->l_cilp->xc_cil_lock);
-		list_splice_init(&tp->t_busy,
-					&log->l_cilp->xc_ctx->busy_extents);
-		spin_unlock(&log->l_cilp->xc_cil_lock);
-	}
-
-	tp->t_commit_lsn = *commit_lsn;
-	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-	xfs_trans_unreserve_and_mod_sb(tp);
-
-	/* check for background commit before unlock */
-	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
-		push = 1;
-	up_read(&log->l_cilp->xc_ctx_lock);
-
-	/*
-	 * We need to push CIL every so often so we don't cache more than we
-	 * can fit in the log. The limit really is that a checkpoint can't be
-	 * more than half the log (the current checkpoint is not allowed to
-	 * overwrite the previous checkpoint), but commit latency and memory
-	 * usage limit this to a smaller size in most cases.
-	 */
-	if (push)
-		xlog_cil_push(log, 0);
-	return 0;
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+	struct log		*log,
+	struct xfs_log_vec	*lv,
+	int			*len,
+	int			*diff_iovecs)
+{
+	struct xfs_log_vec	*old = lv->lv_item->li_lv;
+
+	if (old) {
+		/* existing lv on log item, space used is a delta */
+		ASSERT(!list_empty(&lv->lv_item->li_cil));
+		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+		*len += lv->lv_buf_len - old->lv_buf_len;
+		*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
+		kmem_free(old->lv_buf);
+		kmem_free(old);
+	} else {
+		/* new lv, must pin the log item */
+		ASSERT(!lv->lv_item->li_lv);
+		ASSERT(list_empty(&lv->lv_item->li_cil));
+
+		*len += lv->lv_buf_len;
+		*diff_iovecs += lv->lv_niovecs;
+		IOP_PIN(lv->lv_item);
+
+	}
+
+	/* attach new log vector to log item */
+	lv->lv_item->li_lv = lv;
+
+	/*
+	 * If this is the first time the item is being committed to the
+	 * CIL, store the sequence number on the log item so we can
+	 * tell in future commits whether this is the first checkpoint
+	 * the item is being committed into.
+	 */
+	if (!lv->lv_item->li_seq)
+		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
+/*
+ * Insert the log items into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we added to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ */
+static void
+xlog_cil_insert_items(
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*ticket)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
+	struct xfs_log_vec	*lv;
+	int			len = 0;
+	int			diff_iovecs = 0;
+	int			iclog_space;
+
+	ASSERT(log_vector);
+
+	/*
+	 * Do all the accounting aggregation and switching of log vectors
+	 * around in a separate loop to the insertion of items into the CIL.
+	 * Then we can do a separate loop to update the CIL within a single
+	 * lock/unlock pair. This reduces the number of round trips on the CIL
+	 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
+	 * hold time for the transaction commit.
+	 *
+	 * If this is the first time the item is being placed into the CIL in
+	 * this context, pin it so it can't be written to disk until the CIL is
+	 * flushed to the iclog and the iclog written to disk.
+	 *
+	 * We can do this safely because the context can't checkpoint until we
+	 * are done so it doesn't matter exactly how we update the CIL.
+	 */
+	for (lv = log_vector; lv; lv = lv->lv_next)
+		xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+
+	/* account for space used by new iovec headers  */
+	len += diff_iovecs * sizeof(xlog_op_header_t);
+
+	spin_lock(&cil->xc_cil_lock);
+
+	/* move the items to the tail of the CIL */
+	for (lv = log_vector; lv; lv = lv->lv_next)
+		list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+
+	ctx->nvecs += diff_iovecs;
+
+	/*
+	 * Now transfer enough transaction reservation to the context ticket
+	 * for the checkpoint. The context ticket is special - the unit
+	 * reservation has to grow as well as the current reservation as we
+	 * steal from tickets so we can correctly determine the space used
+	 * during the transaction commit.
+	 */
+	if (ctx->ticket->t_curr_res == 0) {
+		/* first commit in checkpoint, steal the header reservation */
+		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+		ticket->t_curr_res -= ctx->ticket->t_unit_res;
+	}
+
+	/* do we need space for more log record headers? */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		ticket->t_curr_res -= hdrs;
+		ASSERT(ticket->t_curr_res >= len);
+	}
+	ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
+}
+
+static void
+xlog_cil_free_logvec(
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+
+	for (lv = log_vector; lv; ) {
+		struct xfs_log_vec *next = lv->lv_next;
+		kmem_free(lv->lv_buf);
+		kmem_free(lv);
+		lv = next;
+	}
 }
 
 /*
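The header-reservation arithmetic in xlog_cil_insert_items() is easiest to sanity-check with concrete numbers. A worked sketch with illustrative values, not figures taken from any real log:

	/* assume a 32k iclog with a 512-byte header: 32256 usable bytes */
	int	iclog_space = 32768 - 512;
	int	space_used = 30000;	/* checkpoint payload accumulated so far */
	int	len = 5000;		/* this commit's formatted length */

	/* 30000 / 32256 == 0 but 35000 / 32256 == 1, so this commit crosses
	 * an iclog boundary and one extra record header must be reserved */
	int	hdrs = (len + iclog_space - 1) / iclog_space;	/* == 1 */

	hdrs *= 512 + sizeof(struct xlog_op_header);	/* header + split op header */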
@@ -429,13 +383,23 @@ xlog_cil_committed(
 }
 
 /*
- * Push the Committed Item List to the log. If the push_now flag is not set,
- * then it is a background flush and so we can chose to ignore it.
+ * Push the Committed Item List to the log. If @push_seq flag is zero, then it
+ * is a background flush and so we can choose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allow log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
  */
-int
+STATIC int
 xlog_cil_push(
 	struct log		*log,
-	int			push_now)
+	xfs_lsn_t		push_seq)
 {
 	struct xfs_cil		*cil = log->l_cilp;
 	struct xfs_log_vec	*lv;
@@ -455,12 +419,20 @@ xlog_cil_push(
 	if (!cil)
 		return 0;
 
+	ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
+
 	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
 	new_ctx->ticket = xlog_cil_ticket_alloc(log);
 
-	/* lock out transaction commit, but don't block on background push */
+	/*
+	 * Lock out transaction commit, but don't block for background pushes
+	 * unless we are well over the CIL space limit. See the definition of
+	 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
+	 * used here.
+	 */
 	if (!down_write_trylock(&cil->xc_ctx_lock)) {
-		if (!push_now)
+		if (!push_seq &&
+		    cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
 			goto out_free_ticket;
 		down_write(&cil->xc_ctx_lock);
 	}
@@ -471,7 +443,11 @@ xlog_cil_push(
 		goto out_skip;
 
 	/* check for spurious background flush */
-	if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+	if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+		goto out_skip;
+
+	/* check for a previously pushed sequence */
+	if (push_seq && push_seq < cil->xc_ctx->sequence)
 		goto out_skip;
 
 	/*
@@ -517,6 +493,13 @@ xlog_cil_push(
 	cil->xc_ctx = new_ctx;
 
 	/*
+	 * mirror the new sequence into the cil structure so that we can do
+	 * unlocked checks against the current sequence in log forces without
+	 * risking dereferencing a freed context pointer.
+	 */
+	cil->xc_current_sequence = new_ctx->sequence;
+
+	/*
 	 * The switch is now done, so we can drop the context lock and move out
 	 * of a shared context. We can't just go straight to the commit record,
 	 * though - we need to synchronise with previous and future commits so
@@ -554,7 +537,7 @@ xlog_cil_push(
 	thdr.th_type = XFS_TRANS_CHECKPOINT;
 	thdr.th_tid = tic->t_tid;
 	thdr.th_num_items = num_iovecs;
-	lhdr.i_addr = (xfs_caddr_t)&thdr;
+	lhdr.i_addr = &thdr;
 	lhdr.i_len = sizeof(xfs_trans_header_t);
 	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
 	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
@@ -628,6 +611,105 @@ out_abort:
 }
 
 /*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	struct log		*log = mp->m_log;
+	int			log_flags = 0;
+	int			push = 0;
+
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		xlog_cil_free_logvec(log_vector);
+		return XFS_ERROR(EIO);
+	}
+
+	/*
+	 * do all the hard work of formatting items (including memory
+	 * allocation) outside the CIL context lock. This prevents stalling CIL
+	 * pushes when we are low on memory and a transaction commit spends a
+	 * lot of time in memory reclaim.
+	 */
+	xlog_cil_format_items(log, log_vector);
+
+	/* lock out background commit */
+	down_read(&log->l_cilp->xc_ctx_lock);
+	if (commit_lsn)
+		*commit_lsn = log->l_cilp->xc_ctx->sequence;
+
+	xlog_cil_insert_items(log, log_vector, tp->t_ticket);
+
+	/* check we didn't blow the reservation */
+	if (tp->t_ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+	/* attach the transaction to the CIL if it has any busy extents */
+	if (!list_empty(&tp->t_busy)) {
+		spin_lock(&log->l_cilp->xc_cil_lock);
+		list_splice_init(&tp->t_busy,
+					&log->l_cilp->xc_ctx->busy_extents);
+		spin_unlock(&log->l_cilp->xc_cil_lock);
+	}
+
+	tp->t_commit_lsn = *commit_lsn;
+	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/*
+	 * Once all the items of the transaction have been copied to the CIL,
+	 * the items can be unlocked and freed.
+	 *
+	 * This needs to be done before we drop the CIL context lock because we
+	 * have to update state in the log items and unlock them before they go
+	 * to disk. If we don't, then the CIL checkpoint can race with us and
+	 * we can run checkpoint completion before we've updated and unlocked
+	 * the log items. This affects (at least) processing of stale buffers,
+	 * inodes and EFIs.
+	 */
+	xfs_trans_free_items(tp, *commit_lsn, 0);
+
+	/* check for background commit before unlock */
+	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+		push = 1;
+
+	up_read(&log->l_cilp->xc_ctx_lock);
+
+	/*
+	 * We need to push CIL every so often so we don't cache more than we
+	 * can fit in the log. The limit really is that a checkpoint can't be
+	 * more than half the log (the current checkpoint is not allowed to
+	 * overwrite the previous checkpoint), but commit latency and memory
+	 * usage limit this to a smaller size in most cases.
+	 */
+	if (push)
+		xlog_cil_push(log, 0);
+	return 0;
+}
+
+/*
  * Conditionally push the CIL based on the sequence passed in.
  *
  * We only need to push if we haven't already pushed the sequence
@@ -641,39 +723,34 @@ out_abort:
  * commit lsn is there. It'll be empty, so this is broken for now.
  */
 xfs_lsn_t
-xlog_cil_push_lsn(
+xlog_cil_force_lsn(
 	struct log	*log,
-	xfs_lsn_t	push_seq)
+	xfs_lsn_t	sequence)
 {
 	struct xfs_cil		*cil = log->l_cilp;
 	struct xfs_cil_ctx	*ctx;
 	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;
 
-restart:
-	down_write(&cil->xc_ctx_lock);
-	ASSERT(push_seq <= cil->xc_ctx->sequence);
-
-	/* check to see if we need to force out the current context */
-	if (push_seq == cil->xc_ctx->sequence) {
-		up_write(&cil->xc_ctx_lock);
-		xlog_cil_push(log, 1);
-		goto restart;
-	}
+	ASSERT(sequence <= cil->xc_current_sequence);
+
+	/*
+	 * check to see if we need to force out the current context.
+	 * xlog_cil_push() handles racing pushes for the same sequence,
+	 * so no need to deal with it here.
+	 */
+	if (sequence == cil->xc_current_sequence)
+		xlog_cil_push(log, sequence);
 
 	/*
 	 * See if we can find a previous sequence still committing.
-	 * We can drop the flush lock as soon as we have the cil lock
-	 * because we are now only comparing contexts protected by
-	 * the cil lock.
-	 *
 	 * We need to wait for all previous sequence commits to complete
 	 * before allowing the force of push_seq to go ahead. Hence block
 	 * on commits for those as well.
 	 */
+restart:
 	spin_lock(&cil->xc_cil_lock);
-	up_write(&cil->xc_ctx_lock);
 	list_for_each_entry(ctx, &cil->xc_committing, committing) {
-		if (ctx->sequence > push_seq)
+		if (ctx->sequence > sequence)
 			continue;
 		if (!ctx->commit_lsn) {
 			/*
@@ -683,7 +760,7 @@ restart:
 			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
 			goto restart;
 		}
-		if (ctx->sequence != push_seq)
+		if (ctx->sequence != sequence)
 			continue;
 		/* found it! */
 		commit_lsn = ctx->commit_lsn;
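The rename to xlog_cil_force_lsn() matches how the log force path now consumes it, per the xfs_log.c hunks earlier in this series. A condensed sketch of that calling pattern:

	/* in _xfs_log_force_lsn(): push the CIL before waiting on the iclogs */
	if (log->l_cilp) {
		lsn = xlog_cil_force_lsn(log, lsn);
		if (lsn == NULLCOMMITLSN)
			return 0;	/* nothing at that sequence left to force */
	}
	/* ... fall through to the existing iclog wait logic ... */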
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965..edcdfe01617 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,16 +422,17 @@ struct xfs_cil {
 	struct rw_semaphore	xc_ctx_lock;
 	struct list_head	xc_committing;
 	sv_t			xc_commit_wait;
+	xfs_lsn_t		xc_current_sequence;
 };
 
 /*
- * The amount of log space we should the CIL to aggregate is difficult to size.
- * Whatever we chose we have to make we can get a reservation for the log space
- * effectively, that it is large enough to capture sufficient relogging to
- * reduce log buffer IO significantly, but it is not too large for the log or
- * induces too much latency when writing out through the iclogs. We track both
- * space consumed and the number of vectors in the checkpoint context, so we
- * need to decide which to use for limiting.
+ * The amount of log space we allow the CIL to aggregate is difficult to size.
+ * Whatever we choose, we have to make sure we can get a reservation for the
+ * log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, but it is not too large for
+ * the log or induces too much latency when writing out through the iclogs. We
+ * track both space consumed and the number of vectors in the checkpoint
+ * context, so we need to decide which to use for limiting.
  *
  * Every log buffer we write out during a push needs a header reserved, which
  * is at least one sector and more for v2 logs. Hence we need a reservation of
@@ -458,16 +459,21 @@ struct xfs_cil {
 * checkpoint transaction ticket is specific to the checkpoint context, rather
 * than the CIL itself.
 *
- * With dynamic reservations, we can basically make up arbitrary limits for the
- * checkpoint size so long as they don't violate any other size rules. Hence
- * the initial maximum size for the checkpoint transaction will be set to a
- * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
- * right now based on the latency of writing out a large amount of data through
- * the circular iclog buffers.
+ * With dynamic reservations, we can effectively make up arbitrary limits for
+ * the checkpoint size so long as they don't violate any other size rules.
+ * Recovery imposes a rule that no transaction exceed half the log, so we are
+ * limited by that. Furthermore, the log transaction reservation subsystem
+ * tries to keep 25% of the log free, so we need to keep below that limit or we
+ * risk running out of free log space to start any new transactions.
+ *
+ * In order to keep background CIL push efficient, we will set a lower
+ * threshold at which background pushing is attempted without blocking current
+ * transaction commits. A separate, higher bound defines when CIL pushes are
+ * enforced to ensure we stay within our maximum checkpoint size bounds, yet
+ * give us plenty of space for aggregation on large logs.
 */
-
-#define XLOG_CIL_SPACE_LIMIT(log) \
-	(min((log->l_logsize >> 2), (8 * 1024 * 1024)))
+#define XLOG_CIL_SPACE_LIMIT(log)	(log->l_logsize >> 3)
+#define XLOG_CIL_HARD_SPACE_LIMIT(log)	(3 * (log->l_logsize >> 4))
 
 /*
@@ -562,8 +568,16 @@ int	xlog_cil_init(struct log *log);
 void	xlog_cil_init_post_recovery(struct log *log);
 void	xlog_cil_destroy(struct log *log);
 
-int	xlog_cil_push(struct log *log, int push_now);
-xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+/*
+ * CIL force routines
+ */
+xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+
+static inline void
+xlog_cil_force(struct log *log)
+{
+	xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+}
 
 /*
 * Unmount record type is used as a pseudo transaction type for the ticket.
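Both limits are plain shifts of the log size, so the split between them checks out quickly with a concrete figure. A worked sketch using an assumed log size, not one taken from this diff:

	/* illustrative: a 128MB log */
	int	l_logsize = 128 * 1024 * 1024;

	int	soft = l_logsize >> 3;		/* XLOG_CIL_SPACE_LIMIT: 16MB */
	int	hard = 3 * (l_logsize >> 4);	/* XLOG_CIL_HARD_SPACE_LIMIT: 24MB */

	/* background pushes begin at 16MB; commits only block on a push past
	 * 24MB, still well inside the recovery bound of half the log (64MB) */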
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9ac5cfab27b..966d3f97458 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -24,15 +24,11 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_error.h" 28#include "xfs_error.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
@@ -111,7 +107,8 @@ xlog_get_bp(
111 nbblks += log->l_sectBBsize; 107 nbblks += log->l_sectBBsize;
112 nbblks = round_up(nbblks, log->l_sectBBsize); 108 nbblks = round_up(nbblks, log->l_sectBBsize);
113 109
114 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 110 return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
111 BBTOB(nbblks), 0);
115} 112}
116 113
117STATIC void 114STATIC void
@@ -171,7 +168,7 @@ xlog_bread_noalign(
171 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 168 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
172 169
173 xfsbdstrat(log->l_mp, bp); 170 xfsbdstrat(log->l_mp, bp);
174 error = xfs_iowait(bp); 171 error = xfs_buf_iowait(bp);
175 if (error) 172 if (error)
176 xfs_ioerror_alert("xlog_bread", log->l_mp, 173 xfs_ioerror_alert("xlog_bread", log->l_mp,
177 bp, XFS_BUF_ADDR(bp)); 174 bp, XFS_BUF_ADDR(bp));
@@ -325,12 +322,13 @@ xlog_recover_iodone(
325 * this during recovery. One strike! 322 * this during recovery. One strike!
326 */ 323 */
327 xfs_ioerror_alert("xlog_recover_iodone", 324 xfs_ioerror_alert("xlog_recover_iodone",
328 bp->b_mount, bp, XFS_BUF_ADDR(bp)); 325 bp->b_target->bt_mount, bp,
329 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 326 XFS_BUF_ADDR(bp));
327 xfs_force_shutdown(bp->b_target->bt_mount,
328 SHUTDOWN_META_IO_ERROR);
330 } 329 }
331 bp->b_mount = NULL;
332 XFS_BUF_CLR_IODONE_FUNC(bp); 330 XFS_BUF_CLR_IODONE_FUNC(bp);
333 xfs_biodone(bp); 331 xfs_buf_ioend(bp, 0);
334} 332}
335 333
336/* 334/*
@@ -1565,9 +1563,7 @@ xlog_recover_reorder_trans(
1565 1563
1566 list_splice_init(&trans->r_itemq, &sort_list); 1564 list_splice_init(&trans->r_itemq, &sort_list);
1567 list_for_each_entry_safe(item, n, &sort_list, ri_list) { 1565 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1568 xfs_buf_log_format_t *buf_f; 1566 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1569
1570 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
1571 1567
1572 switch (ITEM_TYPE(item)) { 1568 switch (ITEM_TYPE(item)) {
1573 case XFS_LI_BUF: 1569 case XFS_LI_BUF:
@@ -1892,9 +1888,8 @@ xlog_recover_do_inode_buffer(
1892 * current di_next_unlinked field. Extract its value 1888 * current di_next_unlinked field. Extract its value
1893 * and copy it to the buffer copy. 1889 * and copy it to the buffer copy.
1894 */ 1890 */
1895 logged_nextp = (xfs_agino_t *) 1891 logged_nextp = item->ri_buf[item_index].i_addr +
1896 ((char *)(item->ri_buf[item_index].i_addr) + 1892 next_unlinked_offset - reg_buf_offset;
1897 (next_unlinked_offset - reg_buf_offset));
1898 if (unlikely(*logged_nextp == 0)) { 1893 if (unlikely(*logged_nextp == 0)) {
1899 xfs_fs_cmn_err(CE_ALERT, mp, 1894 xfs_fs_cmn_err(CE_ALERT, mp,
1900 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1895 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
@@ -1973,8 +1968,7 @@ xlog_recover_do_reg_buffer(
1973 item->ri_buf[i].i_len, __func__); 1968 item->ri_buf[i].i_len, __func__);
1974 goto next; 1969 goto next;
1975 } 1970 }
1976 error = xfs_qm_dqcheck((xfs_disk_dquot_t *) 1971 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr,
1977 item->ri_buf[i].i_addr,
1978 -1, 0, XFS_QMOPT_DOWARN, 1972 -1, 0, XFS_QMOPT_DOWARN,
1979 "dquot_buf_recover"); 1973 "dquot_buf_recover");
1980 if (error) 1974 if (error)
@@ -2187,7 +2181,7 @@ xlog_recover_do_buffer_trans(
2187 xlog_recover_item_t *item, 2181 xlog_recover_item_t *item,
2188 int pass) 2182 int pass)
2189{ 2183{
2190 xfs_buf_log_format_t *buf_f; 2184 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2191 xfs_mount_t *mp; 2185 xfs_mount_t *mp;
2192 xfs_buf_t *bp; 2186 xfs_buf_t *bp;
2193 int error; 2187 int error;
@@ -2197,8 +2191,6 @@ xlog_recover_do_buffer_trans(
2197 ushort flags; 2191 ushort flags;
2198 uint buf_flags; 2192 uint buf_flags;
2199 2193
2200 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2201
2202 if (pass == XLOG_RECOVER_PASS1) { 2194 if (pass == XLOG_RECOVER_PASS1) {
2203 /* 2195 /*
2204 * In this pass we're only looking for buf items 2196 * In this pass we're only looking for buf items
@@ -2285,8 +2277,7 @@ xlog_recover_do_buffer_trans(
2285 XFS_BUF_STALE(bp); 2277 XFS_BUF_STALE(bp);
2286 error = xfs_bwrite(mp, bp); 2278 error = xfs_bwrite(mp, bp);
2287 } else { 2279 } else {
2288 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2280 ASSERT(bp->b_target->bt_mount == mp);
2289 bp->b_mount = mp;
2290 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2281 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2291 xfs_bdwrite(mp, bp); 2282 xfs_bdwrite(mp, bp);
2292 } 2283 }
@@ -2319,10 +2310,9 @@ xlog_recover_do_inode_trans(
2319 } 2310 }
2320 2311
2321 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2312 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2322 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; 2313 in_f = item->ri_buf[0].i_addr;
2323 } else { 2314 } else {
2324 in_f = (xfs_inode_log_format_t *)kmem_alloc( 2315 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2325 sizeof(xfs_inode_log_format_t), KM_SLEEP);
2326 need_free = 1; 2316 need_free = 1;
2327 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2317 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2328 if (error) 2318 if (error)
@@ -2370,7 +2360,7 @@ xlog_recover_do_inode_trans(
2370 error = EFSCORRUPTED; 2360 error = EFSCORRUPTED;
2371 goto error; 2361 goto error;
2372 } 2362 }
2373 dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr); 2363 dicp = item->ri_buf[1].i_addr;
2374 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2364 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2375 xfs_buf_relse(bp); 2365 xfs_buf_relse(bp);
2376 xfs_fs_cmn_err(CE_ALERT, mp, 2366 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2461,7 +2451,7 @@ xlog_recover_do_inode_trans(
2461 } 2451 }
2462 2452
2463 /* The core is in in-core format */ 2453 /* The core is in in-core format */
2464 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr); 2454 xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
2465 2455
2466 /* the rest is in on-disk format */ 2456 /* the rest is in on-disk format */
2467 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { 2457 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
@@ -2551,8 +2541,7 @@ xlog_recover_do_inode_trans(
2551 } 2541 }
2552 2542
2553write_inode_buffer: 2543write_inode_buffer:
2554 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2544 ASSERT(bp->b_target->bt_mount == mp);
2555 bp->b_mount = mp;
2556 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2545 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2557 xfs_bdwrite(mp, bp); 2546 xfs_bdwrite(mp, bp);
2558error: 2547error:
@@ -2578,7 +2567,7 @@ xlog_recover_do_quotaoff_trans(
2578 return (0); 2567 return (0);
2579 } 2568 }
2580 2569
2581 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; 2570 qoff_f = item->ri_buf[0].i_addr;
2582 ASSERT(qoff_f); 2571 ASSERT(qoff_f);
2583 2572
2584 /* 2573 /*
@@ -2622,9 +2611,8 @@ xlog_recover_do_dquot_trans(
2622 if (mp->m_qflags == 0) 2611 if (mp->m_qflags == 0)
2623 return (0); 2612 return (0);
2624 2613
2625 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; 2614 recddq = item->ri_buf[1].i_addr;
2626 2615 if (recddq == NULL) {
2627 if (item->ri_buf[1].i_addr == NULL) {
2628 cmn_err(CE_ALERT, 2616 cmn_err(CE_ALERT,
2629 "XFS: NULL dquot in %s.", __func__); 2617 "XFS: NULL dquot in %s.", __func__);
2630 return XFS_ERROR(EIO); 2618 return XFS_ERROR(EIO);
@@ -2654,7 +2642,7 @@ xlog_recover_do_dquot_trans(
2654 * The other possibility, of course, is that the quota subsystem was 2642 * The other possibility, of course, is that the quota subsystem was
2655 * removed since the last mount - ENOSYS. 2643 * removed since the last mount - ENOSYS.
2656 */ 2644 */
2657 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; 2645 dq_f = item->ri_buf[0].i_addr;
2658 ASSERT(dq_f); 2646 ASSERT(dq_f);
2659 if ((error = xfs_qm_dqcheck(recddq, 2647 if ((error = xfs_qm_dqcheck(recddq,
2660 dq_f->qlf_id, 2648 dq_f->qlf_id,
@@ -2690,8 +2678,7 @@ xlog_recover_do_dquot_trans(
2690 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2678 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2691 2679
2692 ASSERT(dq_f->qlf_size == 2); 2680 ASSERT(dq_f->qlf_size == 2);
2693 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2681 ASSERT(bp->b_target->bt_mount == mp);
2694 bp->b_mount = mp;
2695 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2682 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2696 xfs_bdwrite(mp, bp); 2683 xfs_bdwrite(mp, bp);
2697 2684
@@ -2721,7 +2708,7 @@ xlog_recover_do_efi_trans(
2721 return 0; 2708 return 0;
2722 } 2709 }
2723 2710
2724 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; 2711 efi_formatp = item->ri_buf[0].i_addr;
2725 2712
2726 mp = log->l_mp; 2713 mp = log->l_mp;
2727 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2714 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
@@ -2767,7 +2754,7 @@ xlog_recover_do_efd_trans(
2767 return; 2754 return;
2768 } 2755 }
2769 2756
2770 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; 2757 efd_formatp = item->ri_buf[0].i_addr;
2771 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2758 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2772 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2759 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2773 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + 2760 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
@@ -3829,7 +3816,7 @@ xlog_do_recover(
3829 XFS_BUF_READ(bp); 3816 XFS_BUF_READ(bp);
3830 XFS_BUF_UNASYNC(bp); 3817 XFS_BUF_UNASYNC(bp);
3831 xfsbdstrat(log->l_mp, bp); 3818 xfsbdstrat(log->l_mp, bp);
3832 error = xfs_iowait(bp); 3819 error = xfs_buf_iowait(bp);
3833 if (error) { 3820 if (error) {
3834 xfs_ioerror_alert("xlog_do_recover", 3821 xfs_ioerror_alert("xlog_do_recover",
3835 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3822 log->l_mp, bp, XFS_BUF_ADDR(bp));
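The cast removals repeated throughout this file rest on i_addr having been
retyped from a character pointer to void * earlier in the series (an
assumption from context, since the new assignments only compile that way).
A void * converts implicitly to any object pointer type in C, and the offset
arithmetic seen in xlog_recover_do_inode_buffer relies on GCC treating
void * arithmetic like char * arithmetic. A minimal userspace sketch of both
idioms:

    #include <stdlib.h>

    struct iovec_like {
            void    *i_addr;        /* was char *, which forced a cast at every use */
            int     i_len;
    };

    int main(void)
    {
            struct iovec_like v = { malloc(64), 64 };
            int *fmt = v.i_addr;            /* implicit conversion, no cast */
            void *at8 = v.i_addr + 8;       /* GNU C: void * arithmetic */

            (void)fmt; (void)at8;
            free(v.i_addr);
            return 0;
    }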
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 69f62d8b281..b1498ab5a39 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -25,13 +25,10 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 32#include "xfs_dinode.h"
36#include "xfs_inode.h" 33#include "xfs_inode.h"
37#include "xfs_btree.h" 34#include "xfs_btree.h"
@@ -55,16 +52,11 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
55 int); 52 int);
56STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, 53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
57 int); 54 int);
58STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
59 int64_t, int);
60STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); 55STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
61
62#else 56#else
63 57
64#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) 58#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
65#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) 59#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
66#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0)
67
68#endif 60#endif
69 61
70static const struct { 62static const struct {
@@ -202,6 +194,8 @@ xfs_uuid_unmount(
202 194
203/* 195/*
204 * Reference counting access wrappers to the perag structures. 196 * Reference counting access wrappers to the perag structures.
197 * Because we never free per-ag structures, the only thing we
198 * have to protect against is changes to the tree structure itself.
205 */ 199 */
206struct xfs_perag * 200struct xfs_perag *
207xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) 201xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -209,19 +203,43 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
209 struct xfs_perag *pag; 203 struct xfs_perag *pag;
210 int ref = 0; 204 int ref = 0;
211 205
212 spin_lock(&mp->m_perag_lock); 206 rcu_read_lock();
213 pag = radix_tree_lookup(&mp->m_perag_tree, agno); 207 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
214 if (pag) { 208 if (pag) {
215 ASSERT(atomic_read(&pag->pag_ref) >= 0); 209 ASSERT(atomic_read(&pag->pag_ref) >= 0);
216 /* catch leaks in the positive direction during testing */
217 ASSERT(atomic_read(&pag->pag_ref) < 1000);
218 ref = atomic_inc_return(&pag->pag_ref); 210 ref = atomic_inc_return(&pag->pag_ref);
219 } 211 }
220 spin_unlock(&mp->m_perag_lock); 212 rcu_read_unlock();
221 trace_xfs_perag_get(mp, agno, ref, _RET_IP_); 213 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
222 return pag; 214 return pag;
223} 215}
224 216
217/*
218 * Search from @first to find the next perag with the given tag set.
219 */
220struct xfs_perag *
221xfs_perag_get_tag(
222 struct xfs_mount *mp,
223 xfs_agnumber_t first,
224 int tag)
225{
226 struct xfs_perag *pag;
227 int found;
228 int ref;
229
230 rcu_read_lock();
231 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
232 (void **)&pag, first, 1, tag);
233 if (found <= 0) {
234 rcu_read_unlock();
235 return NULL;
236 }
237 ref = atomic_inc_return(&pag->pag_ref);
238 rcu_read_unlock();
239 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
240 return pag;
241}
242
225void 243void
226xfs_perag_put(struct xfs_perag *pag) 244xfs_perag_put(struct xfs_perag *pag)
227{ 245{
@@ -232,10 +250,18 @@ xfs_perag_put(struct xfs_perag *pag)
232 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); 250 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
233} 251}
234 252
253STATIC void
254__xfs_free_perag(
255 struct rcu_head *head)
256{
257 struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
258
259 ASSERT(atomic_read(&pag->pag_ref) == 0);
260 kmem_free(pag);
261}
262
235/* 263/*
236 * Free up the resources associated with a mount structure. Assume that 264 * Free up the per-ag resources associated with the mount structure.
237 * the structure was initially zeroed, so we can tell which fields got
238 * initialized.
239 */ 265 */
240STATIC void 266STATIC void
241xfs_free_perag( 267xfs_free_perag(
@@ -247,10 +273,9 @@ xfs_free_perag(
247 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 273 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
248 spin_lock(&mp->m_perag_lock); 274 spin_lock(&mp->m_perag_lock);
249 pag = radix_tree_delete(&mp->m_perag_tree, agno); 275 pag = radix_tree_delete(&mp->m_perag_tree, agno);
250 ASSERT(pag);
251 ASSERT(atomic_read(&pag->pag_ref) == 0);
252 spin_unlock(&mp->m_perag_lock); 276 spin_unlock(&mp->m_perag_lock);
253 kmem_free(pag); 277 ASSERT(pag);
278 call_rcu(&pag->rcu_head, __xfs_free_perag);
254 } 279 }
255} 280}
256 281
@@ -447,7 +472,10 @@ xfs_initialize_perag(
447 pag->pag_agno = index; 472 pag->pag_agno = index;
448 pag->pag_mount = mp; 473 pag->pag_mount = mp;
449 rwlock_init(&pag->pag_ici_lock); 474 rwlock_init(&pag->pag_ici_lock);
475 mutex_init(&pag->pag_ici_reclaim_lock);
450 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 476 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
477 spin_lock_init(&pag->pag_buf_lock);
478 pag->pag_buf_tree = RB_ROOT;
451 479
452 if (radix_tree_preload(GFP_NOFS)) 480 if (radix_tree_preload(GFP_NOFS))
453 goto out_unwind; 481 goto out_unwind;
@@ -642,7 +670,6 @@ int
642xfs_readsb(xfs_mount_t *mp, int flags) 670xfs_readsb(xfs_mount_t *mp, int flags)
643{ 671{
644 unsigned int sector_size; 672 unsigned int sector_size;
645 unsigned int extra_flags;
646 xfs_buf_t *bp; 673 xfs_buf_t *bp;
647 int error; 674 int error;
648 675
@@ -655,28 +682,24 @@ xfs_readsb(xfs_mount_t *mp, int flags)
655 * access to the superblock. 682 * access to the superblock.
656 */ 683 */
657 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 684 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
658 extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
659 685
660 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 686reread:
661 extra_flags); 687 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
662 if (!bp || XFS_BUF_ISERROR(bp)) { 688 XFS_SB_DADDR, sector_size, 0);
663 xfs_fs_mount_cmn_err(flags, "SB read failed"); 689 if (!bp) {
664 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; 690 xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
665 goto fail; 691 return EIO;
666 } 692 }
667 ASSERT(XFS_BUF_ISBUSY(bp));
668 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
669 693
670 /* 694 /*
671 * Initialize the mount structure from the superblock. 695 * Initialize the mount structure from the superblock.
672 * But first do some basic consistency checking. 696 * But first do some basic consistency checking.
673 */ 697 */
674 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 698 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
675
676 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 699 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
677 if (error) { 700 if (error) {
678 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 701 xfs_fs_mount_cmn_err(flags, "SB validate failed");
679 goto fail; 702 goto release_buf;
680 } 703 }
681 704
682 /* 705 /*
@@ -687,7 +710,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
687 "device supports only %u byte sectors (not %u)", 710 "device supports only %u byte sectors (not %u)",
688 sector_size, mp->m_sb.sb_sectsize); 711 sector_size, mp->m_sb.sb_sectsize);
689 error = ENOSYS; 712 error = ENOSYS;
690 goto fail; 713 goto release_buf;
691 } 714 }
692 715
693 /* 716 /*
@@ -695,33 +718,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
695 * re-read the superblock so the buffer is correctly sized. 718 * re-read the superblock so the buffer is correctly sized.
696 */ 719 */
697 if (sector_size < mp->m_sb.sb_sectsize) { 720 if (sector_size < mp->m_sb.sb_sectsize) {
698 XFS_BUF_UNMANAGE(bp);
699 xfs_buf_relse(bp); 721 xfs_buf_relse(bp);
700 sector_size = mp->m_sb.sb_sectsize; 722 sector_size = mp->m_sb.sb_sectsize;
701 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, 723 goto reread;
702 BTOBB(sector_size), extra_flags);
703 if (!bp || XFS_BUF_ISERROR(bp)) {
704 xfs_fs_mount_cmn_err(flags, "SB re-read failed");
705 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
706 goto fail;
707 }
708 ASSERT(XFS_BUF_ISBUSY(bp));
709 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
710 } 724 }
711 725
712 /* Initialize per-cpu counters */ 726 /* Initialize per-cpu counters */
713 xfs_icsb_reinit_counters(mp); 727 xfs_icsb_reinit_counters(mp);
714 728
715 mp->m_sb_bp = bp; 729 mp->m_sb_bp = bp;
716 xfs_buf_relse(bp); 730 xfs_buf_unlock(bp);
717 ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
718 return 0; 731 return 0;
719 732
720 fail: 733release_buf:
721 if (bp) { 734 xfs_buf_relse(bp);
722 XFS_BUF_UNMANAGE(bp);
723 xfs_buf_relse(bp);
724 }
725 return error; 735 return error;
726} 736}
727 737
@@ -994,42 +1004,35 @@ xfs_check_sizes(xfs_mount_t *mp)
994{ 1004{
995 xfs_buf_t *bp; 1005 xfs_buf_t *bp;
996 xfs_daddr_t d; 1006 xfs_daddr_t d;
997 int error;
998 1007
999 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1008 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1000 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1009 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1001 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1010 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
1002 return XFS_ERROR(EFBIG); 1011 return XFS_ERROR(EFBIG);
1003 } 1012 }
1004 error = xfs_read_buf(mp, mp->m_ddev_targp, 1013 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1005 d - XFS_FSS_TO_BB(mp, 1), 1014 d - XFS_FSS_TO_BB(mp, 1),
1006 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1015 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1007 if (!error) { 1016 if (!bp) {
1008 xfs_buf_relse(bp); 1017 cmn_err(CE_WARN, "XFS: last sector read failed");
1009 } else { 1018 return EIO;
1010 cmn_err(CE_WARN, "XFS: size check 2 failed");
1011 if (error == ENOSPC)
1012 error = XFS_ERROR(EFBIG);
1013 return error;
1014 } 1019 }
1020 xfs_buf_relse(bp);
1015 1021
1016 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1022 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1017 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1023 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1018 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1024 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1019 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1025 cmn_err(CE_WARN, "XFS: log size mismatch detected");
1020 return XFS_ERROR(EFBIG); 1026 return XFS_ERROR(EFBIG);
1021 } 1027 }
1022 error = xfs_read_buf(mp, mp->m_logdev_targp, 1028 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1023 d - XFS_FSB_TO_BB(mp, 1), 1029 d - XFS_FSB_TO_BB(mp, 1),
1024 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1030 XFS_FSB_TO_B(mp, 1), 0);
1025 if (!error) { 1031 if (!bp) {
1026 xfs_buf_relse(bp); 1032 cmn_err(CE_WARN, "XFS: log device read failed");
1027 } else { 1033 return EIO;
1028 cmn_err(CE_WARN, "XFS: size check 3 failed");
1029 if (error == ENOSPC)
1030 error = XFS_ERROR(EFBIG);
1031 return error;
1032 } 1034 }
1035 xfs_buf_relse(bp);
1033 } 1036 }
1034 return 0; 1037 return 0;
1035} 1038}
@@ -1604,7 +1607,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1604 XFS_BUF_UNASYNC(sbp); 1607 XFS_BUF_UNASYNC(sbp);
1605 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); 1608 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1606 xfsbdstrat(mp, sbp); 1609 xfsbdstrat(mp, sbp);
1607 error = xfs_iowait(sbp); 1610 error = xfs_buf_iowait(sbp);
1608 if (error) 1611 if (error)
1609 xfs_ioerror_alert("xfs_unmountfs_writesb", 1612 xfs_ioerror_alert("xfs_unmountfs_writesb",
1610 mp, sbp, XFS_BUF_ADDR(sbp)); 1613 mp, sbp, XFS_BUF_ADDR(sbp));
@@ -1835,135 +1838,72 @@ xfs_mod_incore_sb_unlocked(
1835 */ 1838 */
1836int 1839int
1837xfs_mod_incore_sb( 1840xfs_mod_incore_sb(
1838 xfs_mount_t *mp, 1841 struct xfs_mount *mp,
1839 xfs_sb_field_t field, 1842 xfs_sb_field_t field,
1840 int64_t delta, 1843 int64_t delta,
1841 int rsvd) 1844 int rsvd)
1842{ 1845{
1843 int status; 1846 int status;
1844 1847
1845 /* check for per-cpu counters */
1846 switch (field) {
1847#ifdef HAVE_PERCPU_SB 1848#ifdef HAVE_PERCPU_SB
1848 case XFS_SBS_ICOUNT: 1849 ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
1849 case XFS_SBS_IFREE:
1850 case XFS_SBS_FDBLOCKS:
1851 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1852 status = xfs_icsb_modify_counters(mp, field,
1853 delta, rsvd);
1854 break;
1855 }
1856 /* FALLTHROUGH */
1857#endif 1850#endif
1858 default: 1851 spin_lock(&mp->m_sb_lock);
1859 spin_lock(&mp->m_sb_lock); 1852 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1860 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1853 spin_unlock(&mp->m_sb_lock);
1861 spin_unlock(&mp->m_sb_lock);
1862 break;
1863 }
1864 1854
1865 return status; 1855 return status;
1866} 1856}
1867 1857
1868/* 1858/*
1869 * xfs_mod_incore_sb_batch() is used to change more than one field 1859 * Change more than one field in the in-core superblock structure at a time.
1870 * in the in-core superblock structure at a time. This modification
1871 * is protected by a lock internal to this module. The fields and
1872 * changes to those fields are specified in the array of xfs_mod_sb
1873 * structures passed in.
1874 * 1860 *
1875 * Either all of the specified deltas will be applied or none of 1861 * The fields and changes to those fields are specified in the array of
1876 * them will. If any modified field dips below 0, then all modifications 1862 * xfs_mod_sb structures passed in. Either all of the specified deltas
1877 * will be backed out and EINVAL will be returned. 1863 * will be applied or none of them will. If any modified field dips below 0,
1864 * then all modifications will be backed out and EINVAL will be returned.
1865 *
1866 * Note that this function may not be used for the superblock values that
1867 * are tracked with the in-memory per-cpu counters - a direct call to
1868 * xfs_icsb_modify_counters is required for these.
1878 */ 1869 */
1879int 1870int
1880xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) 1871xfs_mod_incore_sb_batch(
1872 struct xfs_mount *mp,
1873 xfs_mod_sb_t *msb,
1874 uint nmsb,
1875 int rsvd)
1881{ 1876{
1882 int status=0; 1877 xfs_mod_sb_t *msbp = &msb[0];
1883 xfs_mod_sb_t *msbp; 1878 int error = 0;
1884 1879
1885 /* 1880 /*
1886 * Loop through the array of mod structures and apply each 1881 * Loop through the array of mod structures and apply each individually.
1887 * individually. If any fail, then back out all those 1882 * If any fail, then back out all those which have already been applied.
1888 * which have already been applied. Do all of this within 1883 * Do all of this within the scope of the m_sb_lock so that all of the
1889 * the scope of the m_sb_lock so that all of the changes will 1884 * changes will be atomic.
1890 * be atomic.
1891 */ 1885 */
1892 spin_lock(&mp->m_sb_lock); 1886 spin_lock(&mp->m_sb_lock);
1893 msbp = &msb[0];
1894 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { 1887 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
1895 /* 1888 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1896 * Apply the delta at index n. If it fails, break 1889 msbp->msb_field > XFS_SBS_FDBLOCKS);
1897 * from the loop so we'll fall into the undo loop
1898 * below.
1899 */
1900 switch (msbp->msb_field) {
1901#ifdef HAVE_PERCPU_SB
1902 case XFS_SBS_ICOUNT:
1903 case XFS_SBS_IFREE:
1904 case XFS_SBS_FDBLOCKS:
1905 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1906 spin_unlock(&mp->m_sb_lock);
1907 status = xfs_icsb_modify_counters(mp,
1908 msbp->msb_field,
1909 msbp->msb_delta, rsvd);
1910 spin_lock(&mp->m_sb_lock);
1911 break;
1912 }
1913 /* FALLTHROUGH */
1914#endif
1915 default:
1916 status = xfs_mod_incore_sb_unlocked(mp,
1917 msbp->msb_field,
1918 msbp->msb_delta, rsvd);
1919 break;
1920 }
1921 1890
1922 if (status != 0) { 1891 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1923 break; 1892 msbp->msb_delta, rsvd);
1924 } 1893 if (error)
1894 goto unwind;
1925 } 1895 }
1896 spin_unlock(&mp->m_sb_lock);
1897 return 0;
1926 1898
1927 /* 1899unwind:
1928 * If we didn't complete the loop above, then back out 1900 while (--msbp >= msb) {
1929 * any changes made to the superblock. If you add code 1901 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1930 * between the loop above and here, make sure that you 1902 -msbp->msb_delta, rsvd);
1931 * preserve the value of status. Loop back until 1903 ASSERT(error == 0);
1932 * we step below the beginning of the array. Make sure
1933 * we don't touch anything back there.
1934 */
1935 if (status != 0) {
1936 msbp--;
1937 while (msbp >= msb) {
1938 switch (msbp->msb_field) {
1939#ifdef HAVE_PERCPU_SB
1940 case XFS_SBS_ICOUNT:
1941 case XFS_SBS_IFREE:
1942 case XFS_SBS_FDBLOCKS:
1943 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1944 spin_unlock(&mp->m_sb_lock);
1945 status = xfs_icsb_modify_counters(mp,
1946 msbp->msb_field,
1947 -(msbp->msb_delta),
1948 rsvd);
1949 spin_lock(&mp->m_sb_lock);
1950 break;
1951 }
1952 /* FALLTHROUGH */
1953#endif
1954 default:
1955 status = xfs_mod_incore_sb_unlocked(mp,
1956 msbp->msb_field,
1957 -(msbp->msb_delta),
1958 rsvd);
1959 break;
1960 }
1961 ASSERT(status == 0);
1962 msbp--;
1963 }
1964 } 1904 }
1965 spin_unlock(&mp->m_sb_lock); 1905 spin_unlock(&mp->m_sb_lock);
1966 return status; 1906 return error;
1967} 1907}
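The simplified batch loop keeps the original all-or-nothing contract: apply
each delta in order and, if one fails, iterate backwards over the entries
already applied, negating them. A self-contained sketch of that shape:

    #include <stdio.h>

    static long counter;

    static int apply_delta(long delta)
    {
            if (counter + delta < 0)
                    return -1;              /* would dip below zero: reject */
            counter += delta;
            return 0;
    }

    int main(void)
    {
            long deltas[] = { 5, -3, -10 }; /* the last entry must fail */
            int i, n = 3;

            for (i = 0; i < n; i++)
                    if (apply_delta(deltas[i]))
                            goto unwind;
            return 0;

    unwind:
            while (--i >= 0)                /* back out in reverse order */
                    apply_delta(-deltas[i]);
            printf("counter restored to %ld\n", counter);   /* 0 */
            return 1;
    }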
1968 1908
1969/* 1909/*
@@ -2001,18 +1941,13 @@ xfs_getsb(
2001 */ 1941 */
2002void 1942void
2003xfs_freesb( 1943xfs_freesb(
2004 xfs_mount_t *mp) 1944 struct xfs_mount *mp)
2005{ 1945{
2006 xfs_buf_t *bp; 1946 struct xfs_buf *bp = mp->m_sb_bp;
2007 1947
2008 /* 1948 xfs_buf_lock(bp);
2009 * Use xfs_getsb() so that the buffer will be locked
2010 * when we call xfs_buf_relse().
2011 */
2012 bp = xfs_getsb(mp, 0);
2013 XFS_BUF_UNMANAGE(bp);
2014 xfs_buf_relse(bp);
2015 mp->m_sb_bp = NULL; 1949 mp->m_sb_bp = NULL;
1950 xfs_buf_relse(bp);
2016} 1951}
2017 1952
2018/* 1953/*
@@ -2499,7 +2434,7 @@ xfs_icsb_balance_counter(
2499 spin_unlock(&mp->m_sb_lock); 2434 spin_unlock(&mp->m_sb_lock);
2500} 2435}
2501 2436
2502STATIC int 2437int
2503xfs_icsb_modify_counters( 2438xfs_icsb_modify_counters(
2504 xfs_mount_t *mp, 2439 xfs_mount_t *mp,
2505 xfs_sb_field_t field, 2440 xfs_sb_field_t field,
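The perag lookup conversion above is the classic RCU read-side pattern: the
tree walk runs under rcu_read_lock(), the object is pinned with an atomic
reference once found, and freeing is deferred through call_rcu() so a
concurrent walker can never touch freed memory between lookup and pin. RCU
itself is a kernel facility, so the sketch below only mimics the shape with
C11 atomics and marks where the RCU calls would sit:

    #include <stdatomic.h>
    #include <stdio.h>

    struct perag_like {
            atomic_int      ref;
            int             agno;
    };

    static struct perag_like ag0 = { 0, 0 };
    static struct perag_like *tree[1] = { &ag0 };   /* stand-in for the radix tree */

    static struct perag_like *get(int agno)
    {
            /* kernel: rcu_read_lock() -- keeps tree nodes alive for the walk */
            struct perag_like *pag = tree[agno];
            if (pag)
                    atomic_fetch_add(&pag->ref, 1); /* pin the object itself */
            /* kernel: rcu_read_unlock(); frees go via call_rcu(), cf.
             * __xfs_free_perag, so the window between walk and pin is safe */
            return pag;
    }

    int main(void)
    {
            struct perag_like *pag = get(0);
            printf("agno %d ref %d\n", pag->agno, atomic_load(&pag->ref));
            atomic_fetch_sub(&pag->ref, 1);         /* cf. xfs_perag_put */
            return 0;
    }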
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5761087ee8e..5861b498074 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations {
53 53
54#include "xfs_sync.h" 54#include "xfs_sync.h"
55 55
56struct cred;
57struct log; 56struct log;
58struct xfs_mount_args; 57struct xfs_mount_args;
59struct xfs_inode; 58struct xfs_inode;
@@ -66,65 +65,6 @@ struct xfs_nameops;
66struct xfs_ail; 65struct xfs_ail;
67struct xfs_quotainfo; 66struct xfs_quotainfo;
68 67
69
70/*
71 * Prototypes and functions for the Data Migration subsystem.
72 */
73
74typedef int (*xfs_send_data_t)(int, struct xfs_inode *,
75 xfs_off_t, size_t, int, int *);
76typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
77typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
78typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
79 struct xfs_inode *, dm_right_t,
80 struct xfs_inode *, dm_right_t,
81 const unsigned char *, const unsigned char *,
82 mode_t, int, int);
83typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
84 char *, char *);
85typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
86 dm_right_t, mode_t, int, int);
87
88typedef struct xfs_dmops {
89 xfs_send_data_t xfs_send_data;
90 xfs_send_mmap_t xfs_send_mmap;
91 xfs_send_destroy_t xfs_send_destroy;
92 xfs_send_namesp_t xfs_send_namesp;
93 xfs_send_mount_t xfs_send_mount;
94 xfs_send_unmount_t xfs_send_unmount;
95} xfs_dmops_t;
96
97#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \
98 (((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED)
99
100#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
101 (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
102#define XFS_SEND_MMAP(mp, vma,fl) \
103 (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
104#define XFS_SEND_DESTROY(mp, ip,right) \
105 (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
106#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
107 (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
108#define XFS_SEND_MOUNT(mp,right,path,name) \
109 (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
110#define XFS_SEND_PREUNMOUNT(mp) \
111do { \
112 if (mp->m_flags & XFS_MOUNT_DMAPI) { \
113 (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \
114 (mp)->m_rootip, DM_RIGHT_NULL, \
115 (mp)->m_rootip, DM_RIGHT_NULL, \
116 NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
117 } \
118} while (0)
119#define XFS_SEND_UNMOUNT(mp) \
120do { \
121 if (mp->m_flags & XFS_MOUNT_DMAPI) { \
122 (*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \
123 DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
124 } \
125} while (0)
126
127
128#ifdef HAVE_PERCPU_SB 68#ifdef HAVE_PERCPU_SB
129 69
130/* 70/*
@@ -150,6 +90,8 @@ extern void xfs_icsb_reinit_counters(struct xfs_mount *);
150extern void xfs_icsb_destroy_counters(struct xfs_mount *); 90extern void xfs_icsb_destroy_counters(struct xfs_mount *);
151extern void xfs_icsb_sync_counters(struct xfs_mount *, int); 91extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
152extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); 92extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
93extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
94 int64_t, int);
153 95
154#else 96#else
155#define xfs_icsb_init_counters(mp) (0) 97#define xfs_icsb_init_counters(mp) (0)
@@ -157,6 +99,8 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
157#define xfs_icsb_reinit_counters(mp) do { } while (0) 99#define xfs_icsb_reinit_counters(mp) do { } while (0)
158#define xfs_icsb_sync_counters(mp, flags) do { } while (0) 100#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
159#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 101#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
102#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
103 xfs_mod_incore_sb(mp, field, delta, rsvd)
160#endif 104#endif
161 105
162typedef struct xfs_mount { 106typedef struct xfs_mount {
@@ -241,8 +185,6 @@ typedef struct xfs_mount {
241 uint m_chsize; /* size of next field */ 185 uint m_chsize; /* size of next field */
242 struct xfs_chash *m_chash; /* fs private inode per-cluster 186 struct xfs_chash *m_chash; /* fs private inode per-cluster
243 * hash table */ 187 * hash table */
244 struct xfs_dmops *m_dm_ops; /* vector of DMI ops */
245 struct xfs_qmops *m_qm_ops; /* vector of XQM ops */
246 atomic_t m_active_trans; /* number trans frozen */ 188 atomic_t m_active_trans; /* number trans frozen */
247#ifdef HAVE_PERCPU_SB 189#ifdef HAVE_PERCPU_SB
248 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ 190 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -269,7 +211,6 @@ typedef struct xfs_mount {
269 must be synchronous except 211 must be synchronous except
270 for space allocations */ 212 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ 213#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 214#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 215#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
275 operations, typically for 216 operations, typically for
@@ -282,8 +223,6 @@ typedef struct xfs_mount {
282#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ 223#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
283#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ 224#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
284#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ 225#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
285#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */
286 /* osyncisdsync is now default*/
287#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above 226#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above
288 * 32 bits in size */ 227 * 32 bits in size */
289#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* user wants 32bit inodes */ 228#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* user wants 32bit inodes */
@@ -296,8 +235,6 @@ typedef struct xfs_mount {
296#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ 235#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
297#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred 236#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
298 * I/O size in stat() */ 237 * I/O size in stat() */
299#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock
300 counters */
301#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams 238#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
302 allocator */ 239 allocator */
303#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ 240#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
@@ -391,6 +328,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
391 * perag get/put wrappers for ref counting 328 * perag get/put wrappers for ref counting
392 */ 329 */
393struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); 330struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
331struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
332 int tag);
394void xfs_perag_put(struct xfs_perag *pag); 333void xfs_perag_put(struct xfs_perag *pag);
395 334
396/* 335/*
@@ -440,11 +379,6 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
440 379
441extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 380extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
442 381
443extern int xfs_dmops_get(struct xfs_mount *);
444extern void xfs_dmops_put(struct xfs_mount *);
445
446extern struct xfs_dmops xfs_dmcore_xfs;
447
448#endif /* __KERNEL__ */ 382#endif /* __KERNEL__ */
449 383
450extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 384extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
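With the DMAPI vectors gone, the other notable change in this header is that
xfs_icsb_modify_counters becomes the one name callers use for the hot
counters: on HAVE_PERCPU_SB builds it is the real per-cpu routine, and
otherwise the macro quietly degrades it to the locked xfs_mod_incore_sb
path. A tiny sketch of that compile-time dispatch idiom (all names here are
invented):

    #include <stdio.h>

    /* #define HAVE_PERCPU_SB 1 -- flip to select the fast path */

    #ifdef HAVE_PERCPU_SB
    static int percpu_modify(int field, long delta)
    { printf("per-cpu path\n"); return 0; }
    #define modify_counter(field, delta)    percpu_modify(field, delta)
    #else
    static int locked_modify(int field, long delta)
    { printf("locked fallback\n"); return 0; }
    #define modify_counter(field, delta)    locked_modify(field, delta)
    #endif

    int main(void)
    {
            return modify_counter(0, -8);   /* callers never see the difference */
    }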
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
deleted file mode 100644
index 2dec79edb51..00000000000
--- a/fs/xfs/xfs_refcache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_REFCACHE_H__
19#define __XFS_REFCACHE_H__
20
21#ifdef HAVE_REFCACHE
22/*
23 * Maximum size (in inodes) for the NFS reference cache
24 */
25#define XFS_REFCACHE_SIZE_MAX 512
26
27struct xfs_inode;
28struct xfs_mount;
29
30extern void xfs_refcache_insert(struct xfs_inode *);
31extern void xfs_refcache_purge_ip(struct xfs_inode *);
32extern void xfs_refcache_purge_mp(struct xfs_mount *);
33extern void xfs_refcache_purge_some(struct xfs_mount *);
34extern void xfs_refcache_resize(int);
35extern void xfs_refcache_destroy(void);
36
37extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
38
39#else
40
41#define xfs_refcache_insert(ip) do { } while (0)
42#define xfs_refcache_purge_ip(ip) do { } while (0)
43#define xfs_refcache_purge_mp(mp) do { } while (0)
44#define xfs_refcache_purge_some(mp) do { } while (0)
45#define xfs_refcache_resize(size) do { } while (0)
46#define xfs_refcache_destroy() do { } while (0)
47
48#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags)
49
50#endif
51
52#endif /* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index fc1cda23b81..d2af0a8381a 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -24,12 +24,9 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 30#include "xfs_dinode.h"
34#include "xfs_inode.h" 31#include "xfs_inode.h"
35#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
@@ -116,20 +113,7 @@ xfs_rename(
116 int spaceres; 113 int spaceres;
117 int num_inodes; 114 int num_inodes;
118 115
119 xfs_itrace_entry(src_dp); 116 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
120 xfs_itrace_entry(target_dp);
121
122 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
123 DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
124 error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
125 src_dp, DM_RIGHT_NULL,
126 target_dp, DM_RIGHT_NULL,
127 src_name->name, target_name->name,
128 0, 0, 0);
129 if (error)
130 return error;
131 }
132 /* Return through std_return after this point. */
133 117
134 new_parent = (src_dp != target_dp); 118 new_parent = (src_dp != target_dp);
135 src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); 119 src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
@@ -184,26 +168,14 @@ xfs_rename(
184 /* 168 /*
185 * Join all the inodes to the transaction. From this point on, 169 * Join all the inodes to the transaction. From this point on,
186 * we can rely on either trans_commit or trans_cancel to unlock 170 * we can rely on either trans_commit or trans_cancel to unlock
187 * them. Note that we need to add a vnode reference to the 171 * them.
188 * directories since trans_commit & trans_cancel will decrement
189 * them when they unlock the inodes. Also, we need to be careful
190 * not to add an inode to the transaction more than once.
191 */ 172 */
192 IHOLD(src_dp); 173 xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
193 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 174 if (new_parent)
194 175 xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL);
195 if (new_parent) { 176 xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
196 IHOLD(target_dp); 177 if (target_ip)
197 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 178 xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL);
198 }
199
200 IHOLD(src_ip);
201 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
202
203 if (target_ip) {
204 IHOLD(target_ip);
205 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
206 }
207 179
208 /* 180 /*
209 * If we are using project inheritance, we only allow renames 181 * If we are using project inheritance, we only allow renames
@@ -211,7 +183,7 @@ xfs_rename(
211 * tree quota mechanism would be circumvented. 183 * tree quota mechanism would be circumvented.
212 */ 184 */
213 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
214 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { 186 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
215 error = XFS_ERROR(EXDEV); 187 error = XFS_ERROR(EXDEV);
216 goto error_return; 188 goto error_return;
217 } 189 }
@@ -239,7 +211,9 @@ xfs_rename(
239 goto error_return; 211 goto error_return;
240 if (error) 212 if (error)
241 goto abort_return; 213 goto abort_return;
242 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 214
215 xfs_trans_ichgtime(tp, target_dp,
216 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
243 217
244 if (new_parent && src_is_directory) { 218 if (new_parent && src_is_directory) {
245 error = xfs_bumplink(tp, target_dp); 219 error = xfs_bumplink(tp, target_dp);
@@ -277,7 +251,9 @@ xfs_rename(
277 &first_block, &free_list, spaceres); 251 &first_block, &free_list, spaceres);
278 if (error) 252 if (error)
279 goto abort_return; 253 goto abort_return;
280 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 254
255 xfs_trans_ichgtime(tp, target_dp,
256 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
281 257
282 /* 258 /*
283 * Decrement the link count on the target since the target 259 * Decrement the link count on the target since the target
@@ -320,7 +296,7 @@ xfs_rename(
320 * inode isn't really being changed, but old unix file systems did 296 * inode isn't really being changed, but old unix file systems did
321 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
322 */ 298 */
323 xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
324 300
325 /* 301 /*
326 * Adjust the link count on src_dp. This is necessary when 302 * Adjust the link count on src_dp. This is necessary when
@@ -343,7 +319,7 @@ xfs_rename(
343 if (error) 319 if (error)
344 goto abort_return; 320 goto abort_return;
345 321
346 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 322 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
347 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 323 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
348 if (new_parent) 324 if (new_parent)
349 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 325 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
@@ -369,26 +345,13 @@ xfs_rename(
369 * trans_commit will unlock src_ip, target_ip & decrement 345 * trans_commit will unlock src_ip, target_ip & decrement
370 * the vnode references. 346 * the vnode references.
371 */ 347 */
372 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 348 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
373
374 /* Fall through to std_return with error = 0 or errno from
375 * xfs_trans_commit */
376std_return:
377 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
378 DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
379 (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
380 src_dp, DM_RIGHT_NULL,
381 target_dp, DM_RIGHT_NULL,
382 src_name->name, target_name->name,
383 0, error, 0);
384 }
385 return error;
386 349
387 abort_return: 350 abort_return:
388 cancel_flags |= XFS_TRANS_ABORT; 351 cancel_flags |= XFS_TRANS_ABORT;
389 /* FALLTHROUGH */
390 error_return: 352 error_return:
391 xfs_bmap_cancel(&free_list); 353 xfs_bmap_cancel(&free_list);
392 xfs_trans_cancel(tp, cancel_flags); 354 xfs_trans_cancel(tp, cancel_flags);
393 goto std_return; 355 std_return:
356 return error;
394} 357}
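The rename cleanup swaps every IHOLD-then-xfs_trans_ijoin pair for a single
xfs_trans_ijoin_ref() call: the reference and the join travel together, and
trans_commit/trans_cancel drop both when they unlock the inode. A toy
analogue of why the combined call is harder to misuse (all names invented):

    #include <assert.h>

    struct inode_like { int ref; int joined; };

    static void ijoin_ref(struct inode_like *ip)
    {
            ip->ref++;              /* reference taken... */
            ip->joined = 1;         /* ...and joined, in one call */
    }

    static void trans_commit(struct inode_like *ip)
    {
            ip->joined = 0;
            ip->ref--;              /* commit (or cancel) drops it for us */
    }

    int main(void)
    {
            struct inode_like src_dp = { 1, 0 };

            ijoin_ref(&src_dp);     /* no separate hold to forget */
            trans_commit(&src_dp);
            assert(src_dp.ref == 1);
            return 0;
    }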
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a2d32ce335a..12a19138531 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,17 +25,10 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 30#include "xfs_dinode.h"
36#include "xfs_inode.h" 31#include "xfs_inode.h"
37#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 32#include "xfs_alloc.h"
40#include "xfs_bmap.h" 33#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 34#include "xfs_rtalloc.h"
@@ -46,6 +39,7 @@
46#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
47#include "xfs_utils.h" 40#include "xfs_utils.h"
48#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_buf.h"
49 43
50 44
51/* 45/*
@@ -129,7 +123,7 @@ xfs_growfs_rt_alloc(
129 cancelflags |= XFS_TRANS_ABORT; 123 cancelflags |= XFS_TRANS_ABORT;
130 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, 124 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
131 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, 125 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
132 resblks, &map, &nmap, &flist, NULL); 126 resblks, &map, &nmap, &flist);
133 if (!error && nmap < 1) 127 if (!error && nmap < 1)
134 error = XFS_ERROR(ENOSPC); 128 error = XFS_ERROR(ENOSPC);
135 if (error) 129 if (error)
@@ -1890,13 +1884,13 @@ xfs_growfs_rt(
1890 /* 1884 /*
1891 * Read in the last block of the device, make sure it exists. 1885 * Read in the last block of the device, make sure it exists.
1892 */ 1886 */
1893 error = xfs_read_buf(mp, mp->m_rtdev_targp, 1887 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
1894 XFS_FSB_TO_BB(mp, nrblocks - 1), 1888 XFS_FSB_TO_BB(mp, nrblocks - 1),
1895 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1889 XFS_FSB_TO_B(mp, 1), 0);
1896 if (error) 1890 if (!bp)
1897 return error; 1891 return EIO;
1898 ASSERT(bp);
1899 xfs_buf_relse(bp); 1892 xfs_buf_relse(bp);
1893
1900 /* 1894 /*
1901 * Calculate new parameters. These are the final values to be reached. 1895 * Calculate new parameters. These are the final values to be reached.
1902 */ 1896 */
@@ -2222,7 +2216,6 @@ xfs_rtmount_init(
2222{ 2216{
2223 xfs_buf_t *bp; /* buffer for last block of subvolume */ 2217 xfs_buf_t *bp; /* buffer for last block of subvolume */
2224 xfs_daddr_t d; /* address of last block of subvolume */ 2218 xfs_daddr_t d; /* address of last block of subvolume */
2225 int error; /* error return value */
2226 xfs_sb_t *sbp; /* filesystem superblock copy in mount */ 2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */
2227 2220
2228 sbp = &mp->m_sb; 2221 sbp = &mp->m_sb;
@@ -2249,15 +2242,12 @@ xfs_rtmount_init(
2249 (unsigned long long) mp->m_sb.sb_rblocks); 2242 (unsigned long long) mp->m_sb.sb_rblocks);
2250 return XFS_ERROR(EFBIG); 2243 return XFS_ERROR(EFBIG);
2251 } 2244 }
2252 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2245 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
2253 d - XFS_FSB_TO_BB(mp, 1), 2246 d - XFS_FSB_TO_BB(mp, 1),
2254 XFS_FSB_TO_BB(mp, 1), 0, &bp); 2247 XFS_FSB_TO_B(mp, 1), 0);
2255 if (error) { 2248 if (!bp) {
2256 cmn_err(CE_WARN, 2249 cmn_err(CE_WARN, "XFS: realtime device size check failed");
2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2250 return EIO;
2258 if (error == ENOSPC)
2259 return XFS_ERROR(EFBIG);
2260 return error;
2261 } 2251 }
2262 xfs_buf_relse(bp); 2252 xfs_buf_relse(bp);
2263 return 0; 2253 return 0;
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index e336742a58a..56861d5daae 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -24,27 +24,12 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 29#include "xfs_dinode.h"
36#include "xfs_inode.h" 30#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_itable.h"
39#include "xfs_btree.h"
40#include "xfs_alloc.h"
41#include "xfs_ialloc.h"
42#include "xfs_attr.h"
43#include "xfs_bmap.h"
44#include "xfs_error.h" 31#include "xfs_error.h"
45#include "xfs_buf_item.h"
46#include "xfs_rw.h" 32#include "xfs_rw.h"
47#include "xfs_trace.h"
48 33
49/* 34/*
50 * Force a shutdown of the filesystem instantly while keeping 35 * Force a shutdown of the filesystem instantly while keeping
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1b017c65749..1eb2ba58681 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -80,10 +80,12 @@ struct xfs_mount;
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
83 84
84#define XFS_SB_VERSION2_OKREALFBITS \ 85#define XFS_SB_VERSION2_OKREALFBITS \
85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
86 XFS_SB_VERSION2_ATTR2BIT) 87 XFS_SB_VERSION2_ATTR2BIT | \
88 XFS_SB_VERSION2_PROJID32BIT)
87#define XFS_SB_VERSION2_OKSASHFBITS \ 89#define XFS_SB_VERSION2_OKSASHFBITS \
88 (0) 90 (0)
89#define XFS_SB_VERSION2_OKREALBITS \ 91#define XFS_SB_VERSION2_OKREALBITS \
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
495 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; 497 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
496} 498}
497 499
500static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
501{
502 return xfs_sb_version_hasmorebits(sbp) &&
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504}
505
498/* 506/*
499 * end of superblock version macros 507 * end of superblock version macros
500 */ 508 */
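A plausible use of the new feature check (hypothetical caller, constants
shortened): project IDs wider than 16 bits should only be accepted when the
superblock advertises PROJID32BIT.

    #include <stdio.h>

    #define MOREBITS        0x1     /* stands in for XFS_SB_VERSION_MOREBITSBIT */
    #define PROJID32BIT     0x80    /* mirrors XFS_SB_VERSION2_PROJID32BIT */

    struct sb_like { unsigned versionnum; unsigned features2; };

    static int hasprojid32bit(struct sb_like *sbp)
    {
            return (sbp->versionnum & MOREBITS) &&
                   (sbp->features2 & PROJID32BIT);
    }

    int main(void)
    {
            struct sb_like sb = { MOREBITS, PROJID32BIT };
            unsigned projid = 70000;                /* does not fit in 16 bits */

            if (projid > 0xffff && !hasprojid32bit(&sb))
                    return 1;                       /* kernel would return EINVAL */
            printf("32-bit project id %u accepted\n", projid);
            return 0;
    }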
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 28547dfce03..f6d956b7711 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved. 4 * All Rights Reserved.
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -24,16 +25,12 @@
24#include "xfs_trans.h" 25#include "xfs_trans.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_error.h" 29#include "xfs_error.h"
31#include "xfs_da_btree.h" 30#include "xfs_da_btree.h"
32#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h" 32#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h" 33#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 34#include "xfs_dinode.h"
38#include "xfs_inode.h" 35#include "xfs_inode.h"
39#include "xfs_btree.h" 36#include "xfs_btree.h"
@@ -47,6 +44,7 @@
47#include "xfs_trace.h" 44#include "xfs_trace.h"
48 45
49kmem_zone_t *xfs_trans_zone; 46kmem_zone_t *xfs_trans_zone;
47kmem_zone_t *xfs_log_item_desc_zone;
50 48
51 49
52/* 50/*
@@ -597,8 +595,7 @@ _xfs_trans_alloc(
597 tp->t_magic = XFS_TRANS_MAGIC; 595 tp->t_magic = XFS_TRANS_MAGIC;
598 tp->t_type = type; 596 tp->t_type = type;
599 tp->t_mountp = mp; 597 tp->t_mountp = mp;
600 tp->t_items_free = XFS_LIC_NUM_SLOTS; 598 INIT_LIST_HEAD(&tp->t_items);
601 xfs_lic_init(&(tp->t_items));
602 INIT_LIST_HEAD(&tp->t_busy); 599 INIT_LIST_HEAD(&tp->t_busy);
603 return tp; 600 return tp;
604} 601}
@@ -643,8 +640,7 @@ xfs_trans_dup(
643 ntp->t_magic = XFS_TRANS_MAGIC; 640 ntp->t_magic = XFS_TRANS_MAGIC;
644 ntp->t_type = tp->t_type; 641 ntp->t_type = tp->t_type;
645 ntp->t_mountp = tp->t_mountp; 642 ntp->t_mountp = tp->t_mountp;
646 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 643 INIT_LIST_HEAD(&ntp->t_items);
647 xfs_lic_init(&(ntp->t_items));
648 INIT_LIST_HEAD(&ntp->t_busy); 644 INIT_LIST_HEAD(&ntp->t_busy);
649 645
650 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 646 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -700,7 +696,7 @@ xfs_trans_reserve(
700 * fail if the count would go below zero. 696 * fail if the count would go below zero.
701 */ 697 */
702 if (blocks > 0) { 698 if (blocks > 0) {
703 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 699 error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
704 -((int64_t)blocks), rsvd); 700 -((int64_t)blocks), rsvd);
705 if (error != 0) { 701 if (error != 0) {
706 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -771,7 +767,7 @@ undo_log:
771 767
772undo_blocks: 768undo_blocks:
773 if (blocks > 0) { 769 if (blocks > 0) {
774 (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 770 xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
775 (int64_t)blocks, rsvd); 771 (int64_t)blocks, rsvd);
776 tp->t_blk_res = 0; 772 tp->t_blk_res = 0;
777 } 773 }
@@ -1013,7 +1009,7 @@ void
1013xfs_trans_unreserve_and_mod_sb( 1009xfs_trans_unreserve_and_mod_sb(
1014 xfs_trans_t *tp) 1010 xfs_trans_t *tp)
1015{ 1011{
1016 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ 1012 xfs_mod_sb_t msb[9]; /* If you add cases, add entries */
1017 xfs_mod_sb_t *msbp; 1013 xfs_mod_sb_t *msbp;
1018 xfs_mount_t *mp = tp->t_mountp; 1014 xfs_mount_t *mp = tp->t_mountp;
1019 /* REFERENCED */ 1015 /* REFERENCED */
@@ -1021,55 +1017,61 @@ xfs_trans_unreserve_and_mod_sb(
1021 int rsvd; 1017 int rsvd;
1022 int64_t blkdelta = 0; 1018 int64_t blkdelta = 0;
1023 int64_t rtxdelta = 0; 1019 int64_t rtxdelta = 0;
1020 int64_t idelta = 0;
1021 int64_t ifreedelta = 0;
1024 1022
1025 msbp = msb; 1023 msbp = msb;
1026 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 1024 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
1027 1025
1028 /* calculate free blocks delta */ 1026 /* calculate deltas */
1029 if (tp->t_blk_res > 0) 1027 if (tp->t_blk_res > 0)
1030 blkdelta = tp->t_blk_res; 1028 blkdelta = tp->t_blk_res;
1031
1032 if ((tp->t_fdblocks_delta != 0) && 1029 if ((tp->t_fdblocks_delta != 0) &&
1033 (xfs_sb_version_haslazysbcount(&mp->m_sb) || 1030 (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1034 (tp->t_flags & XFS_TRANS_SB_DIRTY))) 1031 (tp->t_flags & XFS_TRANS_SB_DIRTY)))
1035 blkdelta += tp->t_fdblocks_delta; 1032 blkdelta += tp->t_fdblocks_delta;
1036 1033
1037 if (blkdelta != 0) {
1038 msbp->msb_field = XFS_SBS_FDBLOCKS;
1039 msbp->msb_delta = blkdelta;
1040 msbp++;
1041 }
1042
1043 /* calculate free realtime extents delta */
1044 if (tp->t_rtx_res > 0) 1034 if (tp->t_rtx_res > 0)
1045 rtxdelta = tp->t_rtx_res; 1035 rtxdelta = tp->t_rtx_res;
1046
1047 if ((tp->t_frextents_delta != 0) && 1036 if ((tp->t_frextents_delta != 0) &&
1048 (tp->t_flags & XFS_TRANS_SB_DIRTY)) 1037 (tp->t_flags & XFS_TRANS_SB_DIRTY))
1049 rtxdelta += tp->t_frextents_delta; 1038 rtxdelta += tp->t_frextents_delta;
1050 1039
1040 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1041 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1042 idelta = tp->t_icount_delta;
1043 ifreedelta = tp->t_ifree_delta;
1044 }
1045
1046 /* apply the per-cpu counters */
1047 if (blkdelta) {
1048 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
1049 blkdelta, rsvd);
1050 if (error)
1051 goto out;
1052 }
1053
1054 if (idelta) {
1055 error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
1056 idelta, rsvd);
1057 if (error)
1058 goto out_undo_fdblocks;
1059 }
1060
1061 if (ifreedelta) {
1062 error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
1063 ifreedelta, rsvd);
1064 if (error)
1065 goto out_undo_icount;
1066 }
1067
1068 /* apply remaining deltas */
1051 if (rtxdelta != 0) { 1069 if (rtxdelta != 0) {
1052 msbp->msb_field = XFS_SBS_FREXTENTS; 1070 msbp->msb_field = XFS_SBS_FREXTENTS;
1053 msbp->msb_delta = rtxdelta; 1071 msbp->msb_delta = rtxdelta;
1054 msbp++; 1072 msbp++;
1055 } 1073 }
1056 1074
1057 /* apply remaining deltas */
1058
1059 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1060 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1061 if (tp->t_icount_delta != 0) {
1062 msbp->msb_field = XFS_SBS_ICOUNT;
1063 msbp->msb_delta = tp->t_icount_delta;
1064 msbp++;
1065 }
1066 if (tp->t_ifree_delta != 0) {
1067 msbp->msb_field = XFS_SBS_IFREE;
1068 msbp->msb_delta = tp->t_ifree_delta;
1069 msbp++;
1070 }
1071 }
1072
1073 if (tp->t_flags & XFS_TRANS_SB_DIRTY) { 1075 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
1074 if (tp->t_dblocks_delta != 0) { 1076 if (tp->t_dblocks_delta != 0) {
1075 msbp->msb_field = XFS_SBS_DBLOCKS; 1077 msbp->msb_field = XFS_SBS_DBLOCKS;
@@ -1119,7 +1121,125 @@ xfs_trans_unreserve_and_mod_sb(
1119 if (msbp > msb) { 1121 if (msbp > msb) {
1120 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, 1122 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
1121 (uint)(msbp - msb), rsvd); 1123 (uint)(msbp - msb), rsvd);
1122 ASSERT(error == 0); 1124 if (error)
1125 goto out_undo_ifreecount;
1126 }
1127
1128 return;
1129
1130out_undo_ifreecount:
1131 if (ifreedelta)
1132 xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
1133out_undo_icount:
1134 if (idelta)
1135 xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
1136out_undo_fdblocks:
1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out:
1140	ASSERT(error == 0);
1141 return;
1142}
1143
1144/*
1145 * Add the given log item to the transaction's list of log items.
1146 *
1147 * The log item will now point to its new descriptor with its li_desc field.
1148 */
1149void
1150xfs_trans_add_item(
1151 struct xfs_trans *tp,
1152 struct xfs_log_item *lip)
1153{
1154 struct xfs_log_item_desc *lidp;
1155
1156	ASSERT(lip->li_mountp == tp->t_mountp);
1157	ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
1158
1159 lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
1160
1161 lidp->lid_item = lip;
1162 lidp->lid_flags = 0;
1163 lidp->lid_size = 0;
1164 list_add_tail(&lidp->lid_trans, &tp->t_items);
1165
1166 lip->li_desc = lidp;
1167}
1168
1169STATIC void
1170xfs_trans_free_item_desc(
1171 struct xfs_log_item_desc *lidp)
1172{
1173 list_del_init(&lidp->lid_trans);
1174 kmem_zone_free(xfs_log_item_desc_zone, lidp);
1175}
1176
1177/*
1178 * Unlink and free the given descriptor.
1179 */
1180void
1181xfs_trans_del_item(
1182 struct xfs_log_item *lip)
1183{
1184 xfs_trans_free_item_desc(lip->li_desc);
1185 lip->li_desc = NULL;
1186}
1187
1188/*
1189 * Unlock all of the items of a transaction and free all the descriptors
1190 * of that transaction.
1191 */
1192void
1193xfs_trans_free_items(
1194 struct xfs_trans *tp,
1195 xfs_lsn_t commit_lsn,
1196 int flags)
1197{
1198 struct xfs_log_item_desc *lidp, *next;
1199
1200 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1201 struct xfs_log_item *lip = lidp->lid_item;
1202
1203 lip->li_desc = NULL;
1204
1205 if (commit_lsn != NULLCOMMITLSN)
1206 IOP_COMMITTING(lip, commit_lsn);
1207 if (flags & XFS_TRANS_ABORT)
1208 lip->li_flags |= XFS_LI_ABORTED;
1209 IOP_UNLOCK(lip);
1210
1211 xfs_trans_free_item_desc(lidp);
1212 }
1213}
1214
1215/*
1216 * Unlock the items associated with a transaction.
1217 *
1218 * Items which were not logged should be freed. Those which were logged must
1219 * still be tracked so they can be unpinned when the transaction commits.
1220 */
1221STATIC void
1222xfs_trans_unlock_items(
1223 struct xfs_trans *tp,
1224 xfs_lsn_t commit_lsn)
1225{
1226 struct xfs_log_item_desc *lidp, *next;
1227
1228 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1229 struct xfs_log_item *lip = lidp->lid_item;
1230
1231 lip->li_desc = NULL;
1232
1233 if (commit_lsn != NULLCOMMITLSN)
1234 IOP_COMMITTING(lip, commit_lsn);
1235 IOP_UNLOCK(lip);
1236
1237 /*
1238 * Free the descriptor if the item is not dirty
1239 * within this transaction.
1240 */
1241 if (!(lidp->lid_flags & XFS_LID_DIRTY))
1242 xfs_trans_free_item_desc(lidp);
1123 } 1243 }
1124} 1244}
1125 1245
@@ -1134,30 +1254,27 @@ xfs_trans_count_vecs(
1134 struct xfs_trans *tp) 1254 struct xfs_trans *tp)
1135{ 1255{
1136 int nvecs; 1256 int nvecs;
1137 xfs_log_item_desc_t *lidp; 1257 struct xfs_log_item_desc *lidp;
1138 1258
1139 nvecs = 1; 1259 nvecs = 1;
1140 lidp = xfs_trans_first_item(tp);
1141 ASSERT(lidp != NULL);
1142 1260
1143 /* In the non-debug case we need to start bailing out if we 1261 /* In the non-debug case we need to start bailing out if we
1144 * didn't find a log_item here, return zero and let trans_commit 1262 * didn't find a log_item here, return zero and let trans_commit
1145 * deal with it. 1263 * deal with it.
1146 */ 1264 */
1147 if (lidp == NULL) 1265 if (list_empty(&tp->t_items)) {
1266 ASSERT(0);
1148 return 0; 1267 return 0;
1268 }
1149 1269
1150 while (lidp != NULL) { 1270 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
1151 /* 1271 /*
1152 * Skip items which aren't dirty in this transaction. 1272 * Skip items which aren't dirty in this transaction.
1153 */ 1273 */
1154 if (!(lidp->lid_flags & XFS_LID_DIRTY)) { 1274 if (!(lidp->lid_flags & XFS_LID_DIRTY))
1155 lidp = xfs_trans_next_item(tp, lidp);
1156 continue; 1275 continue;
1157 }
1158 lidp->lid_size = IOP_SIZE(lidp->lid_item); 1276 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1159 nvecs += lidp->lid_size; 1277 nvecs += lidp->lid_size;
1160 lidp = xfs_trans_next_item(tp, lidp);
1161 } 1278 }
1162 1279
1163 return nvecs; 1280 return nvecs;
@@ -1177,7 +1294,7 @@ xfs_trans_fill_vecs(
1177 struct xfs_trans *tp, 1294 struct xfs_trans *tp,
1178 struct xfs_log_iovec *log_vector) 1295 struct xfs_log_iovec *log_vector)
1179{ 1296{
1180 xfs_log_item_desc_t *lidp; 1297 struct xfs_log_item_desc *lidp;
1181 struct xfs_log_iovec *vecp; 1298 struct xfs_log_iovec *vecp;
1182 uint nitems; 1299 uint nitems;
1183 1300
@@ -1188,14 +1305,11 @@ xfs_trans_fill_vecs(
1188 vecp = log_vector + 1; 1305 vecp = log_vector + 1;
1189 1306
1190 nitems = 0; 1307 nitems = 0;
1191 lidp = xfs_trans_first_item(tp); 1308 ASSERT(!list_empty(&tp->t_items));
1192 ASSERT(lidp); 1309 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
1193 while (lidp) {
1194 /* Skip items which aren't dirty in this transaction. */ 1310 /* Skip items which aren't dirty in this transaction. */
1195 if (!(lidp->lid_flags & XFS_LID_DIRTY)) { 1311 if (!(lidp->lid_flags & XFS_LID_DIRTY))
1196 lidp = xfs_trans_next_item(tp, lidp);
1197 continue; 1312 continue;
1198 }
1199 1313
1200 /* 1314 /*
1201 * The item may be marked dirty but not log anything. This can 1315 * The item may be marked dirty but not log anything. This can
@@ -1206,7 +1320,6 @@ xfs_trans_fill_vecs(
1206 IOP_FORMAT(lidp->lid_item, vecp); 1320 IOP_FORMAT(lidp->lid_item, vecp);
1207 vecp += lidp->lid_size; 1321 vecp += lidp->lid_size;
1208 IOP_PIN(lidp->lid_item); 1322 IOP_PIN(lidp->lid_item);
1209 lidp = xfs_trans_next_item(tp, lidp);
1210 } 1323 }
1211 1324
1212 /* 1325 /*
@@ -1284,7 +1397,7 @@ xfs_trans_item_committed(
1284 * log item flags, if anyone else stales the buffer we do not want to 1397 * log item flags, if anyone else stales the buffer we do not want to
1285 * pay any attention to it. 1398 * pay any attention to it.
1286 */ 1399 */
1287 IOP_UNPIN(lip); 1400 IOP_UNPIN(lip, 0);
1288} 1401}
1289 1402
1290/* 1403/*
@@ -1298,27 +1411,15 @@ xfs_trans_item_committed(
1298 */ 1411 */
1299STATIC void 1412STATIC void
1300xfs_trans_committed( 1413xfs_trans_committed(
1301 struct xfs_trans *tp, 1414 void *arg,
1302 int abortflag) 1415 int abortflag)
1303{ 1416{
1304 xfs_log_item_desc_t *lidp; 1417 struct xfs_trans *tp = arg;
1305 xfs_log_item_chunk_t *licp; 1418 struct xfs_log_item_desc *lidp, *next;
1306 xfs_log_item_chunk_t *next_licp;
1307
1308 /* Call the transaction's completion callback if there is one. */
1309 if (tp->t_callback != NULL)
1310 tp->t_callback(tp, tp->t_callarg);
1311 1419
1312 for (lidp = xfs_trans_first_item(tp); 1420 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1313 lidp != NULL;
1314 lidp = xfs_trans_next_item(tp, lidp)) {
1315 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); 1421 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
1316 } 1422 xfs_trans_free_item_desc(lidp);
1317
1318 /* free the item chunks, ignoring the embedded chunk */
1319 for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
1320 next_licp = licp->lic_next;
1321 kmem_free(licp);
1322 } 1423 }
1323 1424
1324 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
@@ -1333,16 +1434,14 @@ xfs_trans_uncommit(
1333 struct xfs_trans *tp, 1434 struct xfs_trans *tp,
1334 uint flags) 1435 uint flags)
1335{ 1436{
1336 xfs_log_item_desc_t *lidp; 1437 struct xfs_log_item_desc *lidp;
1337 1438
1338 for (lidp = xfs_trans_first_item(tp); 1439 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
1339 lidp != NULL;
1340 lidp = xfs_trans_next_item(tp, lidp)) {
1341 /* 1440 /*
1342 * Unpin all but those that aren't dirty. 1441 * Unpin all but those that aren't dirty.
1343 */ 1442 */
1344 if (lidp->lid_flags & XFS_LID_DIRTY) 1443 if (lidp->lid_flags & XFS_LID_DIRTY)
1345 IOP_UNPIN_REMOVE(lidp->lid_item, tp); 1444 IOP_UNPIN(lidp->lid_item, 1);
1346 } 1445 }
1347 1446
1348 xfs_trans_unreserve_and_mod_sb(tp); 1447 xfs_trans_unreserve_and_mod_sb(tp);
@@ -1445,7 +1544,7 @@ xfs_trans_commit_iclog(
1445 * running in simulation mode (the log is explicitly turned 1544 * running in simulation mode (the log is explicitly turned
1446 * off). 1545 * off).
1447 */ 1546 */
1448 tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; 1547 tp->t_logcb.cb_func = xfs_trans_committed;
1449 tp->t_logcb.cb_arg = tp; 1548 tp->t_logcb.cb_arg = tp;
1450 1549
1451 /* 1550 /*
@@ -1508,33 +1607,28 @@ STATIC struct xfs_log_vec *
1508xfs_trans_alloc_log_vecs( 1607xfs_trans_alloc_log_vecs(
1509 xfs_trans_t *tp) 1608 xfs_trans_t *tp)
1510{ 1609{
1511 xfs_log_item_desc_t *lidp; 1610 struct xfs_log_item_desc *lidp;
1512 struct xfs_log_vec *lv = NULL; 1611 struct xfs_log_vec *lv = NULL;
1513 struct xfs_log_vec *ret_lv = NULL; 1612 struct xfs_log_vec *ret_lv = NULL;
1514 1613
1515 lidp = xfs_trans_first_item(tp);
1516 1614
1517 /* Bail out if we didn't find a log item. */ 1615 /* Bail out if we didn't find a log item. */
1518 if (!lidp) { 1616 if (list_empty(&tp->t_items)) {
1519 ASSERT(0); 1617 ASSERT(0);
1520 return NULL; 1618 return NULL;
1521 } 1619 }
1522 1620
1523 while (lidp != NULL) { 1621 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
1524 struct xfs_log_vec *new_lv; 1622 struct xfs_log_vec *new_lv;
1525 1623
1526 /* Skip items which aren't dirty in this transaction. */ 1624 /* Skip items which aren't dirty in this transaction. */
1527 if (!(lidp->lid_flags & XFS_LID_DIRTY)) { 1625 if (!(lidp->lid_flags & XFS_LID_DIRTY))
1528 lidp = xfs_trans_next_item(tp, lidp);
1529 continue; 1626 continue;
1530 }
1531 1627
1532 /* Skip items that do not have any vectors for writing */ 1628 /* Skip items that do not have any vectors for writing */
1533 lidp->lid_size = IOP_SIZE(lidp->lid_item); 1629 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1534 if (!lidp->lid_size) { 1630 if (!lidp->lid_size)
1535 lidp = xfs_trans_next_item(tp, lidp);
1536 continue; 1631 continue;
1537 }
1538 1632
1539 new_lv = kmem_zalloc(sizeof(*new_lv) + 1633 new_lv = kmem_zalloc(sizeof(*new_lv) +
1540 lidp->lid_size * sizeof(struct xfs_log_iovec), 1634 lidp->lid_size * sizeof(struct xfs_log_iovec),
@@ -1549,7 +1643,6 @@ xfs_trans_alloc_log_vecs(
1549 else 1643 else
1550 lv->lv_next = new_lv; 1644 lv->lv_next = new_lv;
1551 lv = new_lv; 1645 lv = new_lv;
1552 lidp = xfs_trans_next_item(tp, lidp);
1553 } 1646 }
1554 1647
1555 return ret_lv; 1648 return ret_lv;
@@ -1579,9 +1672,6 @@ xfs_trans_commit_cil(
1579 return error; 1672 return error;
1580 1673
1581 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1674 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1582
1583 /* xfs_trans_free_items() unlocks them first */
1584 xfs_trans_free_items(tp, *commit_lsn, 0);
1585 xfs_trans_free(tp); 1675 xfs_trans_free(tp);
1586 return 0; 1676 return 0;
1587} 1677}
@@ -1708,12 +1798,6 @@ xfs_trans_cancel(
1708 int flags) 1798 int flags)
1709{ 1799{
1710 int log_flags; 1800 int log_flags;
1711#ifdef DEBUG
1712 xfs_log_item_chunk_t *licp;
1713 xfs_log_item_desc_t *lidp;
1714 xfs_log_item_t *lip;
1715 int i;
1716#endif
1717 xfs_mount_t *mp = tp->t_mountp; 1801 xfs_mount_t *mp = tp->t_mountp;
1718 1802
1719 /* 1803 /*
@@ -1732,21 +1816,11 @@ xfs_trans_cancel(
1732 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1816 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1733 } 1817 }
1734#ifdef DEBUG 1818#ifdef DEBUG
1735 if (!(flags & XFS_TRANS_ABORT)) { 1819 if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
1736 licp = &(tp->t_items); 1820 struct xfs_log_item_desc *lidp;
1737 while (licp != NULL) { 1821
1738 lidp = licp->lic_descs; 1822 list_for_each_entry(lidp, &tp->t_items, lid_trans)
1739 for (i = 0; i < licp->lic_unused; i++, lidp++) { 1823 ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
1740 if (xfs_lic_isfree(licp, i)) {
1741 continue;
1742 }
1743
1744 lip = lidp->lid_item;
1745 if (!XFS_FORCED_SHUTDOWN(mp))
1746 ASSERT(!(lip->li_type == XFS_LI_EFD));
1747 }
1748 licp = licp->lic_next;
1749 }
1750 } 1824 }
1751#endif 1825#endif
1752 xfs_trans_unreserve_and_mod_sb(tp); 1826 xfs_trans_unreserve_and_mod_sb(tp);
@@ -1834,7 +1908,6 @@ xfs_trans_roll(
1834 if (error) 1908 if (error)
1835 return error; 1909 return error;
1836 1910
1837 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); 1911 xfs_trans_ijoin(trans, dp);
1838 xfs_trans_ihold(trans, dp);
1839 return 0; 1912 return 0;
1840} 1913}
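/*
 * A minimal sketch (not part of the patch) of the traversal pattern the
 * xfs_trans.c conversion above settles on: with tp->t_items now a
 * list_head, the old xfs_trans_first_item()/xfs_trans_next_item() chunk
 * walk collapses into a single list_for_each_entry(). Assumes
 * <linux/list.h> and the structures this series defines.
 */
static uint
count_dirty_log_items(
	struct xfs_trans	*tp)
{
	struct xfs_log_item_desc *lidp;
	uint			nitems = 0;

	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
		/* skip items not dirtied by this transaction */
		if (!(lidp->lid_flags & XFS_LID_DIRTY))
			continue;
		nitems++;
	}
	return nitems;
}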
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e639e8e9a2a..246286b77a8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -161,105 +161,14 @@ typedef struct xfs_trans_header {
161 * the amount of space needed to log the item it describes 161 * the amount of space needed to log the item it describes
162 * once we get to commit processing (see xfs_trans_commit()). 162 * once we get to commit processing (see xfs_trans_commit()).
163 */ 163 */
164typedef struct xfs_log_item_desc { 164struct xfs_log_item_desc {
165 struct xfs_log_item *lid_item; 165 struct xfs_log_item *lid_item;
166 ushort lid_size; 166 ushort lid_size;
167 unsigned char lid_flags; 167 unsigned char lid_flags;
168 unsigned char lid_index; 168 struct list_head lid_trans;
169} xfs_log_item_desc_t; 169};
170 170
171#define XFS_LID_DIRTY 0x1 171#define XFS_LID_DIRTY 0x1
172#define XFS_LID_PINNED 0x2
173
174/*
175 * This structure is used to maintain a chunk list of log_item_desc
176 * structures. The free field is a bitmask indicating which descriptors
177 * in this chunk's array are free. The unused field is the first value
178 * not used since this chunk was allocated.
179 */
180#define XFS_LIC_NUM_SLOTS 15
181typedef struct xfs_log_item_chunk {
182 struct xfs_log_item_chunk *lic_next;
183 ushort lic_free;
184 ushort lic_unused;
185 xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS];
186} xfs_log_item_chunk_t;
187
188#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1)
189#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1)
190
191
192/*
193 * Initialize the given chunk. Set the chunk's free descriptor mask
194 * to indicate that all descriptors are free. The caller gets to set
195 * lic_unused to the right value (0 matches all free). The
196 * lic_descs.lid_index values are set up as each desc is allocated.
197 */
198static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
199{
200 cp->lic_free = XFS_LIC_FREEMASK;
201}
202
203static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
204{
205 cp->lic_descs[slot].lid_index = (unsigned char)(slot);
206}
207
208static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
209{
210 return cp->lic_free & XFS_LIC_FREEMASK;
211}
212
213static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
214{
215 cp->lic_free = XFS_LIC_FREEMASK;
216}
217
218static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
219{
220 return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
221}
222
223static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
224{
225 return (cp->lic_free & (1 << slot));
226}
227
228static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
229{
230 cp->lic_free &= ~(1 << slot);
231}
232
233static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
234{
235 cp->lic_free |= 1 << slot;
236}
237
238static inline xfs_log_item_desc_t *
239xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
240{
241 return &(cp->lic_descs[slot]);
242}
243
244static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
245{
246 return (uint)dp->lid_index;
247}
248
249/*
250 * Calculate the address of a chunk given a descriptor pointer:
251 * dp - dp->lid_index give the address of the start of the lic_descs array.
252 * From this we subtract the offset of the lic_descs field in a chunk.
253 * All of this yields the address of the chunk, which is
254 * cast to a chunk pointer.
255 */
256static inline xfs_log_item_chunk_t *
257xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
258{
259 return (xfs_log_item_chunk_t*) \
260 (((xfs_caddr_t)((dp) - (dp)->lid_index)) - \
261 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
262}
263 172
264#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ 173#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
265/* 174/*
@@ -275,8 +184,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
275/* 184/*
276 * Values for call flags parameter. 185 * Values for call flags parameter.
277 */ 186 */
278#define XFS_TRANS_NOSLEEP 0x1
279#define XFS_TRANS_WAIT 0x2
280#define XFS_TRANS_RELEASE_LOG_RES 0x4 187#define XFS_TRANS_RELEASE_LOG_RES 0x4
281#define XFS_TRANS_ABORT 0x8 188#define XFS_TRANS_ABORT 0x8
282 189
@@ -438,8 +345,7 @@ typedef struct xfs_item_ops {
438 uint (*iop_size)(xfs_log_item_t *); 345 uint (*iop_size)(xfs_log_item_t *);
439 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); 346 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
440 void (*iop_pin)(xfs_log_item_t *); 347 void (*iop_pin)(xfs_log_item_t *);
441 void (*iop_unpin)(xfs_log_item_t *); 348 void (*iop_unpin)(xfs_log_item_t *, int remove);
442 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
443 uint (*iop_trylock)(xfs_log_item_t *); 349 uint (*iop_trylock)(xfs_log_item_t *);
444 void (*iop_unlock)(xfs_log_item_t *); 350 void (*iop_unlock)(xfs_log_item_t *);
445 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); 351 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
@@ -451,8 +357,7 @@ typedef struct xfs_item_ops {
451#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) 357#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
452#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) 358#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
453#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) 359#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
454#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip) 360#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
455#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
456#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) 361#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
457#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) 362#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
458#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) 363#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
@@ -494,8 +399,6 @@ typedef struct xfs_trans {
494 * transaction. */ 399 * transaction. */
495 struct xfs_mount *t_mountp; /* ptr to fs mount struct */ 400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
496 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ 401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
497 xfs_trans_callback_t t_callback; /* transaction callback */
498 void *t_callarg; /* callback arg */
499 unsigned int t_flags; /* misc flags */ 402 unsigned int t_flags; /* misc flags */
500 int64_t t_icount_delta; /* superblock icount change */ 403 int64_t t_icount_delta; /* superblock icount change */
501 int64_t t_ifree_delta; /* superblock ifree change */ 404 int64_t t_ifree_delta; /* superblock ifree change */
@@ -516,8 +419,7 @@ typedef struct xfs_trans {
516 int64_t t_rblocks_delta;/* superblock rblocks change */ 419 int64_t t_rblocks_delta;/* superblock rblocks change */
517 int64_t t_rextents_delta;/* superblocks rextents chg */ 420 int64_t t_rextents_delta;/* superblocks rextents chg */
518 int64_t t_rextslog_delta;/* superblocks rextslog chg */ 421 int64_t t_rextslog_delta;/* superblocks rextslog chg */
519 unsigned int t_items_free; /* log item descs free */ 422 struct list_head t_items; /* log item descriptors */
520 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
521 xfs_trans_header_t t_header; /* header for in-log trans */ 423 xfs_trans_header_t t_header; /* header for in-log trans */
522 struct list_head t_busy; /* list of busy extents */ 424 struct list_head t_busy; /* list of busy extents */
523 unsigned long t_pflags; /* saved process flags state */ 425 unsigned long t_pflags; /* saved process flags state */
@@ -569,8 +471,9 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
569void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
570int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, 472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
571 xfs_ino_t , uint, uint, struct xfs_inode **); 473 xfs_ino_t , uint, uint, struct xfs_inode **);
572void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint); 474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
573void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *); 475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
574void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 477void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
575void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); 478void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
576struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); 479struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -595,6 +498,7 @@ int xfs_trans_ail_init(struct xfs_mount *);
595void xfs_trans_ail_destroy(struct xfs_mount *); 498void xfs_trans_ail_destroy(struct xfs_mount *);
596 499
597extern kmem_zone_t *xfs_trans_zone; 500extern kmem_zone_t *xfs_trans_zone;
501extern kmem_zone_t *xfs_log_item_desc_zone;
598 502
599#endif /* __KERNEL__ */ 503#endif /* __KERNEL__ */
600 504
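/*
 * Descriptors now come from a dedicated slab zone instead of the
 * bitmap-managed chunks removed above. A sketch of the matching
 * setup/teardown, assumed to sit next to the existing xfs_trans_zone
 * handling in the module init path (the actual init site is outside
 * the hunks shown here, so treat the label as hypothetical):
 */
	xfs_log_item_desc_zone = kmem_zone_init(
			sizeof(struct xfs_log_item_desc),
			"xfs_log_item_desc");
	if (!xfs_log_item_desc_zone)
		goto out_destroy_trans_zone;	/* hypothetical unwind label */

	/* ...and on module teardown: */
	kmem_zone_destroy(xfs_log_item_desc_zone);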
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index e799824f724..dc9069568ff 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -24,7 +24,6 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
30#include "xfs_error.h" 29#include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 63d81a22f4f..c47918c302a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -24,14 +24,10 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_buf_item.h" 33#include "xfs_buf_item.h"
@@ -51,36 +47,17 @@ xfs_trans_buf_item_match(
51 xfs_daddr_t blkno, 47 xfs_daddr_t blkno,
52 int len) 48 int len)
53{ 49{
54 xfs_log_item_chunk_t *licp; 50 struct xfs_log_item_desc *lidp;
55 xfs_log_item_desc_t *lidp; 51 struct xfs_buf_log_item *blip;
56 xfs_buf_log_item_t *blip;
57 int i;
58 52
59 len = BBTOB(len); 53 len = BBTOB(len);
60 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { 54 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
61 if (xfs_lic_are_all_free(licp)) { 55 blip = (struct xfs_buf_log_item *)lidp->lid_item;
62 ASSERT(licp == &tp->t_items); 56 if (blip->bli_item.li_type == XFS_LI_BUF &&
63 ASSERT(licp->lic_next == NULL); 57 XFS_BUF_TARGET(blip->bli_buf) == target &&
64 return NULL; 58 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
65 } 59 XFS_BUF_COUNT(blip->bli_buf) == len)
66 60 return blip->bli_buf;
67 for (i = 0; i < licp->lic_unused; i++) {
68 /*
69 * Skip unoccupied slots.
70 */
71 if (xfs_lic_isfree(licp, i))
72 continue;
73
74 lidp = xfs_lic_slot(licp, i);
75 blip = (xfs_buf_log_item_t *)lidp->lid_item;
76 if (blip->bli_item.li_type != XFS_LI_BUF)
77 continue;
78
79 if (XFS_BUF_TARGET(blip->bli_buf) == target &&
80 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
81 XFS_BUF_COUNT(blip->bli_buf) == len)
82 return blip->bli_buf;
83 }
84 } 61 }
85 62
86 return NULL; 63 return NULL;
@@ -127,7 +104,7 @@ _xfs_trans_bjoin(
127 /* 104 /*
128 * Get a log_item_desc to point at the new item. 105 * Get a log_item_desc to point at the new item.
129 */ 106 */
130 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); 107 xfs_trans_add_item(tp, &bip->bli_item);
131 108
132 /* 109 /*
133 * Initialize b_fsprivate2 so we can find it with incore_match() 110 * Initialize b_fsprivate2 so we can find it with incore_match()
@@ -359,7 +336,7 @@ xfs_trans_read_buf(
359 ASSERT(!XFS_BUF_ISASYNC(bp)); 336 ASSERT(!XFS_BUF_ISASYNC(bp));
360 XFS_BUF_READ(bp); 337 XFS_BUF_READ(bp);
361 xfsbdstrat(tp->t_mountp, bp); 338 xfsbdstrat(tp->t_mountp, bp);
362 error = xfs_iowait(bp); 339 error = xfs_buf_iowait(bp);
363 if (error) { 340 if (error) {
364 xfs_ioerror_alert("xfs_trans_read_buf", mp, 341 xfs_ioerror_alert("xfs_trans_read_buf", mp,
365 bp, blkno); 342 bp, blkno);
@@ -483,7 +460,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
483{ 460{
484 xfs_buf_log_item_t *bip; 461 xfs_buf_log_item_t *bip;
485 xfs_log_item_t *lip; 462 xfs_log_item_t *lip;
486 xfs_log_item_desc_t *lidp;
487 463
488 /* 464 /*
489 * Default to a normal brelse() call if the tp is NULL. 465 * Default to a normal brelse() call if the tp is NULL.
@@ -514,13 +490,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
514 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); 490 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
515 ASSERT(atomic_read(&bip->bli_refcount) > 0); 491 ASSERT(atomic_read(&bip->bli_refcount) > 0);
516 492
517 /*
518 * Find the item descriptor pointing to this buffer's
519 * log item. It must be there.
520 */
521 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
522 ASSERT(lidp != NULL);
523
524 trace_xfs_trans_brelse(bip); 493 trace_xfs_trans_brelse(bip);
525 494
526 /* 495 /*
@@ -536,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
536 * If the buffer is dirty within this transaction, we can't 505 * If the buffer is dirty within this transaction, we can't
537 * release it until we commit. 506 * release it until we commit.
538 */ 507 */
539 if (lidp->lid_flags & XFS_LID_DIRTY) 508 if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
540 return; 509 return;
541 510
542 /* 511 /*
@@ -553,7 +522,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
553 /* 522 /*
554 * Free up the log item descriptor tracking the released item. 523 * Free up the log item descriptor tracking the released item.
555 */ 524 */
556 xfs_trans_free_item(tp, lidp); 525 xfs_trans_del_item(&bip->bli_item);
557 526
558 /* 527 /*
559 * Clear the hold flag in the buf log item if it is set. 528 * Clear the hold flag in the buf log item if it is set.
@@ -665,7 +634,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
665 uint last) 634 uint last)
666{ 635{
667 xfs_buf_log_item_t *bip; 636 xfs_buf_log_item_t *bip;
668 xfs_log_item_desc_t *lidp;
669 637
670 ASSERT(XFS_BUF_ISBUSY(bp)); 638 ASSERT(XFS_BUF_ISBUSY(bp));
671 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 639 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -690,7 +658,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
690 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 658 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
691 ASSERT(atomic_read(&bip->bli_refcount) > 0); 659 ASSERT(atomic_read(&bip->bli_refcount) > 0);
692 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 660 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
693 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; 661 bip->bli_item.li_cb = xfs_buf_iodone;
694 662
695 trace_xfs_trans_log_buf(bip); 663 trace_xfs_trans_log_buf(bip);
696 664
@@ -707,11 +675,8 @@ xfs_trans_log_buf(xfs_trans_t *tp,
707 bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; 675 bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
708 } 676 }
709 677
710 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
711 ASSERT(lidp != NULL);
712
713 tp->t_flags |= XFS_TRANS_DIRTY; 678 tp->t_flags |= XFS_TRANS_DIRTY;
714 lidp->lid_flags |= XFS_LID_DIRTY; 679 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
715 bip->bli_flags |= XFS_BLI_LOGGED; 680 bip->bli_flags |= XFS_BLI_LOGGED;
716 xfs_buf_item_log(bip, first, last); 681 xfs_buf_item_log(bip, first, last);
717} 682}
@@ -740,7 +705,6 @@ xfs_trans_binval(
740 xfs_trans_t *tp, 705 xfs_trans_t *tp,
741 xfs_buf_t *bp) 706 xfs_buf_t *bp)
742{ 707{
743 xfs_log_item_desc_t *lidp;
744 xfs_buf_log_item_t *bip; 708 xfs_buf_log_item_t *bip;
745 709
746 ASSERT(XFS_BUF_ISBUSY(bp)); 710 ASSERT(XFS_BUF_ISBUSY(bp));
@@ -748,8 +712,6 @@ xfs_trans_binval(
748 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 712 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
749 713
750 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 714 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
751 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
752 ASSERT(lidp != NULL);
753 ASSERT(atomic_read(&bip->bli_refcount) > 0); 715 ASSERT(atomic_read(&bip->bli_refcount) > 0);
754 716
755 trace_xfs_trans_binval(bip); 717 trace_xfs_trans_binval(bip);
@@ -764,7 +726,7 @@ xfs_trans_binval(
764 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 726 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
765 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); 727 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
766 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); 728 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
767 ASSERT(lidp->lid_flags & XFS_LID_DIRTY); 729 ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
768 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 730 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
769 return; 731 return;
770 } 732 }
@@ -797,7 +759,7 @@ xfs_trans_binval(
797 bip->bli_format.blf_flags |= XFS_BLF_CANCEL; 759 bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
798 memset((char *)(bip->bli_format.blf_data_map), 0, 760 memset((char *)(bip->bli_format.blf_data_map), 0,
799 (bip->bli_format.blf_map_size * sizeof(uint))); 761 (bip->bli_format.blf_map_size * sizeof(uint)));
800 lidp->lid_flags |= XFS_LID_DIRTY; 762 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
801 tp->t_flags |= XFS_TRANS_DIRTY; 763 tp->t_flags |= XFS_TRANS_DIRTY;
802} 764}
803 765
@@ -853,12 +815,9 @@ xfs_trans_stale_inode_buf(
853 ASSERT(atomic_read(&bip->bli_refcount) > 0); 815 ASSERT(atomic_read(&bip->bli_refcount) > 0);
854 816
855 bip->bli_flags |= XFS_BLI_STALE_INODE; 817 bip->bli_flags |= XFS_BLI_STALE_INODE;
856 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) 818 bip->bli_item.li_cb = xfs_buf_iodone;
857 xfs_buf_iodone;
858} 819}
859 820
860
861
862/* 821/*
863 * Mark the buffer as being one which contains newly allocated 822 * Mark the buffer as being one which contains newly allocated
864 * inodes. We need to make sure that even if this buffer is 823 * inodes. We need to make sure that even if this buffer is
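/*
 * Every former xfs_trans_find_item() caller in this file now reaches the
 * descriptor directly through li_desc, so the lookup-and-assert dance is
 * gone. A hypothetical helper capturing the dirtying idiom the patch
 * open-codes at each call site (not part of the patch):
 */
static inline void
xfs_trans_dirty_item(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	tp->t_flags |= XFS_TRANS_DIRTY;
	lip->li_desc->lid_flags |= XFS_LID_DIRTY;
}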
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 27cce2a9c7e..f783d5e9fa7 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -23,7 +23,6 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
29#include "xfs_extfree_item.h" 28#include "xfs_extfree_item.h"
@@ -49,9 +48,8 @@ xfs_trans_get_efi(xfs_trans_t *tp,
49 /* 48 /*
50 * Get a log_item_desc to point at the new item. 49 * Get a log_item_desc to point at the new item.
51 */ 50 */
52 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip); 51 xfs_trans_add_item(tp, &efip->efi_item);
53 52 return efip;
54 return (efip);
55} 53}
56 54
57/* 55/*
@@ -65,15 +63,11 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
65 xfs_fsblock_t start_block, 63 xfs_fsblock_t start_block,
66 xfs_extlen_t ext_len) 64 xfs_extlen_t ext_len)
67{ 65{
68 xfs_log_item_desc_t *lidp;
69 uint next_extent; 66 uint next_extent;
70 xfs_extent_t *extp; 67 xfs_extent_t *extp;
71 68
72 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
73 ASSERT(lidp != NULL);
74
75 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
76 lidp->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
77 71
78 next_extent = efip->efi_next_extent; 72 next_extent = efip->efi_next_extent;
79 ASSERT(next_extent < efip->efi_format.efi_nextents); 73 ASSERT(next_extent < efip->efi_format.efi_nextents);
@@ -106,9 +100,8 @@ xfs_trans_get_efd(xfs_trans_t *tp,
106 /* 100 /*
107 * Get a log_item_desc to point at the new item. 101 * Get a log_item_desc to point at the new item.
108 */ 102 */
109 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp); 103 xfs_trans_add_item(tp, &efdp->efd_item);
110 104 return efdp;
111 return (efdp);
112} 105}
113 106
114/* 107/*
@@ -122,15 +115,11 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp,
122 xfs_fsblock_t start_block, 115 xfs_fsblock_t start_block,
123 xfs_extlen_t ext_len) 116 xfs_extlen_t ext_len)
124{ 117{
125 xfs_log_item_desc_t *lidp;
126 uint next_extent; 118 uint next_extent;
127 xfs_extent_t *extp; 119 xfs_extent_t *extp;
128 120
129 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
130 ASSERT(lidp != NULL);
131
132 tp->t_flags |= XFS_TRANS_DIRTY; 121 tp->t_flags |= XFS_TRANS_DIRTY;
133 lidp->lid_flags |= XFS_LID_DIRTY; 122 efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
134 123
135 next_extent = efdp->efd_next_extent; 124 next_extent = efdp->efd_next_extent;
136 ASSERT(next_extent < efdp->efd_format.efd_nextents); 125 ASSERT(next_extent < efdp->efd_format.efd_nextents);
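/*
 * Sketch of how the EFI/EFD helpers above pair up when freeing an extent.
 * In real callers the EFD is logged in the follow-up transaction after a
 * roll; both halves appear together here only to illustrate the API
 * (names from the patch, error handling omitted):
 */
static void
example_log_extent_free(
	struct xfs_trans	*tp,
	xfs_fsblock_t		start_block,
	xfs_extlen_t		ext_len)
{
	struct xfs_efi_log_item	*efip;
	struct xfs_efd_log_item	*efdp;

	efip = xfs_trans_get_efi(tp, 1);
	xfs_trans_log_efi_extent(tp, efip, start_block, ext_len);

	/* ...after xfs_trans_roll() in a real caller... */
	efdp = xfs_trans_get_efd(tp, efip, 1);
	xfs_trans_log_efd_extent(tp, efdp, start_block, ext_len);
}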
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 2559dfec946..ccb34532768 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -24,20 +24,16 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_btree.h" 33#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_trans_priv.h" 34#include "xfs_trans_priv.h"
40#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
36#include "xfs_trace.h"
41 37
42#ifdef XFS_TRANS_DEBUG 38#ifdef XFS_TRANS_DEBUG
43STATIC void 39STATIC void
@@ -47,7 +43,6 @@ xfs_trans_inode_broot_debug(
47#define xfs_trans_inode_broot_debug(ip) 43#define xfs_trans_inode_broot_debug(ip)
48#endif 44#endif
49 45
50
51/* 46/*
52 * Get an inode and join it to the transaction. 47 * Get an inode and join it to the transaction.
53 */ 48 */
@@ -63,76 +58,94 @@ xfs_trans_iget(
63 int error; 58 int error;
64 59
65 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp); 60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
66 if (!error && tp) 61 if (!error && tp) {
67 xfs_trans_ijoin(tp, *ipp, lock_flags); 62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
68 return error; 65 return error;
69} 66}
70 67
71/* 68/*
72 * Add the locked inode to the transaction. 69 * Add a locked inode to the transaction.
73 * The inode must be locked, and it cannot be associated with any 70 *
74 * transaction. The caller must specify the locks already held 71 * The inode must be locked, and it cannot be associated with any transaction.
75 * on the inode.
76 */ 72 */
77void 73void
78xfs_trans_ijoin( 74xfs_trans_ijoin(
79 xfs_trans_t *tp, 75 struct xfs_trans *tp,
80 xfs_inode_t *ip, 76 struct xfs_inode *ip)
81 uint lock_flags)
82{ 77{
83 xfs_inode_log_item_t *iip; 78 xfs_inode_log_item_t *iip;
84 79
85 ASSERT(ip->i_transp == NULL); 80 ASSERT(ip->i_transp == NULL);
86 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 81 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
87 ASSERT(lock_flags & XFS_ILOCK_EXCL);
88 if (ip->i_itemp == NULL) 82 if (ip->i_itemp == NULL)
89 xfs_inode_item_init(ip, ip->i_mount); 83 xfs_inode_item_init(ip, ip->i_mount);
90 iip = ip->i_itemp; 84 iip = ip->i_itemp;
91 ASSERT(iip->ili_flags == 0); 85 ASSERT(iip->ili_lock_flags == 0);
92 86
93 /* 87 /*
94 * Get a log_item_desc to point at the new item. 88 * Get a log_item_desc to point at the new item.
95 */ 89 */
96 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip)); 90 xfs_trans_add_item(tp, &iip->ili_item);
97 91
98 xfs_trans_inode_broot_debug(ip); 92 xfs_trans_inode_broot_debug(ip);
99 93
100 /* 94 /*
101 * If the IO lock is already held, mark that in the inode log item.
102 */
103 if (lock_flags & XFS_IOLOCK_EXCL) {
104 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
105 } else if (lock_flags & XFS_IOLOCK_SHARED) {
106 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
107 }
108
109 /*
110 * Initialize i_transp so we can find it with xfs_inode_incore() 95 * Initialize i_transp so we can find it with xfs_inode_incore()
111 * in xfs_trans_iget() above. 96 * in xfs_trans_iget() above.
112 */ 97 */
113 ip->i_transp = tp; 98 ip->i_transp = tp;
114} 99}
115 100
116 101/*
102 * Add a locked inode to the transaction.
103 *
104 *
105 * Grabs a reference to the inode which will be dropped when the transaction
106 * is committed. The inode will also be unlocked at that point. The inode
107 * must be locked, and it cannot be associated with any transaction.
108 */
109void
110xfs_trans_ijoin_ref(
111 struct xfs_trans *tp,
112 struct xfs_inode *ip,
113 uint lock_flags)
114{
115 xfs_trans_ijoin(tp, ip);
116 IHOLD(ip);
117 ip->i_itemp->ili_lock_flags = lock_flags;
118}
117 119
118/* 120/*
119 * Mark the inode as not needing to be unlocked when the inode item's 121 * Transactional inode timestamp update. Requires the inode to be locked and
120 * IOP_UNLOCK() routine is called. The inode must already be locked 122 * joined to the transaction supplied. Relies on the transaction subsystem to
121 * and associated with the given transaction. 123 * track dirty state and update/writeback the inode accordingly.
122 */ 124 */
123/*ARGSUSED*/
124void 125void
125xfs_trans_ihold( 126xfs_trans_ichgtime(
126 xfs_trans_t *tp, 127 struct xfs_trans *tp,
127 xfs_inode_t *ip) 128 struct xfs_inode *ip,
129 int flags)
128{ 130{
129 ASSERT(ip->i_transp == tp); 131 struct inode *inode = VFS_I(ip);
130 ASSERT(ip->i_itemp != NULL); 132 timespec_t tv;
133
134 ASSERT(tp);
131 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 135 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
136 ASSERT(ip->i_transp == tp);
132 137
133 ip->i_itemp->ili_flags |= XFS_ILI_HOLD; 138 tv = current_fs_time(inode->i_sb);
134}
135 139
140 if ((flags & XFS_ICHGTIME_MOD) &&
141 !timespec_equal(&inode->i_mtime, &tv)) {
142 inode->i_mtime = tv;
143 }
144 if ((flags & XFS_ICHGTIME_CHG) &&
145 !timespec_equal(&inode->i_ctime, &tv)) {
146 inode->i_ctime = tv;
147 }
148}
136 149
137/* 150/*
138 * This is called to mark the fields indicated in fieldmask as needing 151 * This is called to mark the fields indicated in fieldmask as needing
@@ -149,17 +162,12 @@ xfs_trans_log_inode(
149 xfs_inode_t *ip, 162 xfs_inode_t *ip,
150 uint flags) 163 uint flags)
151{ 164{
152 xfs_log_item_desc_t *lidp;
153
154 ASSERT(ip->i_transp == tp); 165 ASSERT(ip->i_transp == tp);
155 ASSERT(ip->i_itemp != NULL); 166 ASSERT(ip->i_itemp != NULL);
156 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 167 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
157 168
158 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
159 ASSERT(lidp != NULL);
160
161 tp->t_flags |= XFS_TRANS_DIRTY; 169 tp->t_flags |= XFS_TRANS_DIRTY;
162 lidp->lid_flags |= XFS_LID_DIRTY; 170 ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
163 171
164 /* 172 /*
165 * Always OR in the bits from the ili_last_fields field. 173 * Always OR in the bits from the ili_last_fields field.
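/*
 * Calling pattern implied by the new transactional timestamp update above:
 * the inode must be locked and joined before xfs_trans_ichgtime(), and the
 * change still has to be logged explicitly. A sketch, not from the patch:
 */
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);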
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
deleted file mode 100644
index f11d37d06dc..00000000000
--- a/fs/xfs/xfs_trans_item.c
+++ /dev/null
@@ -1,441 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_trans_priv.h"
25/* XXX: from here down needed until struct xfs_trans has its own ailp */
26#include "xfs_bit.h"
27#include "xfs_buf_item.h"
28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_dir2.h"
31#include "xfs_dmapi.h"
32#include "xfs_mount.h"
33
34STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
35 int, int, xfs_lsn_t);
36
37/*
38 * This is called to add the given log item to the transaction's
39 * list of log items. It must find a free log item descriptor
40 * or allocate a new one and add the item to that descriptor.
41 * The function returns a pointer to item descriptor used to point
42 * to the new item. The log item will now point to its new descriptor
43 * with its li_desc field.
44 */
45xfs_log_item_desc_t *
46xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
47{
48 xfs_log_item_desc_t *lidp;
49 xfs_log_item_chunk_t *licp;
50 int i=0;
51
52 /*
53 * If there are no free descriptors, allocate a new chunk
54 * of them and put it at the front of the chunk list.
55 */
56 if (tp->t_items_free == 0) {
57 licp = (xfs_log_item_chunk_t*)
58 kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
59 ASSERT(licp != NULL);
60 /*
61 * Initialize the chunk, and then
62 * claim the first slot in the newly allocated chunk.
63 */
64 xfs_lic_init(licp);
65 xfs_lic_claim(licp, 0);
66 licp->lic_unused = 1;
67 xfs_lic_init_slot(licp, 0);
68 lidp = xfs_lic_slot(licp, 0);
69
70 /*
71 * Link in the new chunk and update the free count.
72 */
73 licp->lic_next = tp->t_items.lic_next;
74 tp->t_items.lic_next = licp;
75 tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
76
77 /*
78 * Initialize the descriptor and the generic portion
79 * of the log item.
80 *
81 * Point the new slot at this item and return it.
82 * Also point the log item at its currently active
83 * descriptor and set the item's mount pointer.
84 */
85 lidp->lid_item = lip;
86 lidp->lid_flags = 0;
87 lidp->lid_size = 0;
88 lip->li_desc = lidp;
89 lip->li_mountp = tp->t_mountp;
90 lip->li_ailp = tp->t_mountp->m_ail;
91 return lidp;
92 }
93
94 /*
95 * Find the free descriptor. It is somewhere in the chunklist
96 * of descriptors.
97 */
98 licp = &tp->t_items;
99 while (licp != NULL) {
100 if (xfs_lic_vacancy(licp)) {
101 if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
102 i = licp->lic_unused;
103 ASSERT(xfs_lic_isfree(licp, i));
104 break;
105 }
106 for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
107 if (xfs_lic_isfree(licp, i))
108 break;
109 }
110 ASSERT(i <= XFS_LIC_MAX_SLOT);
111 break;
112 }
113 licp = licp->lic_next;
114 }
115 ASSERT(licp != NULL);
116 /*
117 * If we find a free descriptor, claim it,
118 * initialize it, and return it.
119 */
120 xfs_lic_claim(licp, i);
121 if (licp->lic_unused <= i) {
122 licp->lic_unused = i + 1;
123 xfs_lic_init_slot(licp, i);
124 }
125 lidp = xfs_lic_slot(licp, i);
126 tp->t_items_free--;
127 lidp->lid_item = lip;
128 lidp->lid_flags = 0;
129 lidp->lid_size = 0;
130 lip->li_desc = lidp;
131 lip->li_mountp = tp->t_mountp;
132 lip->li_ailp = tp->t_mountp->m_ail;
133 return lidp;
134}
135
136/*
137 * Free the given descriptor.
138 *
139 * This requires setting the bit in the chunk's free mask corresponding
140 * to the given slot.
141 */
142void
143xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
144{
145 uint slot;
146 xfs_log_item_chunk_t *licp;
147 xfs_log_item_chunk_t **licpp;
148
149 slot = xfs_lic_desc_to_slot(lidp);
150 licp = xfs_lic_desc_to_chunk(lidp);
151 xfs_lic_relse(licp, slot);
152 lidp->lid_item->li_desc = NULL;
153 tp->t_items_free++;
154
155 /*
156 * If there are no more used items in the chunk and this is not
157 * the chunk embedded in the transaction structure, then free
158 * the chunk. First pull it from the chunk list and then
159 * free it back to the heap. We didn't bother with a doubly
160 * linked list here because the lists should be very short
161 * and this is not a performance path. It's better to save
162 * the memory of the extra pointer.
163 *
164 * Also decrement the transaction structure's count of free items
165 * by the number in a chunk since we are freeing an empty chunk.
166 */
167 if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
168 licpp = &(tp->t_items.lic_next);
169 while (*licpp != licp) {
170 ASSERT(*licpp != NULL);
171 licpp = &((*licpp)->lic_next);
172 }
173 *licpp = licp->lic_next;
174 kmem_free(licp);
175 tp->t_items_free -= XFS_LIC_NUM_SLOTS;
176 }
177}
178
179/*
180 * This is called to find the descriptor corresponding to the given
181 * log item. It returns a pointer to the descriptor.
182 * The log item MUST have a corresponding descriptor in the given
183 * transaction. This routine does not return NULL, it panics.
184 *
185 * The descriptor pointer is kept in the log item's li_desc field.
186 * Just return it.
187 */
188/*ARGSUSED*/
189xfs_log_item_desc_t *
190xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
191{
192 ASSERT(lip->li_desc != NULL);
193
194 return lip->li_desc;
195}
196
197
198/*
199 * Return a pointer to the first descriptor in the chunk list.
200 * This does not return NULL if there are none, it panics.
201 *
202 * The first descriptor must be in either the first or second chunk.
203 * This is because the only chunk allowed to be empty is the first.
204 * All others are freed when they become empty.
205 *
206 * At some point this and xfs_trans_next_item() should be optimized
207 * to quickly look at the mask to determine if there is anything to
208 * look at.
209 */
210xfs_log_item_desc_t *
211xfs_trans_first_item(xfs_trans_t *tp)
212{
213 xfs_log_item_chunk_t *licp;
214 int i;
215
216 licp = &tp->t_items;
217 /*
218 * If it's not in the first chunk, skip to the second.
219 */
220 if (xfs_lic_are_all_free(licp)) {
221 licp = licp->lic_next;
222 }
223
224 /*
225 * Return the first non-free descriptor in the chunk.
226 */
227 ASSERT(!xfs_lic_are_all_free(licp));
228 for (i = 0; i < licp->lic_unused; i++) {
229 if (xfs_lic_isfree(licp, i)) {
230 continue;
231 }
232
233 return xfs_lic_slot(licp, i);
234 }
235 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
236 return NULL;
237}
238
239
240/*
241 * Given a descriptor, return the next descriptor in the chunk list.
242 * This returns NULL if there are no more used descriptors in the list.
243 *
244 * We do this by first locating the chunk in which the descriptor resides,
245 * and then scanning forward in the chunk and the list for the next
246 * used descriptor.
247 */
248/*ARGSUSED*/
249xfs_log_item_desc_t *
250xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
251{
252 xfs_log_item_chunk_t *licp;
253 int i;
254
255 licp = xfs_lic_desc_to_chunk(lidp);
256
257 /*
258 * First search the rest of the chunk. The for loop keeps us
259 * from referencing things beyond the end of the chunk.
260 */
261 for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
262 if (xfs_lic_isfree(licp, i)) {
263 continue;
264 }
265
266 return xfs_lic_slot(licp, i);
267 }
268
269 /*
270 * Now search the next chunk. It must be there, because the
271 * next chunk would have been freed if it were empty.
272 * If there is no next chunk, return NULL.
273 */
274 if (licp->lic_next == NULL) {
275 return NULL;
276 }
277
278 licp = licp->lic_next;
279 ASSERT(!xfs_lic_are_all_free(licp));
280 for (i = 0; i < licp->lic_unused; i++) {
281 if (xfs_lic_isfree(licp, i)) {
282 continue;
283 }
284
285 return xfs_lic_slot(licp, i);
286 }
287 ASSERT(0);
288 /* NOTREACHED */
289	return NULL;	/* keep gcc quiet */
290}
291
292/*
293 * This is called to unlock all of the items of a transaction and to free
294 * all the descriptors of that transaction.
295 *
296 * It walks the list of descriptors and unlocks each item. It frees
297 * each chunk except that embedded in the transaction as it goes along.
298 */
299void
300xfs_trans_free_items(
301 xfs_trans_t *tp,
302 xfs_lsn_t commit_lsn,
303 int flags)
304{
305 xfs_log_item_chunk_t *licp;
306 xfs_log_item_chunk_t *next_licp;
307 int abort;
308
309 abort = flags & XFS_TRANS_ABORT;
310 licp = &tp->t_items;
311 /*
312 * Special case the embedded chunk so we don't free it below.
313 */
314 if (!xfs_lic_are_all_free(licp)) {
315 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
316 xfs_lic_all_free(licp);
317 licp->lic_unused = 0;
318 }
319 licp = licp->lic_next;
320
321 /*
322 * Unlock each item in each chunk and free the chunks.
323 */
324 while (licp != NULL) {
325 ASSERT(!xfs_lic_are_all_free(licp));
326 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
327 next_licp = licp->lic_next;
328 kmem_free(licp);
329 licp = next_licp;
330 }
331
332 /*
333 * Reset the transaction structure's free item count.
334 */
335 tp->t_items_free = XFS_LIC_NUM_SLOTS;
336 tp->t_items.lic_next = NULL;
337}
338
339
340
341/*
342 * This is called to unlock the items associated with a transaction.
343 * Items which were not logged should be freed.
344 * Those which were logged must still be tracked so they can be unpinned
345 * when the transaction commits.
346 */
347void
348xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
349{
350 xfs_log_item_chunk_t *licp;
351 xfs_log_item_chunk_t *next_licp;
352 xfs_log_item_chunk_t **licpp;
353 int freed;
354
355 freed = 0;
356 licp = &tp->t_items;
357
358 /*
359 * Special case the embedded chunk so we don't free.
360 */
361 if (!xfs_lic_are_all_free(licp)) {
362 freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
363 }
364 licpp = &(tp->t_items.lic_next);
365 licp = licp->lic_next;
366
367 /*
368 * Unlock each item in each chunk, free non-dirty descriptors,
369 * and free empty chunks.
370 */
371 while (licp != NULL) {
372 ASSERT(!xfs_lic_are_all_free(licp));
373 freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
374 next_licp = licp->lic_next;
375 if (xfs_lic_are_all_free(licp)) {
376 *licpp = next_licp;
377 kmem_free(licp);
378 freed -= XFS_LIC_NUM_SLOTS;
379 } else {
380 licpp = &(licp->lic_next);
381 }
382 ASSERT(*licpp == next_licp);
383 licp = next_licp;
384 }
385
386 /*
387 * Fix the free descriptor count in the transaction.
388 */
389 tp->t_items_free += freed;
390}
391
392/*
393 * Unlock each item pointed to by a descriptor in the given chunk.
394 * Stamp the commit lsn into each item if necessary.
395 * Free descriptors pointing to items which are not dirty if freeing_chunk
396 * is zero. If freeing_chunk is non-zero, then we need to unlock all
397 * items in the chunk.
398 *
399 * Return the number of descriptors freed.
400 */
401STATIC int
402xfs_trans_unlock_chunk(
403 xfs_log_item_chunk_t *licp,
404 int freeing_chunk,
405 int abort,
406 xfs_lsn_t commit_lsn)
407{
408 xfs_log_item_desc_t *lidp;
409 xfs_log_item_t *lip;
410 int i;
411 int freed;
412
413 freed = 0;
414 lidp = licp->lic_descs;
415 for (i = 0; i < licp->lic_unused; i++, lidp++) {
416 if (xfs_lic_isfree(licp, i)) {
417 continue;
418 }
419 lip = lidp->lid_item;
420 lip->li_desc = NULL;
421
422 if (commit_lsn != NULLCOMMITLSN)
423 IOP_COMMITTING(lip, commit_lsn);
424 if (abort)
425 lip->li_flags |= XFS_LI_ABORTED;
426 IOP_UNLOCK(lip);
427
428 /*
429 * Free the descriptor if the item is not dirty
430 * within this transaction and the caller is not
431 * going to just free the entire thing regardless.
432 */
433 if (!(freeing_chunk) &&
434 (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
435 xfs_lic_relse(licp, i);
436 freed++;
437 }
438 }
439
440 return freed;
441}
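/*
 * For comparison with the chunk machinery deleted above:
 * xfs_lic_desc_to_chunk() was hand-rolled pointer arithmetic that
 * container_of() expresses directly. An equivalent sketch, illustrative
 * only; nothing in the patch uses this:
 */
static inline xfs_log_item_chunk_t *
lic_desc_to_chunk(
	xfs_log_item_desc_t	*dp)
{
	/* step back to slot 0, then strip the lic_descs offset */
	return container_of(dp - dp->lid_index,
			    xfs_log_item_chunk_t, lic_descs[0]);
}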
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index c6e4f2c8de6..62da86c90de 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -23,23 +23,10 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 
-/*
- * From xfs_trans_item.c
- */
-struct xfs_log_item_desc	*xfs_trans_add_item(struct xfs_trans *,
-					    struct xfs_log_item *);
-void				xfs_trans_free_item(struct xfs_trans *,
-					    struct xfs_log_item_desc *);
-struct xfs_log_item_desc	*xfs_trans_find_item(struct xfs_trans *,
-					     struct xfs_log_item *);
-struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
-struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
-					     struct xfs_log_item_desc *);
-
-void	xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
+void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
+void	xfs_trans_del_item(struct xfs_log_item *);
 void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
 				int flags);
-
 void	xfs_trans_item_committed(struct xfs_log_item *lip,
 					xfs_lsn_t commit_lsn, int aborted);
 void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
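In the header change above, the chunk-descriptor entry points collapse into two calls: xfs_trans_add_item() and xfs_trans_del_item(), the latter taking only the log item. That signature only works if an item can reach its own descriptor, which matches the lip->li_desc back pointer cleared in xfs_trans_unlock_chunk() earlier. A compilable sketch of that shape, with all type and field names hypothetical:

	#include <assert.h>
	#include <stddef.h>

	struct log_item_desc;

	struct log_item {
		struct log_item_desc	*li_desc;	/* back pointer, cf. lip->li_desc */
	};

	struct log_item_desc {
		struct log_item		*lid_item;
	};

	/* O(1) detach via the back pointer: no transaction-wide search,
	 * unlike the removed xfs_trans_find_item()/xfs_trans_free_item() pair. */
	static void trans_del_item(struct log_item *lip)
	{
		lip->li_desc->lid_item = NULL;
		lip->li_desc = NULL;
	}

	int main(void)
	{
		struct log_item_desc	desc;
		struct log_item		item = { .li_desc = &desc };

		desc.lid_item = &item;
		trans_del_item(&item);
		assert(item.li_desc == NULL && desc.lid_item == NULL);
		return 0;
	}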
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 320775295e3..26d1867d815 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
 typedef	__uint32_t	xfs_dablk_t;	/* dir/attr block number (in file) */
 typedef	__uint32_t	xfs_dahash_t;	/* dir/attr hash value */
 
-typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
-
 typedef __uint32_t	xlog_tid_t;	/* transaction ID type */
 
 /*
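Dropping the 16-bit xfs_prid_t pairs with the xfs_get_projid() calls that replace raw di_projid reads in the xfs_vnodeops.c hunks below, consistent with project IDs widening to 32 bits. One plausible packing, sketched here with assumed field names di_projid_hi/di_projid_lo:

	#include <assert.h>
	#include <stdint.h>

	/* Hypothetical stand-in for the on-disk inode core fields. */
	struct dinode_core {
		uint16_t	di_projid_hi;	/* high 16 bits of project id */
		uint16_t	di_projid_lo;	/* low 16 bits of project id */
	};

	/* Assemble a 32-bit project id from two 16-bit on-disk halves. */
	static uint32_t get_projid(const struct dinode_core *d)
	{
		return (uint32_t)d->di_projid_hi << 16 | d->di_projid_lo;
	}

	int main(void)
	{
		struct dinode_core d = { .di_projid_hi = 0x0001,
					 .di_projid_lo = 0x0002 };
		assert(get_projid(&d) == 0x00010002);
		return 0;
	}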
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 4d88616bde9..8b32d1a4c5a 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -25,18 +25,14 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
-#include "xfs_rw.h"
 #include "xfs_itable.h"
 #include "xfs_utils.h"
 
@@ -60,7 +56,6 @@ xfs_dir_ialloc(
 	mode_t		mode,
 	xfs_nlink_t	nlink,
 	xfs_dev_t	rdev,
-	cred_t		*credp,
 	prid_t		prid,		/* project id */
 	int		okalloc,	/* ok to allocate new space */
 	xfs_inode_t	**ipp,		/* pointer to inode; it will be
@@ -97,7 +92,7 @@ xfs_dir_ialloc(
 	 * transaction commit so that no other process can steal
 	 * the inode(s) that we've just allocated.
 	 */
-	code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 			  &ialloc_context, &call_again, &ip);
 
 	/*
@@ -201,7 +196,7 @@ xfs_dir_ialloc(
 		 * other allocations in this allocation group,
 		 * this call should always succeed.
 		 */
-		code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
+		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
 				  okalloc, &ialloc_context, &call_again, &ip);
 
 		/*
@@ -239,7 +234,7 @@ xfs_droplink(
 {
 	int	error;
 
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT (ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink--;
@@ -303,7 +298,7 @@ xfs_bumplink(
 {
 	if (ip->i_d.di_nlink >= XFS_MAXLINK)
 		return XFS_ERROR(EMLINK);
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT(ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink++;
@@ -324,86 +319,3 @@ xfs_bumplink(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	return 0;
 }
-
-/*
- * Try to truncate the given file to 0 length.  Currently called
- * only out of xfs_remove when it has to truncate a file to free
- * up space for the remove to proceed.
- */
-int
-xfs_truncate_file(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip)
-{
-	xfs_trans_t	*tp;
-	int		error;
-
-#ifdef QUOTADEBUG
-	/*
-	 * This is called to truncate the quotainodes too.
-	 */
-	if (XFS_IS_UQUOTA_ON(mp)) {
-		if (ip->i_ino != mp->m_sb.sb_uquotino)
-			ASSERT(ip->i_udquot);
-	}
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		if (ip->i_ino != mp->m_sb.sb_gquotino)
-			ASSERT(ip->i_gdquot);
-	}
-#endif
-	/*
-	 * Make the call to xfs_itruncate_start before starting the
-	 * transaction, because we cannot make the call while we're
-	 * in a transaction.
-	 */
-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
-	error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
-	if (error) {
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return error;
-	}
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
-	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-				      XFS_TRANS_PERM_LOG_RES,
-				      XFS_ITRUNCATE_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return error;
-	}
-
-	/*
-	 * Follow the normal truncate locking protocol.  Since we
-	 * hold the inode in the transaction, we know that its number
-	 * of references will stay constant.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	/*
-	 * Signal a sync xaction.  The only case where that isn't
-	 * the case is if we're truncating an already unlinked file
-	 * on a wsync fs.  In that case, we know the blocks can't
-	 * reappear in the file because the links to file are
-	 * permanently toast.  Currently, we're always going to
-	 * want a sync transaction because this code is being
-	 * called from places where nlink is guaranteed to be 1
-	 * but I'm leaving the tests in to protect against future
-	 * changes -- rcc.
-	 */
-	error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
-				     XFS_DATA_FORK,
-				     ((ip->i_d.di_nlink != 0 ||
-				       !(mp->m_flags & XFS_MOUNT_WSYNC))
-				      ? 1 : 0));
-	if (error) {
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-				 XFS_TRANS_ABORT);
-	} else {
-		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	}
-	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-	return error;
-}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index ef321225d26..456fca31493 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,10 +18,8 @@
 #ifndef __XFS_UTILS_H__
 #define __XFS_UTILS_H__
 
-extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
-			  xfs_dev_t, cred_t *, prid_t, int,
-			  xfs_inode_t **, int *);
+			  xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
 extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
 extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
 extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c1646838898..8e4a63c4151 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -26,19 +26,14 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_itable.h"
-#include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
@@ -73,7 +68,7 @@ xfs_setattr(
 	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
 	int			need_iolock = 1;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_setattr(ip);
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return XFS_ERROR(EROFS);
@@ -119,7 +114,7 @@ xfs_setattr(
 		 */
 		ASSERT(udqp == NULL);
 		ASSERT(gdqp == NULL);
-		code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
+		code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
 					 qflags, &udqp, &gdqp);
 		if (code)
 			return code;
@@ -143,16 +138,6 @@ xfs_setattr(
 			goto error_return;
 		}
 	} else {
-		if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
-		    !(flags & XFS_ATTR_DMI)) {
-			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
-			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
-				iattr->ia_size, 0, dmflags, NULL);
-			if (code) {
-				lock_flags = 0;
-				goto error_return;
-			}
-		}
 		if (need_iolock)
 			lock_flags |= XFS_IOLOCK_EXCL;
 	}
@@ -199,8 +184,11 @@ xfs_setattr(
 	    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		lock_flags &= ~XFS_ILOCK_EXCL;
-		if (mask & ATTR_CTIME)
-			xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		if (mask & ATTR_CTIME) {
+			inode->i_mtime = inode->i_ctime =
+					current_fs_time(inode->i_sb);
+			xfs_mark_inode_dirty_sync(ip);
+		}
 		code = 0;
 		goto error_return;
 	}
@@ -236,8 +224,11 @@ xfs_setattr(
 		 * transaction to modify the i_size.
 		 */
 		code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+		if (code)
+			goto error_return;
 	}
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	lock_flags &= ~XFS_ILOCK_EXCL;
 
 	/*
 	 * We are going to log the inode size change in this
@@ -251,40 +242,38 @@ xfs_setattr(
 	 * really care about here and prevents waiting for other data
 	 * not within the range we care about here.
 	 */
-	if (!code &&
-	    ip->i_size != ip->i_d.di_size &&
+	if (ip->i_size != ip->i_d.di_size &&
 	    iattr->ia_size > ip->i_d.di_size) {
 		code = xfs_flush_pages(ip,
 				ip->i_d.di_size, iattr->ia_size,
 				XBF_ASYNC, FI_NONE);
+		if (code)
+			goto error_return;
 	}
 
 	/* wait for all I/O to complete */
 	xfs_ioend_wait(ip);
 
-	if (!code)
-		code = xfs_itruncate_data(ip, iattr->ia_size);
-	if (code) {
-		ASSERT(tp == NULL);
-		lock_flags &= ~XFS_ILOCK_EXCL;
-		ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
+	code = -block_truncate_page(inode->i_mapping, iattr->ia_size,
+				    xfs_get_blocks);
+	if (code)
 		goto error_return;
-	}
+
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-	if ((code = xfs_trans_reserve(tp, 0,
-				     XFS_ITRUNCATE_LOG_RES(mp), 0,
-				     XFS_TRANS_PERM_LOG_RES,
-				     XFS_ITRUNCATE_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		if (need_iolock)
-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return code;
-	}
+	code = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+				 XFS_TRANS_PERM_LOG_RES,
+				 XFS_ITRUNCATE_LOG_COUNT);
+	if (code)
+		goto error_return;
+
+	truncate_setsize(inode, iattr->ia_size);
+
 	commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+	lock_flags |= XFS_ILOCK_EXCL;
+
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	/*
 	 * Only change the c/mtime if we are changing the size
@@ -334,8 +323,7 @@ xfs_setattr(
 			xfs_iflags_set(ip, XFS_ITRUNCATED);
 		}
 	} else if (tp) {
-		xfs_trans_ijoin(tp, ip, lock_flags);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	}
 
 	/*
@@ -470,17 +458,10 @@ xfs_setattr(
 		return XFS_ERROR(code);
 	}
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
-	    !(flags & XFS_ATTR_DMI)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL, NULL, NULL,
-					0, 0, AT_DELAY_FLAG(flags));
-	}
 	return 0;
 
  abort_return:
 	commit_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
  error_return:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
@@ -516,7 +497,7 @@ xfs_readlink_bmap(
 	int		error = 0;
 
 	error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
-			mval, &nmaps, NULL, NULL);
+			mval, &nmaps, NULL);
 	if (error)
 		goto out;
 
@@ -557,7 +538,7 @@ xfs_readlink(
 	int		pathlen;
 	int		error = 0;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_readlink(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -613,14 +594,14 @@ xfs_free_eofblocks(
 	 */
 	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
 	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
-	map_len = last_fsb - end_fsb;
-	if (map_len <= 0)
+	if (last_fsb <= end_fsb)
 		return 0;
+	map_len = last_fsb - end_fsb;
 
 	nimaps = 1;
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
-			  NULL, 0, &imap, &nimaps, NULL, NULL);
+			  NULL, 0, &imap, &nimaps, NULL);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (!error && (nimaps != 0) &&
@@ -675,10 +656,7 @@ xfs_free_eofblocks(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip,
-				XFS_IOLOCK_EXCL |
-				XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		error = xfs_itruncate_finish(&tp, ip,
 					     ip->i_size,
@@ -750,8 +728,7 @@ xfs_inactive_symlink_rmt(
 	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 	size = (int)ip->i_d.di_size;
 	ip->i_d.di_size = 0;
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Find the block(s) so we can inval and unmap them.
@@ -761,7 +738,7 @@ xfs_inactive_symlink_rmt(
 	nmaps = ARRAY_SIZE(mval);
 	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
 			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
-			&free_list, NULL)))
+			&free_list)))
 		goto error0;
 	/*
 	 * Invalidate the block(s).
@@ -776,7 +753,7 @@ xfs_inactive_symlink_rmt(
 	 * Unmap the dead block(s) to the free_list.
 	 */
 	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-			&first_block, &free_list, NULL, &done)))
+			&first_block, &free_list, &done)))
 		goto error1;
 	ASSERT(done);
 	/*
@@ -795,8 +772,7 @@ xfs_inactive_symlink_rmt(
 	 * Mark it dirty so it will be logged and moved forward in the log as
 	 * part of every commit.
 	 */
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Get a new, empty transaction to return to our caller.
@@ -929,8 +905,7 @@ xfs_inactive_attrs(
 		goto error_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
 	ASSERT(ip->i_d.di_anextents == 0);
@@ -1035,8 +1010,6 @@ xfs_inactive(
 	int		error;
 	int		truncate;
 
-	xfs_itrace_entry(ip);
-
 	/*
 	 * If the inode is already free, then there can be nothing
 	 * to clean up here.
@@ -1060,9 +1033,6 @@ xfs_inactive(
 
 	mp = ip->i_mount;
 
-	if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
-		XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
-
 	error = 0;
 
 	/* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1120,8 +1090,7 @@ xfs_inactive(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	/*
 	 * normally, we have to run xfs_itruncate_finish sync.
@@ -1154,8 +1123,7 @@ xfs_inactive(
 			return VN_INACTIVE_CACHE;
 		}
 
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	} else {
 		error = xfs_trans_reserve(tp, 0,
 					XFS_IFREE_LOG_RES(mp),
@@ -1168,8 +1136,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	}
 
 	/*
@@ -1257,7 +1224,7 @@ xfs_lookup(
 	int		error;
 	uint		lock_mode;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_lookup(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return XFS_ERROR(EIO);
@@ -1289,8 +1256,7 @@ xfs_create(
 	struct xfs_name		*name,
 	mode_t			mode,
 	xfs_dev_t		rdev,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	int			is_dir = S_ISDIR(mode);
 	struct xfs_mount	*mp = dp->i_mount;
@@ -1302,32 +1268,22 @@ xfs_create(
 	boolean_t		unlock_dp_on_error = B_FALSE;
 	uint			cancel_flags;
 	int			committed;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	uint			resblks;
 	uint			log_res;
 	uint			log_count;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_create(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-				dp, DM_RIGHT_NULL, NULL,
-				DM_RIGHT_NULL, name->name, NULL,
-				mode, 0, 0);
-
-		if (error)
-			return error;
-	}
-
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -1406,7 +1362,7 @@ xfs_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 			       prid, resblks > 0, &ip, &committed);
 	if (error) {
 		if (error == ENOSPC)
@@ -1427,8 +1383,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1438,7 +1393,7 @@ xfs_create(
 		ASSERT(error != ENOSPC);
 		goto out_trans_abort;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	if (is_dir) {
@@ -1487,16 +1442,7 @@ xfs_create(
 	xfs_qm_dqrele(gdqp);
 
 	*ipp = ip;
-
-	/* Fallthrough to std_return with error = 0  */
- std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
-				ip, DM_RIGHT_NULL, name->name, NULL, mode,
-				error, 0);
-	}
-
-	return error;
+	return 0;
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
@@ -1510,8 +1456,8 @@ xfs_create(
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
+ std_return:
+	return error;
 
  out_abort_rele:
@@ -1726,20 +1672,11 @@ xfs_remove(
 	uint			resblks;
 	uint			log_count;
 
-	xfs_itrace_entry(dp);
-	xfs_itrace_entry(ip);
+	trace_xfs_remove(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL, name->name, NULL,
-					ip->i_d.di_mode, 0, 0);
-		if (error)
-			return error;
-	}
-
 	error = xfs_qm_dqattach(dp, 0);
 	if (error)
 		goto std_return;
@@ -1782,15 +1719,8 @@ xfs_remove(
 
 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-	/*
-	 * At this point, we've gotten both the directory and the entry
-	 * inodes locked.
-	 */
-	IHOLD(ip);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we're removing a directory perform some additional validation.
@@ -1814,7 +1744,7 @@ xfs_remove(
 		ASSERT(error != ENOENT);
 		goto out_bmap_cancel;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	if (is_dir) {
 		/*
@@ -1877,21 +1807,15 @@ xfs_remove(
 	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
 		xfs_filestream_deassociate(ip);
 
- std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
-				NULL, DM_RIGHT_NULL, name->name, NULL,
-				ip->i_d.di_mode, error, 0);
-	}
-
-	return error;
+	return 0;
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
 	cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
 	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -1909,25 +1833,13 @@ xfs_link(
 	int			committed;
 	int			resblks;
 
-	xfs_itrace_entry(tdp);
-	xfs_itrace_entry(sip);
+	trace_xfs_link(tdp, target_name);
 
 	ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
-					tdp, DM_RIGHT_NULL,
-					sip, DM_RIGHT_NULL,
-					target_name->name, NULL, 0, 0, 0);
-		if (error)
-			return error;
-	}
-
-	/* Return through std_return after this point. */
-
 	error = xfs_qm_dqattach(sip, 0);
 	if (error)
 		goto std_return;
@@ -1953,15 +1865,8 @@ xfs_link(
 
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
-	/*
-	 * Increment vnode ref counts since xfs_trans_commit &
-	 * xfs_trans_cancel will both unlock the inodes and
-	 * decrement the associated ref counts.
-	 */
-	IHOLD(sip);
-	IHOLD(tdp);
-	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * If the source has too many links, we can't make any more to it.
@@ -1977,7 +1882,7 @@ xfs_link(
 	 * the tree quota mechanism could be circumvented.
 	 */
 	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
+		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
 		error = XFS_ERROR(EXDEV);
 		goto error_return;
 	}
@@ -1992,7 +1897,7 @@ xfs_link(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto abort_return;
-	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
 	error = xfs_bumplink(tp, sip);
@@ -2014,27 +1919,14 @@ xfs_link(
 		goto abort_return;
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (error)
-		goto std_return;
-
-	/* Fall through to std_return with error = 0. */
-std_return:
-	if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
-				tdp, DM_RIGHT_NULL,
-				sip, DM_RIGHT_NULL,
-				target_name->name, NULL, 0, error, 0);
-	}
-	return error;
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 
  abort_return:
 	cancel_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
-
  error_return:
 	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -2043,8 +1935,7 @@ xfs_symlink(
 	struct xfs_name		*link_name,
 	const char		*target_path,
 	mode_t			mode,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t		*tp;
@@ -2065,7 +1956,7 @@ xfs_symlink(
 	int			byte_cnt;
 	int			n;
 	xfs_buf_t		*bp;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
 
@@ -2074,7 +1965,7 @@ xfs_symlink(
 	ip = NULL;
 	tp = NULL;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_symlink(dp, link_name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2086,22 +1977,11 @@ xfs_symlink(
 	if (pathlen >= MAXPATHLEN)      /* total string too long */
 		return XFS_ERROR(ENAMETOOLONG);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
-					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					link_name->name,
-					(unsigned char *)target_path, 0, 0, 0);
-		if (error)
-			return error;
-	}
-
-	/* Return through std_return after this point. */
-
 	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = (xfs_prid_t)dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -2167,8 +2047,8 @@ xfs_symlink(
 	/*
 	 * Allocate an inode for the symlink.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
-			       1, 0, credp, prid, resblks > 0, &ip, NULL);
+	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+			       prid, resblks > 0, &ip, NULL);
 	if (error) {
 		if (error == ENOSPC)
 			goto error_return;
@@ -2180,8 +2060,7 @@ xfs_symlink(
 	 * transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	/*
@@ -2215,7 +2094,7 @@ xfs_symlink(
 		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
 				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
 				  &first_block, resblks, mval, &nmaps,
-				  &free_list, NULL);
+				  &free_list);
 		if (error) {
 			goto error1;
 		}
@@ -2251,7 +2130,7 @@ xfs_symlink(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	/*
@@ -2278,21 +2157,8 @@ xfs_symlink(
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
 
-	/* Fall through to std_return with error = 0 or errno from
-	 * xfs_trans_commit	*/
-std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
-					dp, DM_RIGHT_NULL,
-					error ? NULL : ip,
-					DM_RIGHT_NULL, link_name->name,
-					(unsigned char *)target_path,
-					0, error, 0);
-	}
-
-	if (!error)
-		*ipp = ip;
-	return error;
+	*ipp = ip;
+	return 0;
 
  error2:
 	IRELE(ip);
@@ -2306,8 +2172,8 @@ std_return:
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -2333,13 +2199,12 @@ xfs_set_dmattrs(
 		return error;
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 
 	ip->i_d.di_dmevmask = evmask;
 	ip->i_d.di_dmstate  = state;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	IHOLD(ip);
 	error = xfs_trans_commit(tp, 0);
 
 	return error;
@@ -2390,7 +2255,7 @@ xfs_alloc_file_space(
 	int			committed;
 	int			error;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_alloc_file_space(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2408,29 +2273,13 @@ xfs_alloc_file_space(
 	count = len;
 	imapp = &imaps[0];
 	nimaps = 1;
-	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+	bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
-	/*	Generate a DMAPI event if needed.	*/
-	if (alloc_type != 0 && offset < ip->i_size &&
-			(attr_flags & XFS_ATTR_DMI) == 0  &&
-			DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-		xfs_off_t	end_dmi_offset;
-
-		end_dmi_offset = offset+len;
-		if (end_dmi_offset > ip->i_size)
-			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
-				      end_dmi_offset - offset, 0, NULL);
-		if (error)
-			return error;
-	}
-
 	/*
 	 * Allocate file space until done or until there is an error
 	 */
-retry:
 	while (allocatesize_fsb && !error) {
 		xfs_fileoff_t	s, e;
 
@@ -2451,15 +2300,22 @@ retry:
 			e = allocatesize_fsb;
 		}
 
+		/*
+		 * The transaction reservation is limited to a 32-bit block
+		 * count, hence we need to limit the number of blocks we are
+		 * trying to reserve to avoid an overflow. We can't allocate
+		 * more than @nimaps extents, and an extent is limited on disk
+		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+		 */
+		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
 		if (unlikely(rt)) {
-			resrtextents = qblocks = (uint)(e - s);
+			resrtextents = qblocks = resblks;
 			resrtextents /= mp->m_sb.sb_rextsize;
 			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
 			quota_flag = XFS_QMOPT_RES_RTBLKS;
 		} else {
 			resrtextents = 0;
-			resblks = qblocks = \
-				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
+			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
 			quota_flag = XFS_QMOPT_RES_REGBLKS;
 		}
 
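The new reservation clamp above can be checked with plain arithmetic: a request of 2^40 blocks would truncate if cast straight to a 32-bit reservation, while min_t() against MAXEXTLEN * nimaps (2^21 - 1 blocks per extent) keeps it representable. A small stand-alone sketch, with made-up values:

	#include <assert.h>
	#include <stdint.h>

	#define MAXEXTLEN	((1ULL << 21) - 1)	/* 21-bit extent length limit */

	/* Clamp the requested block count, cf. min_t() in the hunk above. */
	static uint64_t clamp_resblks(uint64_t s, uint64_t e, int nimaps)
	{
		uint64_t want = e - s;
		uint64_t limit = MAXEXTLEN * (uint64_t)nimaps;

		return want < limit ? want : limit;
	}

	int main(void)
	{
		/* a 2^40-block request no longer overflows a 32-bit reservation */
		uint64_t resblks = clamp_resblks(0, 1ULL << 40, 1);

		assert(resblks == MAXEXTLEN);
		assert(resblks <= UINT32_MAX);
		return 0;
	}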
@@ -2488,8 +2344,7 @@ retry:
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * Issue the xfs_bmapi() call to allocate the blocks
@@ -2498,7 +2353,7 @@ retry:
 		error = xfs_bmapi(tp, ip, startoffset_fsb,
 				  allocatesize_fsb, bmapi_flag,
 				  &firstfsb, 0, imapp, &nimaps,
-				  &free_list, NULL);
+				  &free_list);
 		if (error) {
 			goto error0;
 		}
@@ -2527,17 +2382,6 @@ retry:
 		startoffset_fsb += allocated_fsb;
 		allocatesize_fsb -= allocated_fsb;
 	}
-dmapi_enospc_check:
-	if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
-				ip, DM_RIGHT_NULL,
-				ip, DM_RIGHT_NULL,
-				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
-		if (error == 0)
-			goto retry;	/* Maybe DMAPI app. has made space */
-		/* else fall through with error from XFS_SEND_DATA */
-	}
 
 	return error;
 
@@ -2548,7 +2392,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	goto dmapi_enospc_check;
+	return error;
 }
 
 /*
@@ -2588,9 +2432,9 @@ xfs_zero_remaining_bytes(
 	if (endoff > ip->i_size)
 		endoff = ip->i_size;
 
-	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
-				XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp);
+	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+					mp->m_rtdev_targp : mp->m_ddev_targp,
+				  mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
 	if (!bp)
 		return XFS_ERROR(ENOMEM);
 
@@ -2598,7 +2442,7 @@ xfs_zero_remaining_bytes(
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
-			NULL, 0, &imap, &nimap, NULL, NULL);
+			NULL, 0, &imap, &nimap, NULL);
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -2616,7 +2460,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2629,7 +2473,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2661,7 +2505,6 @@ xfs_free_file_space(
 {
 	int			committed;
 	int			done;
-	xfs_off_t		end_dmi_offset;
 	xfs_fileoff_t		endoffset_fsb;
 	int			error;
 	xfs_fsblock_t		firstfsb;
@@ -2680,7 +2523,7 @@ xfs_free_file_space(
 
 	mp = ip->i_mount;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_free_file_space(ip);
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
@@ -2691,19 +2534,7 @@ xfs_free_file_space(
 		return error;
 	rt = XFS_IS_REALTIME_INODE(ip);
 	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
-	end_dmi_offset = offset + len;
-	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
-
-	if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-		if (end_dmi_offset > ip->i_size)
-			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
-				      offset, end_dmi_offset - offset,
-				      AT_DELAY_FLAG(attr_flags), NULL);
-		if (error)
-			return error;
-	}
+	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
 
 	if (attr_flags & XFS_ATTR_NOLOCK)
 		need_iolock = 0;
@@ -2731,7 +2562,7 @@ xfs_free_file_space(
 	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, startoffset_fsb,
-			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+			1, 0, NULL, 0, &imap, &nimap, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2746,7 +2577,7 @@ xfs_free_file_space(
 		}
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
-			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+			1, 0, NULL, 0, &imap, &nimap, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2814,8 +2645,7 @@ xfs_free_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * issue the bunmapi() call to free the blocks
@@ -2823,7 +2653,7 @@ xfs_free_file_space(
 		xfs_bmap_init(&free_list, &firstfsb);
 		error = xfs_bunmapi(tp, ip, startoffset_fsb,
 				  endoffset_fsb - startoffset_fsb,
-				  0, 2, &firstfsb, &free_list, NULL, &done);
+				  0, 2, &firstfsb, &free_list, &done);
 		if (error) {
 			goto error0;
 		}
@@ -2882,8 +2712,7 @@ xfs_change_file_space(
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
-
-	xfs_itrace_entry(ip);
+	int		prealloc_type;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
@@ -2926,12 +2755,17 @@ xfs_change_file_space(
 	 * size to be changed.
 	 */
 	setprealloc = clrprealloc = 0;
+	prealloc_type = XFS_BMAPI_PREALLOC;
 
 	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+		prealloc_type |= XFS_BMAPI_CONVERT;
+		xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
+		/* FALLTHRU */
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-						1, attr_flags);
+						prealloc_type, attr_flags);
 		if (error)
 			return error;
 		setprealloc = 1;
@@ -2985,8 +2819,7 @@ xfs_change_file_space(
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	if ((attr_flags & XFS_ATTR_DMI) == 0) {
 		ip->i_d.di_mode &= ~S_ISUID;
@@ -3001,7 +2834,7 @@ xfs_change_file_space(
 		if (ip->i_d.di_mode & S_IXGRP)
 			ip->i_d.di_mode &= ~S_ISGID;
 
-		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	}
 	if (setprealloc)
 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d8dfa8d0dad..f6702927eee 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,7 +2,6 @@
 #define _XFS_VNODEOPS_H 1
 
 struct attrlist_cursor_kern;
-struct cred;
 struct file;
 struct iattr;
 struct inode;
@@ -26,7 +25,7 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode **ipp, struct xfs_name *ci_name);
 int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
-		xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp);
+		xfs_dev_t rdev, struct xfs_inode **ipp);
 int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -34,8 +33,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-		const char *target_path, mode_t mode, struct xfs_inode **ipp,
-		cred_t *credp);
+		const char *target_path, mode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 			xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);